From 23a1f8d44c0bca48f04fc2a2f1edafd826ce6133 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:31 +0200
Subject: mac80211: process and save VHT MU-MIMO group frame

The Group ID Management frame is an Action frame of
category VHT. It is transmitted by the AP to assign
or change the user position of a STA for one or more
group IDs.
Process and save the group membership data. Notify
underlying driver of changes.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  7 +++++++
 include/net/mac80211.h    | 17 +++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 452c0b0d2f32..d9ddb89533a7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -843,6 +843,8 @@ enum ieee80211_vht_opmode_bits {
 };
 
 #define WLAN_SA_QUERY_TR_ID_LEN 2
+#define WLAN_MEMBERSHIP_LEN 8
+#define WLAN_USER_POSITION_LEN 16
 
 /**
  * struct ieee80211_tpc_report_ie
@@ -989,6 +991,11 @@ struct ieee80211_mgmt {
 					u8 action_code;
 					u8 operating_mode;
 				} __packed vht_opmode_notif;
+				struct {
+					u8 action_code;
+					u8 membership[WLAN_MEMBERSHIP_LEN];
+					u8 position[WLAN_USER_POSITION_LEN];
+				} __packed vht_group_notif;
 				struct {
 					u8 action_code;
 					u8 dialog_token;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7c30faff245f..8da483b2c067 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -298,6 +298,7 @@ struct ieee80211_vif_chanctx_switch {
  *	note that this is only called when it changes after the channel
  *	context had been assigned.
  * @BSS_CHANGED_OCB: OCB join status changed
+ * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed
  */
 enum ieee80211_bss_change {
 	BSS_CHANGED_ASSOC		= 1<<0,
@@ -323,6 +324,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_BEACON_INFO		= 1<<20,
 	BSS_CHANGED_BANDWIDTH		= 1<<21,
 	BSS_CHANGED_OCB                 = 1<<22,
+	BSS_CHANGED_MU_GROUPS		= 1<<23,
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
@@ -435,6 +437,19 @@ struct ieee80211_event {
 	} u;
 };
 
+/**
+ * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data
+ *
+ * This structure describes the group id data of VHT MU-MIMO
+ *
+ * @membership: 64 bits array - a bit is set if station is member of the group
+ * @position: 2 bits per group id indicating the position in the group
+ */
+struct ieee80211_mu_group_data {
+	u8 membership[WLAN_MEMBERSHIP_LEN];
+	u8 position[WLAN_USER_POSITION_LEN];
+};
+
 /**
  * struct ieee80211_bss_conf - holds the BSS's changing parameters
  *
@@ -477,6 +492,7 @@ struct ieee80211_event {
  * @enable_beacon: whether beaconing should be enabled or not
  * @chandef: Channel definition for this BSS -- the hardware might be
  *	configured a higher bandwidth than this BSS uses, for example.
+ * @mu_group: VHT MU-MIMO group membership data
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
  *	This field is only valid when the channel is a wide HT/VHT channel.
  *	Note that with TDLS this can be the case (channel is HT, protection must
@@ -535,6 +551,7 @@ struct ieee80211_bss_conf {
 	s32 cqm_rssi_thold;
 	u32 cqm_rssi_hyst;
 	struct cfg80211_chan_def chandef;
+	struct ieee80211_mu_group_data mu_group;
 	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
 	int arp_addr_cnt;
 	bool qos;
-- 
cgit v1.2.3


From f9cfa5f354b11e56cd8f019c12e14a42585586cd Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:33 +0200
Subject: mac80211: add flag for duplication check

Add an option for the driver to perform the duplicate-packet
check by itself.
This is needed, for example, by the iwlwifi driver, which
parallelizes the RX path and does the duplication check
per queue.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 net/mac80211/rx.c      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8da483b2c067..ecab934dc8d9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1063,7 +1063,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_HT_GF			= BIT(13),
 	RX_FLAG_AMPDU_DETAILS		= BIT(14),
 	RX_FLAG_PN_VALIDATED		= BIT(15),
-	/* bit 16 free */
+	RX_FLAG_DUP_VALIDATED		= BIT(16),
 	RX_FLAG_AMPDU_LAST_KNOWN	= BIT(17),
 	RX_FLAG_AMPDU_IS_LAST		= BIT(18),
 	RX_FLAG_AMPDU_DELIM_CRC_ERROR	= BIT(19),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a5668b54015f..fe675d76f29c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1099,6 +1099,9 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
+	if (status->flag & RX_FLAG_DUP_VALIDATED)
+		return RX_CONTINUE;
+
 	/*
 	 * Drop duplicate 802.11 retransmissions
 	 * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery")
-- 
cgit v1.2.3


From fad471860c097844432c7cf5d3ae6a0a059c2bdc Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:34 +0200
Subject: mac80211: pass RX aggregation window size to driver

Currently mac80211 does not inform the driver of the window
size when starting an RX aggregation session.
To enable managing the reorder buffer in the driver or hardware
the window size is needed.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 8 +++++---
 net/mac80211/agg-rx.c  | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ecab934dc8d9..a990338a766e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3047,9 +3047,11 @@ enum ieee80211_reconfig_type {
  * 	ieee80211_ampdu_mlme_action. Starting sequence number (@ssn)
  * 	is the first frame we expect to perform the action on. Notice
  * 	that TX/RX_STOP can pass NULL for this parameter.
- *	The @buf_size parameter is only valid when the action is set to
- *	%IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's reorder
- *	buffer size (number of subframes) for this session -- the driver
+ *	The @buf_size parameter is valid only when the action is set to
+ *	%IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and
+ *	indicates the reorder buffer size (number of subframes) for this
+ *	session.
+ *	When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver
  *	may neither send aggregates containing more subframes than this
  *	nor send aggregates in a way that lost frames would exceed the
  *	buffer size. If just limiting the aggregate size, this would be
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 10ad4ac1fa0b..78672737fe3e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -323,7 +323,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 		__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
 
 	ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
-			       &sta->sta, tid, &start_seq_num, 0, false);
+			       &sta->sta, tid, &start_seq_num, buf_size, false);
 	ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
 	       sta->sta.addr, tid, ret);
 	if (ret) {
-- 
cgit v1.2.3


From 4352a4d7f6bfd0aed0276a13fa4993db35714db4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 8 Dec 2015 16:04:35 +0200
Subject: mac80211: document status.freq restrictions

It's not always necessary to set the status.freq field, for example
when this would be an expensive calculation. It must be set for all
management frames (as they might be reported to userspace), but for
data frames it's not really required. Document this.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a990338a766e..bdee1cc19c7e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1108,6 +1108,8 @@ enum mac80211_rx_vht_flags {
  *	it but can store it and pass it back to the driver for synchronisation
  * @band: the active band when this frame was received
  * @freq: frequency the radio was tuned to when receiving this frame, in MHz
+ *	This field must be set for management frames, but isn't strictly needed
+ *	for data (other) frames - for those it only affects radiotap reporting.
  * @signal: signal strength when receiving this frame, either in dBm, in dB or
  *	unspecified depending on the hardware capabilities flags
  *	@IEEE80211_HW_SIGNAL_*
-- 
cgit v1.2.3


From 50ea05efaf3bed7dd34bcc2635a8b3f53bd0ccc1 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sarasharon1@gmail.com>
Date: Wed, 30 Dec 2015 16:06:04 +0200
Subject: mac80211: pass block ack session timeout to driver

Currently mac80211 does not inform the driver of the block
ack session timeout when starting an RX aggregation session.
Drivers that manage the reorder buffer need to know this
parameter.
Seeing that there are now too many arguments for the
drv_ampdu_action() function, wrap them inside a structure.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c              |  7 +--
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |  7 +--
 drivers/net/wireless/ath/ath9k/main.c              |  8 ++--
 drivers/net/wireless/ath/carl9170/main.c           |  8 ++--
 drivers/net/wireless/ath/wcn36xx/main.c            |  8 ++--
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  8 ++--
 drivers/net/wireless/intel/iwlegacy/4965-mac.c     |  8 ++--
 drivers/net/wireless/intel/iwlegacy/4965.h         |  4 +-
 drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c  |  9 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  9 ++--
 drivers/net/wireless/mac80211_hwsim.c              |  8 ++--
 drivers/net/wireless/marvell/mwl8k.c               | 10 ++--
 drivers/net/wireless/mediatek/mt7601u/main.c       |  8 ++--
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c     |  7 +--
 drivers/net/wireless/ralink/rt2x00/rt2800lib.h     |  4 +-
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c   |  6 +--
 drivers/net/wireless/realtek/rtlwifi/core.c        |  8 ++--
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        | 19 +++-----
 drivers/net/wireless/st/cw1200/sta.c               |  4 +-
 drivers/net/wireless/st/cw1200/sta.h               |  4 +-
 drivers/net/wireless/ti/wlcore/main.c              |  8 ++--
 include/net/mac80211.h                             | 44 ++++++++++++------
 net/mac80211/agg-rx.c                              | 25 ++++++++--
 net/mac80211/agg-tx.c                              | 53 ++++++++++++++--------
 net/mac80211/driver-ops.c                          | 10 ++--
 net/mac80211/driver-ops.h                          |  4 +-
 net/mac80211/trace.h                               | 43 ++++++++++--------
 27 files changed, 202 insertions(+), 139 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 6146a293601a..368de5e5a04f 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -6366,12 +6366,13 @@ static u64 ath10k_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 static int ath10k_ampdu_action(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
-			       enum ieee80211_ampdu_mlme_action action,
-			       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			       u8 buf_size, bool amsdu)
+			       struct ieee80211_ampdu_params *params)
 {
 	struct ath10k *ar = hw->priv;
 	struct ath10k_vif *arvif = ath10k_vif_to_arvif(vif);
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 
 	ath10k_dbg(ar, ATH10K_DBG_MAC, "mac ampdu vdev_id %i sta %pM tid %hu action %d\n",
 		   arvif->vdev_id, sta->addr, tid, action);
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index fe1fd1a5ae15..639294a9e34d 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1657,13 +1657,14 @@ static void ath9k_htc_reset_tsf(struct ieee80211_hw *hw,
 
 static int ath9k_htc_ampdu_action(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  enum ieee80211_ampdu_mlme_action action,
-				  struct ieee80211_sta *sta,
-				  u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+				  struct ieee80211_ampdu_params *params)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
 	struct ath9k_htc_sta *ista;
 	int ret = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 
 	mutex_lock(&priv->mutex);
 	ath9k_htc_ps_wakeup(priv);
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index c1b33fdcca08..cf58a304e9f0 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1864,14 +1864,16 @@ static void ath9k_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 static int ath9k_ampdu_action(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
-			      enum ieee80211_ampdu_mlme_action action,
-			      struct ieee80211_sta *sta,
-			      u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+			      struct ieee80211_ampdu_params *params)
 {
 	struct ath_softc *sc = hw->priv;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	bool flush = false;
 	int ret = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	mutex_lock(&sc->mutex);
 
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 19d3d64416bf..4d1527a2e292 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1413,10 +1413,12 @@ static void carl9170_ampdu_work(struct work_struct *work)
 
 static int carl9170_op_ampdu_action(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
-				    enum ieee80211_ampdu_mlme_action action,
-				    struct ieee80211_sta *sta,
-				    u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+				    struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 	struct ar9170 *ar = hw->priv;
 	struct carl9170_sta_info *sta_info = (void *) sta->drv_priv;
 	struct carl9170_sta_tid *tid_info;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 7c169abdbafe..a27279c2c695 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -857,12 +857,14 @@ static int wcn36xx_resume(struct ieee80211_hw *hw)
 
 static int wcn36xx_ampdu_action(struct ieee80211_hw *hw,
 		    struct ieee80211_vif *vif,
-		    enum ieee80211_ampdu_mlme_action action,
-		    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		    u8 buf_size, bool amsdu)
+		    struct ieee80211_ampdu_params *params)
 {
 	struct wcn36xx *wcn = hw->priv;
 	struct wcn36xx_sta *sta_priv = NULL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac ampdu action action %d tid %d\n",
 		    action, tid);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index bec2dc1ca2e4..61ae2768132a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -818,13 +818,15 @@ brcms_ops_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 static int
 brcms_ops_ampdu_action(struct ieee80211_hw *hw,
 		    struct ieee80211_vif *vif,
-		    enum ieee80211_ampdu_mlme_action action,
-		    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		    u8 buf_size, bool amsdu)
+		    struct ieee80211_ampdu_params *params)
 {
 	struct brcms_info *wl = hw->priv;
 	struct scb *scb = &wl->wlc->pri_scb;
 	int status;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u8 buf_size = params->buf_size;
 
 	if (WARN_ON(scb->magic != SCB_MAGIC))
 		return -EIDRM;
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index fd38aa0763e4..b75f4ef3cdc7 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -5982,12 +5982,14 @@ il4965_mac_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 
 int
 il4965_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 * ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
 	struct il_priv *il = hw->priv;
 	int ret = -EINVAL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	D_HT("A-MPDU action on addr %pM tid %d\n", sta->addr, tid);
 
diff --git a/drivers/net/wireless/intel/iwlegacy/4965.h b/drivers/net/wireless/intel/iwlegacy/4965.h
index 8ab8706f9422..e432715e02d8 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965.h
+++ b/drivers/net/wireless/intel/iwlegacy/4965.h
@@ -182,9 +182,7 @@ void il4965_mac_update_tkip_key(struct ieee80211_hw *hw,
 				struct ieee80211_sta *sta, u32 iv32,
 				u16 *phase1key);
 int il4965_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			    enum ieee80211_ampdu_mlme_action action,
-			    struct ieee80211_sta *sta, u16 tid, u16 * ssn,
-			    u8 buf_size, bool amsdu);
+			    struct ieee80211_ampdu_params *params);
 int il4965_mac_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		       struct ieee80211_sta *sta);
 void
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
index 29ea1c6705b4..151721e4040c 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
@@ -732,12 +732,15 @@ static inline bool iwl_enable_tx_ampdu(const struct iwl_cfg *cfg)
 
 static int iwlagn_mac_ampdu_action(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
-				   enum ieee80211_ampdu_mlme_action action,
-				   struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				   u8 buf_size, bool amsdu)
+				   struct ieee80211_ampdu_params *params)
 {
 	struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);
 	int ret = -EINVAL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 	struct iwl_station_priv *sta_priv = (void *) sta->drv_priv;
 
 	IWL_DEBUG_HT(priv, "A-MPDU action on addr %pM tid %d\n",
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index d70a1716f3e0..1bd3f0b700d3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -837,13 +837,16 @@ iwl_mvm_ampdu_check_trigger(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 static int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
-				    enum ieee80211_ampdu_mlme_action action,
-				    struct ieee80211_sta *sta, u16 tid,
-				    u16 *ssn, u8 buf_size, bool amsdu)
+				    struct ieee80211_ampdu_params *params)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	int ret;
 	bool tx_agg_ref = false;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 
 	IWL_DEBUG_HT(mvm, "A-MPDU action on addr %pM tid %d: action %d\n",
 		     sta->addr, tid, action);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index c32889a1e39c..e31a94fd6135 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1845,10 +1845,12 @@ static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw,
 
 static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
-				       enum ieee80211_ampdu_mlme_action action,
-				       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				       u8 buf_size, bool amsdu)
+				       struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
 		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 30e3aaae32e2..088429d0a634 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5421,11 +5421,13 @@ static int mwl8k_get_survey(struct ieee80211_hw *hw, int idx,
 
 static int
 mwl8k_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		   enum ieee80211_ampdu_mlme_action action,
-		   struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		   u8 buf_size, bool amsdu)
+		   struct ieee80211_ampdu_params *params)
 {
-
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 	int i, rc = 0;
 	struct mwl8k_priv *priv = hw->priv;
 	struct mwl8k_ampdu_stream *stream;
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index f715eee39851..e70dd9523911 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -334,11 +334,13 @@ static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 
 static int
 mt76_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		  enum ieee80211_ampdu_mlme_action action,
-		  struct ieee80211_sta *sta, u16 tid, u16 *ssn, u8 buf_size,
-		  bool amsdu)
+		  struct ieee80211_ampdu_params *params)
 {
 	struct mt7601u_dev *dev = hw->priv;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 	struct mt76_sta *msta = (struct mt76_sta *) sta->drv_priv;
 
 	WARN_ON(msta->wcid.idx > GROUP_WCID(0));
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index 9733b31a780d..69c1c09687a3 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -7935,10 +7935,11 @@ u64 rt2800_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 EXPORT_SYMBOL_GPL(rt2800_get_tsf);
 
 int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 	struct rt2x00_sta *sta_priv = (struct rt2x00_sta *)sta->drv_priv;
 	int ret = 0;
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
index 440790b92b19..83f1a44fb9b4 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
@@ -218,9 +218,7 @@ int rt2800_conf_tx(struct ieee80211_hw *hw,
 		   const struct ieee80211_tx_queue_params *params);
 u64 rt2800_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu);
+			struct ieee80211_ampdu_params *params);
 int rt2800_get_survey(struct ieee80211_hw *hw, int idx,
 		      struct survey_info *survey);
 void rt2800_disable_wpdma(struct rt2x00_dev *rt2x00dev);
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
index 6aed923a709a..7d820c395375 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
@@ -5375,13 +5375,13 @@ static int rtl8xxxu_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 
 static int
 rtl8xxxu_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		      enum ieee80211_ampdu_mlme_action action,
-		      struct ieee80211_sta *sta, u16 tid, u16 *ssn, u8 buf_size,
-		      bool amsdu)
+		      struct ieee80211_ampdu_params *params)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
 	struct device *dev = &priv->udev->dev;
 	u8 ampdu_factor, ampdu_density;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
 
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 4ae421ef30d9..f2507610314b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1371,11 +1371,13 @@ static void rtl_op_sta_notify(struct ieee80211_hw *hw,
 
 static int rtl_op_ampdu_action(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
-			       enum ieee80211_ampdu_mlme_action action,
-			       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			       u8 buf_size, bool amsdu)
+			       struct ieee80211_ampdu_params *params)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index b5bcc933a2a6..4df992de7d07 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -659,29 +659,24 @@ static int rsi_mac80211_set_key(struct ieee80211_hw *hw,
  *				 informs the f/w regarding this.
  * @hw: Pointer to the ieee80211_hw structure.
  * @vif: Pointer to the ieee80211_vif structure.
- * @action: ieee80211_ampdu_mlme_action enum.
- * @sta: Pointer to the ieee80211_sta structure.
- * @tid: Traffic identifier.
- * @ssn: Pointer to ssn value.
- * @buf_size: Buffer size (for kernel version > 2.6.38).
- * @amsdu: is AMSDU in AMPDU allowed
+ * @params: Pointer to A-MPDU action parameters
  *
  * Return: status: 0 on success, negative error code on failure.
  */
 static int rsi_mac80211_ampdu_action(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
-				     enum ieee80211_ampdu_mlme_action action,
-				     struct ieee80211_sta *sta,
-				     unsigned short tid,
-				     unsigned short *ssn,
-				     unsigned char buf_size,
-				     bool amsdu)
+				     struct ieee80211_ampdu_params *params)
 {
 	int status = -EOPNOTSUPP;
 	struct rsi_hw *adapter = hw->priv;
 	struct rsi_common *common = adapter->priv;
 	u16 seq_no = 0;
 	u8 ii = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 
 	for (ii = 0; ii < RSI_MAX_VIFS; ii++) {
 		if (vif == adapter->vifs[ii])
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 06321c799c90..d0ddcde6c695 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -2129,9 +2129,7 @@ void cw1200_mcast_timeout(unsigned long arg)
 
 int cw1200_ampdu_action(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
 	/* Aggregation is implemented fully in firmware,
 	 * including block ack negotiation. Do not allow
diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h
index bebb3379017f..a0bacaa39b31 100644
--- a/drivers/net/wireless/st/cw1200/sta.h
+++ b/drivers/net/wireless/st/cw1200/sta.h
@@ -109,9 +109,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			     u32 changed);
 int cw1200_ampdu_action(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu);
+			struct ieee80211_ampdu_params *params);
 
 void cw1200_suspend_resume(struct cw1200_common *priv,
 			  struct wsm_suspend_resume *arg);
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index d1109c4f0f0d..45662cf3169f 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -5187,14 +5187,16 @@ out:
 
 static int wl1271_op_ampdu_action(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  enum ieee80211_ampdu_mlme_action action,
-				  struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				  u8 buf_size, bool amsdu)
+				  struct ieee80211_ampdu_params *params)
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
 	int ret;
 	u8 hlid, *ba_bitmap;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	wl1271_debug(DEBUG_MAC80211, "mac80211 ampdu action %d tid %d", action,
 		     tid);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index bdee1cc19c7e..6c9c559394b0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2702,6 +2702,33 @@ enum ieee80211_ampdu_mlme_action {
 	IEEE80211_AMPDU_TX_OPERATIONAL,
 };
 
+/**
+ * struct ieee80211_ampdu_params - AMPDU action parameters
+ *
+ * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action.
+ * @sta: peer of this AMPDU session
+ * @tid: tid of the BA session
+ * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When
+ *	action is set to %IEEE80211_AMPDU_RX_START the driver passes back the
+ *	actual ssn value used to start the session and writes the value here.
+ * @buf_size: reorder buffer size (number of subframes). Valid only when the
+ *	action is set to %IEEE80211_AMPDU_RX_START or
+ *	%IEEE80211_AMPDU_TX_OPERATIONAL
+ * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU.
+ *	Valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL
+ * @timeout: BA session timeout. Valid only when the action is set to
+ *	%IEEE80211_AMPDU_RX_START
+ */
+struct ieee80211_ampdu_params {
+	enum ieee80211_ampdu_mlme_action action;
+	struct ieee80211_sta *sta;
+	u16 tid;
+	u16 ssn;
+	u8 buf_size;
+	bool amsdu;
+	u16 timeout;
+};
+
 /**
  * enum ieee80211_frame_release_type - frame release reason
  * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll
@@ -3046,15 +3073,9 @@ enum ieee80211_reconfig_type {
  * @ampdu_action: Perform a certain A-MPDU action
  * 	The RA/TID combination determines the destination and TID we want
  * 	the ampdu action to be performed for. The action is defined through
- * 	ieee80211_ampdu_mlme_action. Starting sequence number (@ssn)
- * 	is the first frame we expect to perform the action on. Notice
- * 	that TX/RX_STOP can pass NULL for this parameter.
- *	The @buf_size parameter is valid only when the action is set to
- *	%IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and
- *	indicates the reorder buffer size (number of subframes) for this
- *	session.
+ *	ieee80211_ampdu_mlme_action.
  *	When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver
- *	may neither send aggregates containing more subframes than this
+ *	may neither send aggregates containing more subframes than @buf_size
  *	nor send aggregates in a way that lost frames would exceed the
  *	buffer size. If just limiting the aggregate size, this would be
  *	possible with a buf_size of 8:
@@ -3065,9 +3086,6 @@ enum ieee80211_reconfig_type {
  *	buffer size of 8. Correct ways to retransmit #1 would be:
  *	 - TX:       1 or 18 or 81
  *	Even "189" would be wrong since 1 could be lost again.
- *	The @amsdu parameter is valid when the action is set to
- *	%IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's ability
- *	to receive A-MSDU within A-MPDU.
  *
  *	Returns a negative error code on failure.
  *	The callback can sleep.
@@ -3409,9 +3427,7 @@ struct ieee80211_ops {
 	int (*tx_last_beacon)(struct ieee80211_hw *hw);
 	int (*ampdu_action)(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
-			    enum ieee80211_ampdu_mlme_action action,
-			    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			    u8 buf_size, bool amsdu);
+			    struct ieee80211_ampdu_params *params);
 	int (*get_survey)(struct ieee80211_hw *hw, int idx,
 		struct survey_info *survey);
 	void (*rfkill_poll)(struct ieee80211_hw *hw);
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 78672737fe3e..ec80db7c955c 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -7,6 +7,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -61,6 +62,14 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 {
 	struct ieee80211_local *local = sta->local;
 	struct tid_ampdu_rx *tid_rx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_RX_STOP,
+		.tid = tid,
+		.amsdu = false,
+		.timeout = 0,
+		.ssn = 0,
+	};
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
 
@@ -78,8 +87,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 	       initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator",
 	       (int)reason);
 
-	if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
-			     &sta->sta, tid, NULL, 0, false))
+	if (drv_ampdu_action(local, sta->sdata, &params))
 		sdata_info(sta->sdata,
 			   "HW problem - can not stop rx aggregation for %pM tid %d\n",
 			   sta->sta.addr, tid);
@@ -237,6 +245,15 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 {
 	struct ieee80211_local *local = sta->sdata->local;
 	struct tid_ampdu_rx *tid_agg_rx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_RX_START,
+		.tid = tid,
+		.amsdu = false,
+		.timeout = timeout,
+		.ssn = start_seq_num,
+	};
+
 	int i, ret = -EOPNOTSUPP;
 	u16 status = WLAN_STATUS_REQUEST_DECLINED;
 
@@ -275,6 +292,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	/* make sure the size doesn't exceed the maximum supported by the hw */
 	if (buf_size > local->hw.max_rx_aggregation_subframes)
 		buf_size = local->hw.max_rx_aggregation_subframes;
+	params.buf_size = buf_size;
 
 	/* examine state machine */
 	mutex_lock(&sta->ampdu_mlme.mtx);
@@ -322,8 +340,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	for (i = 0; i < buf_size; i++)
 		__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
 
-	ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
-			       &sta->sta, tid, &start_seq_num, buf_size, false);
+	ret = drv_ampdu_action(local, sta->sdata, &params);
 	ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
 	       sta->sta.addr, tid, ret);
 	if (ret) {
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ff757181b0a8..4932e9f243a2 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -7,6 +7,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -295,7 +296,14 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 {
 	struct ieee80211_local *local = sta->local;
 	struct tid_ampdu_tx *tid_tx;
-	enum ieee80211_ampdu_mlme_action action;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.tid = tid,
+		.buf_size = 0,
+		.amsdu = false,
+		.timeout = 0,
+		.ssn = 0,
+	};
 	int ret;
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
@@ -304,10 +312,10 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 	case AGG_STOP_DECLINED:
 	case AGG_STOP_LOCAL_REQUEST:
 	case AGG_STOP_PEER_REQUEST:
-		action = IEEE80211_AMPDU_TX_STOP_CONT;
+		params.action = IEEE80211_AMPDU_TX_STOP_CONT;
 		break;
 	case AGG_STOP_DESTROY_STA:
-		action = IEEE80211_AMPDU_TX_STOP_FLUSH;
+		params.action = IEEE80211_AMPDU_TX_STOP_FLUSH;
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -330,9 +338,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 		spin_unlock_bh(&sta->lock);
 		if (reason != AGG_STOP_DESTROY_STA)
 			return -EALREADY;
-		ret = drv_ampdu_action(local, sta->sdata,
-				       IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
-				       &sta->sta, tid, NULL, 0, false);
+		params.action = IEEE80211_AMPDU_TX_STOP_FLUSH_CONT;
+		ret = drv_ampdu_action(local, sta->sdata, &params);
 		WARN_ON_ONCE(ret);
 		return 0;
 	}
@@ -381,8 +388,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 					WLAN_BACK_INITIATOR;
 	tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST;
 
-	ret = drv_ampdu_action(local, sta->sdata, action,
-			       &sta->sta, tid, NULL, 0, false);
+	ret = drv_ampdu_action(local, sta->sdata, &params);
 
 	/* HW shall not deny going back to legacy */
 	if (WARN_ON(ret)) {
@@ -445,7 +451,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	struct tid_ampdu_tx *tid_tx;
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	u16 start_seq_num;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_TX_START,
+		.tid = tid,
+		.buf_size = 0,
+		.amsdu = false,
+		.timeout = 0,
+	};
 	int ret;
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -467,10 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	 */
 	synchronize_net();
 
-	start_seq_num = sta->tid_seq[tid] >> 4;
-
-	ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
-			       &sta->sta, tid, &start_seq_num, 0, false);
+	params.ssn = sta->tid_seq[tid] >> 4;
+	ret = drv_ampdu_action(local, sdata, &params);
 	if (ret) {
 		ht_dbg(sdata,
 		       "BA request denied - HW unavailable for %pM tid %d\n",
@@ -499,7 +510,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 
 	/* send AddBA request */
 	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
-				     tid_tx->dialog_token, start_seq_num,
+				     tid_tx->dialog_token, params.ssn,
 				     IEEE80211_MAX_AMPDU_BUF,
 				     tid_tx->timeout);
 }
@@ -684,18 +695,24 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
 					 struct sta_info *sta, u16 tid)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_TX_OPERATIONAL,
+		.tid = tid,
+		.timeout = 0,
+		.ssn = 0,
+	};
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+	params.buf_size = tid_tx->buf_size;
+	params.amsdu = tid_tx->amsdu;
 
 	ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n",
 	       sta->sta.addr, tid);
 
-	drv_ampdu_action(local, sta->sdata,
-			 IEEE80211_AMPDU_TX_OPERATIONAL,
-			 &sta->sta, tid, NULL, tid_tx->buf_size,
-			 tid_tx->amsdu);
+	drv_ampdu_action(local, sta->sdata, &params);
 
 	/*
 	 * synchronize with TX path, while splicing the TX path
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index ca1fe5576103..c258f1041d33 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -284,9 +284,7 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local,
 
 int drv_ampdu_action(struct ieee80211_local *local,
 		     struct ieee80211_sub_if_data *sdata,
-		     enum ieee80211_ampdu_mlme_action action,
-		     struct ieee80211_sta *sta, u16 tid,
-		     u16 *ssn, u8 buf_size, bool amsdu)
+		     struct ieee80211_ampdu_params *params)
 {
 	int ret = -EOPNOTSUPP;
 
@@ -296,12 +294,10 @@ int drv_ampdu_action(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_ampdu_action(local, sdata, action, sta, tid,
-			       ssn, buf_size, amsdu);
+	trace_drv_ampdu_action(local, sdata, params);
 
 	if (local->ops->ampdu_action)
-		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
-					       sta, tid, ssn, buf_size, amsdu);
+		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params);
 
 	trace_drv_return_int(local, ret);
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 154ce4b13406..18b0d65baff0 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -585,9 +585,7 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local)
 
 int drv_ampdu_action(struct ieee80211_local *local,
 		     struct ieee80211_sub_if_data *sdata,
-		     enum ieee80211_ampdu_mlme_action action,
-		     struct ieee80211_sta *sta, u16 tid,
-		     u16 *ssn, u8 buf_size, bool amsdu);
+		     struct ieee80211_ampdu_params *params);
 
 static inline int drv_get_survey(struct ieee80211_local *local, int idx,
 				struct survey_info *survey)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index a6b4442776a0..2b0a17ee907a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -80,7 +80,23 @@
 #define KEY_PR_FMT	" cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
 #define KEY_PR_ARG	__entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx
 
-
+#define AMPDU_ACTION_ENTRY	__field(enum ieee80211_ampdu_mlme_action, action)	\
+				STA_ENTRY						\
+				__field(u16, tid)					\
+				__field(u16, ssn)					\
+				__field(u8, buf_size)					\
+				__field(bool, amsdu)					\
+				__field(u16, timeout)
+#define AMPDU_ACTION_ASSIGN	STA_NAMED_ASSIGN(params->sta);				\
+				__entry->action = params->action; /* op requested */	\
+				__entry->tid = params->tid;				\
+				__entry->ssn = params->ssn;				\
+				__entry->buf_size = params->buf_size;			\
+				__entry->amsdu = params->amsdu;				\
+				__entry->timeout = params->timeout;
+#define AMPDU_ACTION_PR_FMT	STA_PR_FMT " action %d, tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d"
+#define AMPDU_ACTION_PR_ARG	STA_PR_ARG, __entry->action, __entry->tid, __entry->ssn,	\
+				__entry->buf_size, __entry->amsdu, __entry->timeout
 
 /*
  * Tracing for driver callbacks.
@@ -970,38 +986,25 @@ DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
 TRACE_EVENT(drv_ampdu_action,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 enum ieee80211_ampdu_mlme_action action,
-		 struct ieee80211_sta *sta, u16 tid,
-		 u16 *ssn, u8 buf_size, bool amsdu),
+		 struct ieee80211_ampdu_params *params),
 
-	TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu),
+	TP_ARGS(local, sdata, params),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
-		STA_ENTRY
-		__field(u32, action)
-		__field(u16, tid)
-		__field(u16, ssn)
-		__field(u8, buf_size)
-		__field(bool, amsdu)
 		VIF_ENTRY
+		AMPDU_ACTION_ENTRY
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
-		STA_ASSIGN;
-		__entry->action = action;
-		__entry->tid = tid;
-		__entry->ssn = ssn ? *ssn : 0;
-		__entry->buf_size = buf_size;
-		__entry->amsdu = amsdu;
+		AMPDU_ACTION_ASSIGN;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
-		__entry->tid, __entry->buf_size, __entry->amsdu
+		LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG
 	)
 );
 
-- 
cgit v1.2.3


From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:05 -0500
Subject: net: add rx_nohandler stat counter

This adds an rx_nohandler stat counter, along with a sysfs statistics
node, and copies the counter out via netlink as well.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@mellanox.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Tom Herbert <tom@herbertland.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <gospo@cumulusnetworks.com>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    | 3 +++
 include/uapi/linux/if_link.h | 4 ++++
 net/core/dev.c               | 6 +++++-
 net/core/net-sysfs.c         | 2 ++
 net/core/rtnetlink.c         | 2 ++
 5 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 289c2314d766..78a20cec2a0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1397,6 +1397,8 @@ enum netdev_priv_flags {
  *			do not use this in drivers
  *	@tx_dropped:	Dropped packets by core network,
  *			do not use this in drivers
+ *	@rx_nohandler:	nohandler dropped packets by core network on
+ *			inactive devices, do not use this in drivers
  *
  *	@wireless_handlers:	List of functions to handle Wireless Extensions,
  *				instead of ioctl,
@@ -1611,6 +1613,7 @@ struct net_device {
 
 	atomic_long_t		rx_dropped;
 	atomic_long_t		tx_dropped;
+	atomic_long_t		rx_nohandler;
 
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *	wireless_handlers;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a30b78090594..d3e90b91e07e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -35,6 +35,8 @@ struct rtnl_link_stats {
 	/* for cslip etc */
 	__u32	rx_compressed;
 	__u32	tx_compressed;
+
+	__u32	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The main device statistics structure */
@@ -68,6 +70,8 @@ struct rtnl_link_stats64 {
 	/* for cslip etc */
 	__u64	rx_compressed;
 	__u64	tx_compressed;
+
+	__u64	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The struct should be in sync with struct ifmap */
diff --git a/net/core/dev.c b/net/core/dev.c
index 65863e512227..f1284835b8c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4154,7 +4154,10 @@ ncls:
 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
 drop:
-		atomic_long_inc(&skb->dev->rx_dropped);
+		if (!deliver_exact)
+			atomic_long_inc(&skb->dev->rx_dropped);
+		else
+			atomic_long_inc(&skb->dev->rx_nohandler);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -7307,6 +7310,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 	}
 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b6c8a6629b39..da7dbc237a5f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -574,6 +574,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
 NETSTAT_ENTRY(tx_window_errors);
 NETSTAT_ENTRY(rx_compressed);
 NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
 
 static struct attribute *netstat_attrs[] = {
 	&dev_attr_rx_packets.attr,
@@ -599,6 +600,7 @@ static struct attribute *netstat_attrs[] = {
 	&dev_attr_tx_window_errors.attr,
 	&dev_attr_rx_compressed.attr,
 	&dev_attr_tx_compressed.attr,
+	&dev_attr_rx_nohandler.attr,
 	NULL
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d735e854f916..20d71358c143 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 
 	a->rx_compressed = b->rx_compressed;
 	a->tx_compressed = b->tx_compressed;
+
+	a->rx_nohandler = b->rx_nohandler;
 }
 
 static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
-- 
cgit v1.2.3


From bb63daf9efb4f2bcb657d7179a53bd808f978dc9 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:06 -0500
Subject: team: track sum of rx_nohandler for all slaves

CC: Jiri Pirko <jiri@resnulli.us>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 10 +++++++---
 include/linux/if_team.h |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 718ceeab4dbc..00558e139584 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -758,6 +758,8 @@ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb)
 		u64_stats_update_end(&pcpu_stats->syncp);
 
 		skb->dev = team->dev;
+	} else if (res == RX_HANDLER_EXACT) {
+		this_cpu_inc(team->pcpu_stats->rx_nohandler);
 	} else {
 		this_cpu_inc(team->pcpu_stats->rx_dropped);
 	}
@@ -1807,7 +1809,7 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	struct team *team = netdev_priv(dev);
 	struct team_pcpu_stats *p;
 	u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes;
-	u32 rx_dropped = 0, tx_dropped = 0;
+	u32 rx_dropped = 0, tx_dropped = 0, rx_nohandler = 0;
 	unsigned int start;
 	int i;
 
@@ -1828,14 +1830,16 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets	+= tx_packets;
 		stats->tx_bytes		+= tx_bytes;
 		/*
-		 * rx_dropped & tx_dropped are u32, updated
-		 * without syncp protection.
+		 * rx_dropped, tx_dropped & rx_nohandler are u32,
+		 * updated without syncp protection.
 		 */
 		rx_dropped	+= p->rx_dropped;
 		tx_dropped	+= p->tx_dropped;
+		rx_nohandler	+= p->rx_nohandler;
 	}
 	stats->rx_dropped	= rx_dropped;
 	stats->tx_dropped	= tx_dropped;
+	stats->rx_nohandler	= rx_nohandler;
 	return stats;
 }
 
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index b84e49c3a738..174f43f43aff 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -24,6 +24,7 @@ struct team_pcpu_stats {
 	struct u64_stats_sync	syncp;
 	u32			rx_dropped;
 	u32			tx_dropped;
+	u32			rx_nohandler;
 };
 
 struct team;
-- 
cgit v1.2.3


From 61d2bcae99f66a640b3dd9632180209143fb5512 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Feb 2016 21:03:07 -0800
Subject: tcp: fastopen: accept data/FIN present in SYNACK message

RFC 7413 (TCP Fast Open) 4.2.2 states that the SYNACK message
MAY include data and/or FIN

This patch adds support for the client side:

If we receive a SYNACK with payload or FIN, queue the skb instead
of ignoring it.

Since we already support the same for SYN, we refactor the existing
code and reuse it. Note we need to clone the skb, so this operation
might fail under memory pressure.

Sara Dickinson pointed out FreeBSD server Fast Open implementation
was planned to generate such SYNACK in the future.

The server side might be implemented on linux later.

Reported-by: Sara Dickinson <sara@sinodun.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  1 +
 net/ipv4/tcp_fastopen.c | 64 ++++++++++++++++++++++++++-----------------------
 net/ipv4/tcp_input.c    |  3 +++
 3 files changed, 38 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6f8f032c73e..27f4c733116d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1437,6 +1437,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 
 extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 int tcp_fastopen_reset_cipher(void *key, unsigned int len);
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
 			      struct tcp_fastopen_cookie *foc,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 55be6ac70cff..467d3e985411 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -124,6 +124,35 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
 	return false;
 }
 
+
+/* If an incoming SYN or SYNACK carries payload and/or FIN, queue a clone of
+ * it on @sk; nothing is queued when end_seq == rcv_nxt or the clone fails.
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+		return;
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_dst_drop(skb);
+	__skb_pull(skb, tcp_hdrlen(skb));
+	skb_set_owner_r(skb, sk);
+
+	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	tp->syn_data_acked = 1;
+
+	/* u64_stats_update_begin(&tp->syncp) not needed here,
+	 * as we certainly are not changing upper 32bit value (0)
+	 */
+	tp->bytes_received = skb->len;
+}
+
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
 					      struct sk_buff *skb,
 					      struct dst_entry *dst,
@@ -132,7 +161,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	struct tcp_sock *tp;
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	struct sock *child;
-	u32 end_seq;
 	bool own_req;
 
 	req->num_retrans = 0;
@@ -178,35 +206,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	tcp_init_metrics(child);
 	tcp_init_buffer_space(child);
 
-	/* Queue the data carried in the SYN packet.
-	 * We used to play tricky games with skb_get().
-	 * With lockless listener, it is a dead end.
-	 * Do not think about it.
-	 *
-	 * XXX (TFO) - we honor a zero-payload TFO request for now,
-	 * (any reason not to?) but no need to queue the skb since
-	 * there is no data. How about SYN+FIN?
-	 */
-	end_seq = TCP_SKB_CB(skb)->end_seq;
-	if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
-		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
-		if (likely(skb2)) {
-			skb_dst_drop(skb2);
-			__skb_pull(skb2, tcp_hdrlen(skb));
-			skb_set_owner_r(skb2, child);
-			__skb_queue_tail(&child->sk_receive_queue, skb2);
-			tp->syn_data_acked = 1;
-
-			/* u64_stats_update_begin(&tp->syncp) not needed here,
-			 * as we certainly are not changing upper 32bit value (0)
-			 */
-			tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
-		} else {
-			end_seq = TCP_SKB_CB(skb)->seq + 1;
-		}
-	}
-	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+	tcp_fastopen_add_skb(child, skb);
+
+	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
 	/* tcp_conn_request() is sending the SYNACK,
 	 * and queues the child into listener accept queue.
 	 */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1c2a73406261..4add3eb40e58 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5509,6 +5509,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	tp->syn_data_acked = tp->syn_data;
 	if (tp->syn_data_acked)
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+
+	tcp_fastopen_add_skb(sk, synack);
+
 	return false;
 }
 
-- 
cgit v1.2.3


From ba905f5e2f63d86ed4cfbd3d9096fb28d156f1ee Mon Sep 17 00:00:00 2001
From: Kim Jones <kim-marie.jones@intel.com>
Date: Tue, 2 Feb 2016 03:51:16 +0000
Subject: ethtool: Declare netdev_rss_key as __read_mostly.

netdev_rss_key is written to once and thereafter is read by
drivers when they are initialising. The fact that it is mostly
read and not written to makes it a candidate for a __read_mostly
declaration.

Signed-off-by: Kim Jones <kim-marie.jones@intel.com>
Signed-off-by: Alan Carey <alan.carey@intel.com>
Acked-by: Rami Rosen <rami.rosen@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 net/core/ethtool.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 78a20cec2a0a..219f53c30cb3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3744,7 +3744,7 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
 
 /* RSS keys are 40 or 52 bytes long */
 #define NETDEV_RSS_KEY_LEN 52
-extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
 void netdev_rss_key_fill(void *buffer, size_t len);
 
 int dev_get_nest_level(struct net_device *dev,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index daf04709dd3c..453c803f1c87 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -632,7 +632,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
 	return 0;
 }
 
-u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
 
 void netdev_rss_key_fill(void *buffer, size_t len)
 {
-- 
cgit v1.2.3


From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:53 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map

Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do
accurate counters without need to use BPF_XADD instruction which turned
out to be too costly for high-performance network monitoring.
In the typical use case the 'key' is the flow tuple or other long
living object that sees a lot of events per second.

bpf_map_lookup_elem() returns per-cpu area.
Example:
struct {
  u32 packets;
  u32 bytes;
} * ptr = bpf_map_lookup_elem(&map, &key);
/* ptr points to this_cpu area of the value, so the following
 * increments will not collide with other cpus
 */
ptr->packets ++;
ptr->bytes += skb->len;

bpf_update_elem() atomically creates a new element where all per-cpu
values are zero initialized and this_cpu value is populated with
given 'value'.
Note that non-per-cpu hash map always allocates new element
and then deletes old after rcu grace period to maintain atomicity
of update. Per-cpu hash map updates element values in-place.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/hashtab.c     | 275 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 229 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa6f8571de13..43ae40c8763e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd8a315..2be5f6e8bb04 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -31,21 +31,27 @@ struct bpf_htab {
 struct htab_elem {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
-	u32 hash;
+	union {
+		u32 hash;
+		u32 key_size;
+	};
 	char key[0] __aligned(8);
 };
 
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct bpf_htab *htab;
 	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
 	/* mandatory map attributes */
+	htab->map.map_type = attr->map_type;
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
@@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
+	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+		/* make sure the size for pcpu_alloc() is reasonable */
+		goto free_htab;
+
 	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+			  round_up(htab->map.key_size, 8);
+	if (percpu)
+		htab->elem_size += sizeof(void *);
+	else
+		htab->elem_size += htab->map.value_size;
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
 		goto free_htab;
 
-	if ((u64) htab->n_buckets * sizeof(struct bucket) +
-	    (u64) htab->elem_size * htab->map.max_entries >=
-	    U32_MAX - PAGE_SIZE)
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (percpu)
+		cost += (u64) round_up(htab->map.value_size, 8) *
+			num_possible_cpus() * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
-				   htab->elem_size * htab->map.max_entries,
-				   PAGE_SIZE) >> PAGE_SHIFT;
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
@@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 }
 
 /* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
@@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
+	return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
 	if (l)
 		return l->key + round_up(map->key_size, 8);
 
@@ -230,65 +253,139 @@ find_first_elem:
 	return -ENOENT;
 }
 
+
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static void htab_percpu_elem_free(struct htab_elem *l)
+{
+	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	kfree(l);
+}
+
+static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+{
+	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+
+	htab_percpu_elem_free(l);
+}
+
+static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+{
+	if (percpu) {
+		l->key_size = key_size;
+		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	} else {
+		kfree_rcu(l, rcu);
+	}
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+					 void *value, u32 key_size, u32 hash,
+					 bool percpu)
+{
+	u32 size = htab->map.value_size;
+	struct htab_elem *l_new;
+	void __percpu *pptr;
+
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!l_new)
+		return NULL;
+
+	memcpy(l_new->key, key, key_size);
+	if (percpu) {
+		/* round up value_size to 8 bytes */
+		size = round_up(size, 8);
+
+		/* alloc_percpu zero-fills */
+		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
+		if (!pptr) {
+			kfree(l_new);
+			return NULL;
+		}
+
+		/* copy true value_size bytes */
+		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		htab_elem_set_ptr(l_new, key_size, pptr);
+	} else {
+		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	}
+
+	l_new->hash = hash;
+	return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+		       u64 map_flags)
+{
+	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		return -E2BIG;
+
+	if (l_old && map_flags == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!l_old && map_flags == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct htab_elem *l_new, *l_old;
+	struct htab_elem *l_new = NULL, *l_old;
 	struct hlist_head *head;
-	struct bucket *b;
 	unsigned long flags;
-	u32 key_size;
+	struct bucket *b;
+	u32 key_size, hash;
 	int ret;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return -ENOMEM;
-
 	key_size = map->key_size;
 
-	memcpy(l_new->key, key, key_size);
-	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+	hash = htab_map_hash(key, key_size);
+
+	/* allocate new element outside of the lock, since
+	 * we're most likley going to insert it
+	 */
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+	if (!l_new)
+		return -ENOMEM;
 
-	l_new->hash = htab_map_hash(l_new->key, key_size);
-	b = __select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, hash);
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
 
-	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		ret = -E2BIG;
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
 		goto err;
-	}
 
-	if (l_old && map_flags == BPF_NOEXIST) {
-		/* elem already exists */
-		ret = -EEXIST;
-		goto err;
-	}
-
-	if (!l_old && map_flags == BPF_EXIST) {
-		/* elem doesn't exist, cannot update it */
-		ret = -ENOENT;
-		goto err;
-	}
-
-	/* add new element to the head of the list, so that concurrent
-	 * search will find it before old elem
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
@@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		atomic_inc(&htab->count);
 	}
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-
 	return 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
@@ -306,10 +402,64 @@ err:
 	return ret;
 }
 
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new = NULL, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+	u32 key_size, hash;
+	int ret;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		/* per-cpu hash map can update value in-place */
+		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
+		       value, htab->map.value_size);
+	} else {
+		l_new = alloc_htab_elem(htab, key, value, key_size,
+					hash, true);
+		if (!l_new) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		hlist_add_head_rcu(&l_new->hash_node, head);
+		atomic_inc(&htab->count);
+	}
+	ret = 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	return ret;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
 		atomic_dec(&htab->count);
-		kfree_rcu(l, rcu);
+		free_htab_elem(l, percpu, key_size);
 		ret = 0;
 	}
 
@@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab)
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
 			atomic_dec(&htab->count);
-			kfree(l);
+			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+				l->key_size = htab->map.key_size;
+				htab_percpu_elem_free(l);
+			} else {
+				kfree(l);
+			}
 		}
 	}
 }
@@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+	if (l)
+		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+	else
+		return NULL;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_update_elem = htab_percpu_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+	.ops = &htab_percpu_ops,
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
+	bpf_register_map_type(&htab_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
-- 
cgit v1.2.3


From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:54 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map

The primary use case is a latency histogram: a bpf program computes
the latency of block requests or other events and stores the histogram
in an array of 64 elements.
Since all cpus are running concurrently, a plain increment is not
accurate and bpf_xadd causes cache ping-pong, whereas this per-cpu
approach allows the fastest collision-free counters.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/arraymap.c    | 102 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 93 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 83d1926c61e4..141fb0d45731 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -151,6 +151,7 @@ struct bpf_array {
 	union {
 		char value[0] __aligned(8);
 		void *ptrs[0] __aligned(8);
+		void __percpu *pptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ae40c8763e..2ee0fde1bf96 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,6 +82,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
+	BPF_MAP_TYPE_PERCPU_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 89ebbc4d1164..b9bf1d7949ca 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -17,11 +17,39 @@
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		free_percpu(array->pptrs[i]);
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+	void __percpu *ptr;
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++) {
+		ptr = __alloc_percpu_gfp(array->elem_size, 8,
+					 GFP_USER | __GFP_NOWARN);
+		if (!ptr) {
+			bpf_array_free_percpu(array);
+			return -ENOMEM;
+		}
+		array->pptrs[i] = ptr;
+	}
+
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	struct bpf_array *array;
-	u32 elem_size, array_size;
+	u64 array_size;
+	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	elem_size = round_up(attr->value_size, 8);
 
-	/* check round_up into zero and u32 overflow */
-	if (elem_size == 0 ||
-	    attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+	array_size = sizeof(*array);
+	if (percpu)
+		array_size += (u64) attr->max_entries * sizeof(void *);
+	else
+		array_size += (u64) attr->max_entries * elem_size;
+
+	/* make sure there is no u32 overflow later in round_up() */
+	if (array_size >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-ENOMEM);
 
-	array_size = sizeof(*array) + attr->max_entries * elem_size;
 
 	/* allocate all map elements and zero-initialize them */
 	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 
 	/* copy mandatory map attributes */
+	array->map.map_type = attr->map_type;
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
-	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 	array->elem_size = elem_size;
 
+	if (!percpu)
+		goto out;
+
+	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
+
+	if (array_size >= U32_MAX - PAGE_SIZE ||
+	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+		kvfree(array);
+		return ERR_PTR(-ENOMEM);
+	}
+out:
+	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+
 	return &array->map;
 }
 
@@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
 	return array->value + array->elem_size * index;
 }
 
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return this_cpu_ptr(array->pptrs[index]);
+}
+
 /* Called from syscall */
 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;
 
-	if (map_flags == BPF_NOEXIST)
+	if (unlikely(map_flags == BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;
 
-	memcpy(array->value + array->elem_size * index, value, map->value_size);
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		memcpy(this_cpu_ptr(array->pptrs[index]),
+		       value, map->value_size);
+	else
+		memcpy(array->value + array->elem_size * index,
+		       value, map->value_size);
 	return 0;
 }
 
@@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		bpf_array_free_percpu(array);
+
 	kvfree(array);
 }
 
@@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
 	.type = BPF_MAP_TYPE_ARRAY,
 };
 
+static const struct bpf_map_ops percpu_array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = percpu_array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+	.ops = &percpu_array_ops,
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
 static int __init register_array_map(void)
 {
 	bpf_register_map_type(&array_type);
+	bpf_register_map_type(&percpu_array_type);
 	return 0;
 }
 late_initcall(register_array_map);
-- 
cgit v1.2.3


From 15a07b33814d14ca817887dbea8530728dc0fbe4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:55 -0800
Subject: bpf: add lookup/update support for per-cpu hash and array maps

The functions bpf_map_lookup_elem(map, key, value) and
bpf_map_update_elem(map, key, value, flags) need to get/set
values from all-cpus for per-cpu hash and array maps,
so that user space can aggregate/update them as necessary.

Example of single counter aggregation in user space:
  unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  long values[nr_cpus];
  long value = 0;

  bpf_lookup_elem(fd, key, values);
  for (i = 0; i < nr_cpus; i++)
    value += values[i];

User space must provide a buffer of round_up(value_size, 8) * nr_cpus
bytes to get/set values, since the kernel copies the per-cpu values
with 'long'-sized accesses to try to copy each counter atomically.
This is best-effort only, since bpf programs and user space are racing
to access the same memory.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   | 23 ++++++++++++++
 kernel/bpf/arraymap.c | 64 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/hashtab.c  | 83 +++++++++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/syscall.c  | 57 ++++++++++++++++++++++++-----------
 4 files changed, 201 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 141fb0d45731..90ee6ab24bc5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname);
 
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+			   u64 flags);
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+			    u64 flags);
+
+/* memcpy that is used with 8-byte aligned pointers, multiple-of-8 size and
+ * forced to use 'long' read/writes to try to atomically copy long counters.
+ * Best-effort only.  No barriers here, since it _will_ race with concurrent
+ * updates from BPF programs. Called from bpf syscall and mostly used with
+ * size 8 or 16 bytes, so ask compiler to inline it.
+ */
+static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
+{
+	const long *lsrc = src;
+	long *ldst = dst;
+
+	size /= sizeof(long);
+	while (size--)
+		*ldst++ = *lsrc++;
+}
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 #else
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b9bf1d7949ca..bd3bdf2486a7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
 	return this_cpu_ptr(array->pptrs[index]);
 }
 
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu, off = 0;
+	u32 size;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -ENOENT;
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	pptr = array->pptrs[index];
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 /* Called from syscall */
 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+			    u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu, off = 0;
+	u32 size;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	if (unlikely(index >= array->map.max_entries))
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+
+	if (unlikely(map_flags == BPF_NOEXIST))
+		/* all elements already exist */
+		return -EEXIST;
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	pptr = array->pptrs[index];
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static int array_map_delete_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2be5f6e8bb04..fd5db8fe9360 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu)
+					 bool percpu, bool onallcpus)
 {
 	u32 size = htab->map.value_size;
 	struct htab_elem *l_new;
@@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			return NULL;
 		}
 
-		/* copy true value_size bytes */
-		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		if (!onallcpus) {
+			/* copy true value_size bytes */
+			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		} else {
+			int off = 0, cpu;
+
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+						value + off, size);
+				off += size;
+			}
+		}
 		htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
@@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* allocate new element outside of the lock, since
 	 * we're most likley going to insert it
 	 */
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
 	if (!l_new)
 		return -ENOMEM;
 
@@ -402,8 +412,9 @@ err:
 	return ret;
 }
 
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags)
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags,
+					 bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		goto err;
 
 	if (l_old) {
+		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+		u32 size = htab->map.value_size;
+
 		/* per-cpu hash map can update value in-place */
-		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
-		       value, htab->map.value_size);
+		if (!onallcpus) {
+			memcpy(this_cpu_ptr(pptr), value, size);
+		} else {
+			int off = 0, cpu;
+
+			size = round_up(size, 8);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+						value + off, size);
+				off += size;
+			}
+		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true);
+					hash, true, onallcpus);
 		if (!l_new) {
 			ret = -ENOMEM;
 			goto err;
@@ -455,6 +479,12 @@ err:
 	return ret;
 }
 
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
@@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 		return NULL;
 }
 
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+	struct htab_elem *l;
+	void __percpu *pptr;
+	int ret = -ENOENT;
+	int cpu, off = 0;
+	u32 size;
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(map->value_size, 8);
+	rcu_read_lock();
+	l = __htab_map_lookup_elem(map, key);
+	if (!l)
+		goto out;
+	pptr = htab_elem_get_ptr(l, map->key_size);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(pptr, cpu), size);
+		off += size;
+	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+			   u64 map_flags)
+{
+	return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+}
+
 static const struct bpf_map_ops htab_percpu_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 637397059f76..c95a753c2007 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
 	void *key, *value, *ptr;
+	u32 value_size;
 	struct fd f;
 	int err;
 
@@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else
+		value_size = map->value_size;
+
 	err = -ENOMEM;
-	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
-	rcu_read_lock();
-	ptr = map->ops->map_lookup_elem(map, key);
-	if (ptr)
-		memcpy(value, ptr, map->value_size);
-	rcu_read_unlock();
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+		err = bpf_percpu_hash_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_copy(map, key, value);
+	} else {
+		rcu_read_lock();
+		ptr = map->ops->map_lookup_elem(map, key);
+		if (ptr)
+			memcpy(value, ptr, value_size);
+		rcu_read_unlock();
+		err = ptr ? 0 : -ENOENT;
+	}
 
-	err = -ENOENT;
-	if (!ptr)
+	if (err)
 		goto free_value;
 
 	err = -EFAULT;
-	if (copy_to_user(uvalue, value, map->value_size) != 0)
+	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
 	err = 0;
@@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
 	void *key, *value;
+	u32 value_size;
 	struct fd f;
 	int err;
 
@@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else
+		value_size = map->value_size;
+
 	err = -ENOMEM;
-	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
 	err = -EFAULT;
-	if (copy_from_user(value, uvalue, map->value_size) != 0)
+	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
-	/* eBPF program that use maps are running under rcu_read_lock(),
-	 * therefore all map accessors rely on this fact, so do the same here
-	 */
-	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value, attr->flags);
-	rcu_read_unlock();
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+		err = bpf_percpu_hash_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else {
+		rcu_read_lock();
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		rcu_read_unlock();
+	}
 
 free_value:
 	kfree(value);
-- 
cgit v1.2.3


From 7267bcda332e2782e21a559f3b1b859a35b4062d Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sat, 16 Jan 2016 00:48:52 +0100
Subject: bcma: identify bus cores (devices) found on BCM47189
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing defines and print proper names.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/scan.c       | 3 +++
 include/linux/bcma/bcma.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c
index df806b9c5490..5ee731132365 100644
--- a/drivers/bcma/scan.c
+++ b/drivers/bcma/scan.c
@@ -98,6 +98,9 @@ static const struct bcma_device_id_name bcma_bcm_device_names[] = {
 	{ BCMA_CORE_SHIM, "SHIM" },
 	{ BCMA_CORE_PCIE2, "PCIe Gen2" },
 	{ BCMA_CORE_ARM_CR4, "ARM CR4" },
+	{ BCMA_CORE_GCI, "GCI" },
+	{ BCMA_CORE_CMEM, "CNDS DDR2/3 memory controller" },
+	{ BCMA_CORE_ARM_CA7, "ARM CA7" },
 	{ BCMA_CORE_DEFAULT, "Default" },
 };
 
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 3feb1b2d75d8..991ebb4c2015 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -151,6 +151,8 @@ struct bcma_host_ops {
 #define BCMA_CORE_PCIE2			0x83C	/* PCI Express Gen2 */
 #define BCMA_CORE_USB30_DEV		0x83D
 #define BCMA_CORE_ARM_CR4		0x83E
+#define BCMA_CORE_GCI			0x840
+#define BCMA_CORE_CMEM			0x846	/* CNDS DDR2/3 memory controller */
 #define BCMA_CORE_ARM_CA7		0x847
 #define BCMA_CORE_SYS_MEM		0x849
 #define BCMA_CORE_DEFAULT		0xFFF
-- 
cgit v1.2.3


From 67edf354faaf93156646e741483b2313bc756c0f Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Tue, 19 Jan 2016 08:45:25 +0100
Subject: bcma: use _PMU_ in all names of PMU registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PMU (Power Management Unit) seems to be a separate piece of hardware,
just accessed using ChipCommon core registers. In recent Broadcom
chipsets PMU is not bound to CC but is available as a separate core.

To make code cleaner & easier to review (for a correct R/W access) use
clearer names.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/driver_chipcommon_pmu.c        | 46 ++++++++++++++---------------
 drivers/net/wireless/broadcom/b43/main.c    |  8 ++---
 include/linux/bcma/bcma_driver_chipcommon.h | 12 ++++----
 3 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/bcma/driver_chipcommon_pmu.c b/drivers/bcma/driver_chipcommon_pmu.c
index fe0d48cb1778..472f39dc5a38 100644
--- a/drivers/bcma/driver_chipcommon_pmu.c
+++ b/drivers/bcma/driver_chipcommon_pmu.c
@@ -15,44 +15,44 @@
 
 u32 bcma_chipco_pll_read(struct bcma_drv_cc *cc, u32 offset)
 {
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PLLCTL_ADDR);
-	return bcma_cc_read32(cc, BCMA_CC_PLLCTL_DATA);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	return bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_read);
 
 void bcma_chipco_pll_write(struct bcma_drv_cc *cc, u32 offset, u32 value)
 {
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PLLCTL_ADDR);
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_DATA, value);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_write);
 
 void bcma_chipco_pll_maskset(struct bcma_drv_cc *cc, u32 offset, u32 mask,
 			     u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PLLCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_PLLCTL_DATA, mask, set);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	bcma_cc_maskset32(cc, BCMA_CC_PMU_PLLCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_maskset);
 
 void bcma_chipco_chipctl_maskset(struct bcma_drv_cc *cc,
 				 u32 offset, u32 mask, u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_CHIPCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_CHIPCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_CHIPCTL_DATA, mask, set);
+	bcma_cc_write32(cc, BCMA_CC_PMU_CHIPCTL_ADDR, offset);
+	bcma_cc_read32(cc, BCMA_CC_PMU_CHIPCTL_ADDR);
+	bcma_cc_maskset32(cc, BCMA_CC_PMU_CHIPCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_chipctl_maskset);
 
 void bcma_chipco_regctl_maskset(struct bcma_drv_cc *cc, u32 offset, u32 mask,
 				u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_REGCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_REGCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_REGCTL_DATA, mask, set);
+	bcma_cc_write32(cc, BCMA_CC_PMU_REGCTL_ADDR, offset);
+	bcma_cc_read32(cc, BCMA_CC_PMU_REGCTL_ADDR);
+	bcma_cc_maskset32(cc, BCMA_CC_PMU_REGCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_regctl_maskset);
 
@@ -472,8 +472,8 @@ u32 bcma_pmu_get_cpu_clock(struct bcma_drv_cc *cc)
 static void bcma_pmu_spuravoid_pll_write(struct bcma_drv_cc *cc, u32 offset,
 					 u32 value)
 {
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR, offset);
-	bcma_cc_write32(cc, BCMA_CC_PLLCTL_DATA, value);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
 }
 
 void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid)
@@ -497,20 +497,20 @@ void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid)
 		       bus->chipinfo.id == BCMA_CHIP_ID_BCM53572) ? 6 : 0;
 
 		/* RMW only the P1 divider */
-		bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR,
+		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
 				BCMA_CC_PMU_PLL_CTL0 + phypll_offset);
-		tmp = bcma_cc_read32(cc, BCMA_CC_PLLCTL_DATA);
+		tmp = bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 		tmp &= (~(BCMA_CC_PMU1_PLL0_PC0_P1DIV_MASK));
 		tmp |= (bcm5357_bcm43236_p1div[spuravoid] << BCMA_CC_PMU1_PLL0_PC0_P1DIV_SHIFT);
-		bcma_cc_write32(cc, BCMA_CC_PLLCTL_DATA, tmp);
+		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
 
 		/* RMW only the int feedback divider */
-		bcma_cc_write32(cc, BCMA_CC_PLLCTL_ADDR,
+		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
 				BCMA_CC_PMU_PLL_CTL2 + phypll_offset);
-		tmp = bcma_cc_read32(cc, BCMA_CC_PLLCTL_DATA);
+		tmp = bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 		tmp &= ~(BCMA_CC_PMU1_PLL0_PC2_NDIV_INT_MASK);
 		tmp |= (bcm5357_bcm43236_ndiv[spuravoid]) << BCMA_CC_PMU1_PLL0_PC2_NDIV_INT_SHIFT;
-		bcma_cc_write32(cc, BCMA_CC_PLLCTL_DATA, tmp);
+		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
 
 		tmp = BCMA_CC_PMU_CTL_PLL_UPD;
 		break;
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index ec013fbd6a81..c279211e49f9 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -1215,10 +1215,10 @@ void b43_wireless_core_phy_pll_reset(struct b43_wldev *dev)
 	case B43_BUS_BCMA:
 		bcma_cc = &dev->dev->bdev->bus->drv_cc;
 
-		bcma_cc_write32(bcma_cc, BCMA_CC_CHIPCTL_ADDR, 0);
-		bcma_cc_mask32(bcma_cc, BCMA_CC_CHIPCTL_DATA, ~0x4);
-		bcma_cc_set32(bcma_cc, BCMA_CC_CHIPCTL_DATA, 0x4);
-		bcma_cc_mask32(bcma_cc, BCMA_CC_CHIPCTL_DATA, ~0x4);
+		bcma_cc_write32(bcma_cc, BCMA_CC_PMU_CHIPCTL_ADDR, 0);
+		bcma_cc_mask32(bcma_cc, BCMA_CC_PMU_CHIPCTL_DATA, ~0x4);
+		bcma_cc_set32(bcma_cc, BCMA_CC_PMU_CHIPCTL_DATA, 0x4);
+		bcma_cc_mask32(bcma_cc, BCMA_CC_PMU_CHIPCTL_DATA, ~0x4);
 		break;
 #endif
 #ifdef CONFIG_B43_SSB
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index db51a6ffb7d6..96d8d56f240f 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -351,12 +351,12 @@
 #define BCMA_CC_PMU_RES_REQTS		0x0640 /* PMU res req timer sel */
 #define BCMA_CC_PMU_RES_REQT		0x0644 /* PMU res req timer */
 #define BCMA_CC_PMU_RES_REQM		0x0648 /* PMU res req mask */
-#define BCMA_CC_CHIPCTL_ADDR		0x0650
-#define BCMA_CC_CHIPCTL_DATA		0x0654
-#define BCMA_CC_REGCTL_ADDR		0x0658
-#define BCMA_CC_REGCTL_DATA		0x065C
-#define BCMA_CC_PLLCTL_ADDR		0x0660
-#define BCMA_CC_PLLCTL_DATA		0x0664
+#define BCMA_CC_PMU_CHIPCTL_ADDR	0x0650
+#define BCMA_CC_PMU_CHIPCTL_DATA	0x0654
+#define BCMA_CC_PMU_REGCTL_ADDR		0x0658
+#define BCMA_CC_PMU_REGCTL_DATA		0x065C
+#define BCMA_CC_PMU_PLLCTL_ADDR		0x0660
+#define BCMA_CC_PMU_PLLCTL_DATA		0x0664
 #define BCMA_CC_PMU_STRAPOPT		0x0668 /* (corerev >= 28) */
 #define BCMA_CC_PMU_XTAL_FREQ		0x066C /* (pmurev >= 10) */
 #define  BCMA_CC_PMU_XTAL_FREQ_ILPCTL_MASK	0x00001FFF
-- 
cgit v1.2.3


From b3c47afbf54d86daa0473895e8ca9e8b663f5c1a Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Tue, 19 Jan 2016 08:45:26 +0100
Subject: bcma: support PMU present as separated bus core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On recent Broadcom chipsets the PMU is present as a separate core and it
can't be accessed using ChipCommon anymore as it fails with e.g.:
[    0.000577] Unhandled fault: external abort on non-linefetch (0x1008) at 0xf1000604

Solve it by using a new (PMU) core pointer set to ChipCommon or PMU
depending on the hardware capabilities.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/driver_chipcommon.c            |  2 +-
 drivers/bcma/driver_chipcommon_pmu.c        | 94 ++++++++++++++++-------------
 include/linux/bcma/bcma_driver_chipcommon.h | 19 ++++++
 3 files changed, 72 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c
index b7c8a8d4e6d1..36ee221e298f 100644
--- a/drivers/bcma/driver_chipcommon.c
+++ b/drivers/bcma/driver_chipcommon.c
@@ -185,7 +185,7 @@ u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks)
 			ticks = 2;
 		else if (ticks > maxt)
 			ticks = maxt;
-		bcma_cc_write32(cc, BCMA_CC_PMU_WATCHDOG, ticks);
+		bcma_pmu_write32(cc, BCMA_CC_PMU_WATCHDOG, ticks);
 	} else {
 		struct bcma_bus *bus = cc->core->bus;
 
diff --git a/drivers/bcma/driver_chipcommon_pmu.c b/drivers/bcma/driver_chipcommon_pmu.c
index 472f39dc5a38..f1eb4d3e1d57 100644
--- a/drivers/bcma/driver_chipcommon_pmu.c
+++ b/drivers/bcma/driver_chipcommon_pmu.c
@@ -15,44 +15,44 @@
 
 u32 bcma_chipco_pll_read(struct bcma_drv_cc *cc, u32 offset)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
-	return bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	return bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_read);
 
 void bcma_chipco_pll_write(struct bcma_drv_cc *cc, u32 offset, u32 value)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_write);
 
 void bcma_chipco_pll_maskset(struct bcma_drv_cc *cc, u32 offset, u32 mask,
 			     u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_PMU_PLLCTL_DATA, mask, set);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_ADDR);
+	bcma_pmu_maskset32(cc, BCMA_CC_PMU_PLLCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_pll_maskset);
 
 void bcma_chipco_chipctl_maskset(struct bcma_drv_cc *cc,
 				 u32 offset, u32 mask, u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_CHIPCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PMU_CHIPCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_PMU_CHIPCTL_DATA, mask, set);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_CHIPCTL_ADDR, offset);
+	bcma_pmu_read32(cc, BCMA_CC_PMU_CHIPCTL_ADDR);
+	bcma_pmu_maskset32(cc, BCMA_CC_PMU_CHIPCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_chipctl_maskset);
 
 void bcma_chipco_regctl_maskset(struct bcma_drv_cc *cc, u32 offset, u32 mask,
 				u32 set)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_REGCTL_ADDR, offset);
-	bcma_cc_read32(cc, BCMA_CC_PMU_REGCTL_ADDR);
-	bcma_cc_maskset32(cc, BCMA_CC_PMU_REGCTL_DATA, mask, set);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_REGCTL_ADDR, offset);
+	bcma_pmu_read32(cc, BCMA_CC_PMU_REGCTL_ADDR);
+	bcma_pmu_maskset32(cc, BCMA_CC_PMU_REGCTL_DATA, mask, set);
 }
 EXPORT_SYMBOL_GPL(bcma_chipco_regctl_maskset);
 
@@ -60,18 +60,18 @@ static u32 bcma_pmu_xtalfreq(struct bcma_drv_cc *cc)
 {
 	u32 ilp_ctl, alp_hz;
 
-	if (!(bcma_cc_read32(cc, BCMA_CC_PMU_STAT) &
+	if (!(bcma_pmu_read32(cc, BCMA_CC_PMU_STAT) &
 	      BCMA_CC_PMU_STAT_EXT_LPO_AVAIL))
 		return 0;
 
-	bcma_cc_write32(cc, BCMA_CC_PMU_XTAL_FREQ,
-			BIT(BCMA_CC_PMU_XTAL_FREQ_MEASURE_SHIFT));
+	bcma_pmu_write32(cc, BCMA_CC_PMU_XTAL_FREQ,
+			 BIT(BCMA_CC_PMU_XTAL_FREQ_MEASURE_SHIFT));
 	usleep_range(1000, 2000);
 
-	ilp_ctl = bcma_cc_read32(cc, BCMA_CC_PMU_XTAL_FREQ);
+	ilp_ctl = bcma_pmu_read32(cc, BCMA_CC_PMU_XTAL_FREQ);
 	ilp_ctl &= BCMA_CC_PMU_XTAL_FREQ_ILPCTL_MASK;
 
-	bcma_cc_write32(cc, BCMA_CC_PMU_XTAL_FREQ, 0);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_XTAL_FREQ, 0);
 
 	alp_hz = ilp_ctl * 32768 / 4;
 	return (alp_hz + 50000) / 100000 * 100;
@@ -127,8 +127,8 @@ static void bcma_pmu2_pll_init0(struct bcma_drv_cc *cc, u32 xtalfreq)
 		mask = (u32)~(BCMA_RES_4314_HT_AVAIL |
 			      BCMA_RES_4314_MACPHY_CLK_AVAIL);
 
-		bcma_cc_mask32(cc, BCMA_CC_PMU_MINRES_MSK, mask);
-		bcma_cc_mask32(cc, BCMA_CC_PMU_MAXRES_MSK, mask);
+		bcma_pmu_mask32(cc, BCMA_CC_PMU_MINRES_MSK, mask);
+		bcma_pmu_mask32(cc, BCMA_CC_PMU_MAXRES_MSK, mask);
 		bcma_wait_value(cc->core, BCMA_CLKCTLST,
 				BCMA_CLKCTLST_HAVEHT, 0, 20000);
 		break;
@@ -140,7 +140,7 @@ static void bcma_pmu2_pll_init0(struct bcma_drv_cc *cc, u32 xtalfreq)
 
 	/* Flush */
 	if (cc->pmu.rev >= 2)
-		bcma_cc_set32(cc, BCMA_CC_PMU_CTL, BCMA_CC_PMU_CTL_PLL_UPD);
+		bcma_pmu_set32(cc, BCMA_CC_PMU_CTL, BCMA_CC_PMU_CTL_PLL_UPD);
 
 	/* TODO: Do we need to update OTP? */
 }
@@ -195,9 +195,9 @@ static void bcma_pmu_resources_init(struct bcma_drv_cc *cc)
 
 	/* Set the resource masks. */
 	if (min_msk)
-		bcma_cc_write32(cc, BCMA_CC_PMU_MINRES_MSK, min_msk);
+		bcma_pmu_write32(cc, BCMA_CC_PMU_MINRES_MSK, min_msk);
 	if (max_msk)
-		bcma_cc_write32(cc, BCMA_CC_PMU_MAXRES_MSK, max_msk);
+		bcma_pmu_write32(cc, BCMA_CC_PMU_MAXRES_MSK, max_msk);
 
 	/*
 	 * Add some delay; allow resources to come up and settle.
@@ -269,23 +269,33 @@ static void bcma_pmu_workarounds(struct bcma_drv_cc *cc)
 
 void bcma_pmu_early_init(struct bcma_drv_cc *cc)
 {
+	struct bcma_bus *bus = cc->core->bus;
 	u32 pmucap;
 
-	pmucap = bcma_cc_read32(cc, BCMA_CC_PMU_CAP);
+	if (cc->core->id.rev >= 35 &&
+	    cc->capabilities_ext & BCMA_CC_CAP_EXT_AOB_PRESENT) {
+		cc->pmu.core = bcma_find_core(bus, BCMA_CORE_PMU);
+		if (!cc->pmu.core)
+			bcma_warn(bus, "Couldn't find expected PMU core");
+	}
+	if (!cc->pmu.core)
+		cc->pmu.core = cc->core;
+
+	pmucap = bcma_pmu_read32(cc, BCMA_CC_PMU_CAP);
 	cc->pmu.rev = (pmucap & BCMA_CC_PMU_CAP_REVISION);
 
-	bcma_debug(cc->core->bus, "Found rev %u PMU (capabilities 0x%08X)\n",
-		   cc->pmu.rev, pmucap);
+	bcma_debug(bus, "Found rev %u PMU (capabilities 0x%08X)\n", cc->pmu.rev,
+		   pmucap);
 }
 
 void bcma_pmu_init(struct bcma_drv_cc *cc)
 {
 	if (cc->pmu.rev == 1)
-		bcma_cc_mask32(cc, BCMA_CC_PMU_CTL,
-			      ~BCMA_CC_PMU_CTL_NOILPONW);
+		bcma_pmu_mask32(cc, BCMA_CC_PMU_CTL,
+				~BCMA_CC_PMU_CTL_NOILPONW);
 	else
-		bcma_cc_set32(cc, BCMA_CC_PMU_CTL,
-			     BCMA_CC_PMU_CTL_NOILPONW);
+		bcma_pmu_set32(cc, BCMA_CC_PMU_CTL,
+			       BCMA_CC_PMU_CTL_NOILPONW);
 
 	bcma_pmu_pll_init(cc);
 	bcma_pmu_resources_init(cc);
@@ -472,8 +482,8 @@ u32 bcma_pmu_get_cpu_clock(struct bcma_drv_cc *cc)
 static void bcma_pmu_spuravoid_pll_write(struct bcma_drv_cc *cc, u32 offset,
 					 u32 value)
 {
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
-	bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR, offset);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, value);
 }
 
 void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid)
@@ -497,20 +507,20 @@ void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid)
 		       bus->chipinfo.id == BCMA_CHIP_ID_BCM53572) ? 6 : 0;
 
 		/* RMW only the P1 divider */
-		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
+		bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
 				BCMA_CC_PMU_PLL_CTL0 + phypll_offset);
-		tmp = bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
+		tmp = bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 		tmp &= (~(BCMA_CC_PMU1_PLL0_PC0_P1DIV_MASK));
 		tmp |= (bcm5357_bcm43236_p1div[spuravoid] << BCMA_CC_PMU1_PLL0_PC0_P1DIV_SHIFT);
-		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
+		bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
 
 		/* RMW only the int feedback divider */
-		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
+		bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_ADDR,
 				BCMA_CC_PMU_PLL_CTL2 + phypll_offset);
-		tmp = bcma_cc_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
+		tmp = bcma_pmu_read32(cc, BCMA_CC_PMU_PLLCTL_DATA);
 		tmp &= ~(BCMA_CC_PMU1_PLL0_PC2_NDIV_INT_MASK);
 		tmp |= (bcm5357_bcm43236_ndiv[spuravoid]) << BCMA_CC_PMU1_PLL0_PC2_NDIV_INT_SHIFT;
-		bcma_cc_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
+		bcma_pmu_write32(cc, BCMA_CC_PMU_PLLCTL_DATA, tmp);
 
 		tmp = BCMA_CC_PMU_CTL_PLL_UPD;
 		break;
@@ -646,7 +656,7 @@ void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid)
 		break;
 	}
 
-	tmp |= bcma_cc_read32(cc, BCMA_CC_PMU_CTL);
-	bcma_cc_write32(cc, BCMA_CC_PMU_CTL, tmp);
+	tmp |= bcma_pmu_read32(cc, BCMA_CC_PMU_CTL);
+	bcma_pmu_write32(cc, BCMA_CC_PMU_CTL, tmp);
 }
 EXPORT_SYMBOL_GPL(bcma_pmu_spuravoid_pllupdate);
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 96d8d56f240f..700d0c6f7480 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -217,6 +217,11 @@
 #define	 BCMA_CC_CLKDIV_JTAG_SHIFT	8
 #define	 BCMA_CC_CLKDIV_UART		0x000000FF
 #define BCMA_CC_CAP_EXT			0x00AC		/* Capabilities */
+#define  BCMA_CC_CAP_EXT_SECI_PRESENT	0x00000001
+#define  BCMA_CC_CAP_EXT_GSIO_PRESENT	0x00000002
+#define  BCMA_CC_CAP_EXT_GCI_PRESENT	0x00000004
+#define  BCMA_CC_CAP_EXT_SECI_PUART_PRESENT		0x00000008    /* UART present */
+#define  BCMA_CC_CAP_EXT_AOB_PRESENT	0x00000040
 #define BCMA_CC_PLLONDELAY		0x00B0		/* Rev >= 4 only */
 #define BCMA_CC_FREFSELDELAY		0x00B4		/* Rev >= 4 only */
 #define BCMA_CC_SLOWCLKCTL		0x00B8		/* 6 <= Rev <= 9 only */
@@ -566,6 +571,7 @@
  * Check availability with ((struct bcma_chipcommon)->capabilities & BCMA_CC_CAP_PMU)
  */
 struct bcma_chipcommon_pmu {
+	struct bcma_device *core;	/* Can be separated core or just ChipCommon one */
 	u8 rev;			/* PMU revision */
 	u32 crystalfreq;	/* The active crystal frequency (in kHz) */
 };
@@ -660,6 +666,19 @@ struct bcma_drv_cc_b {
 #define bcma_cc_maskset32(cc, offset, mask, set) \
 	bcma_cc_write32(cc, offset, (bcma_cc_read32(cc, offset) & (mask)) | (set))
 
+/* PMU registers access */
+#define bcma_pmu_read32(cc, offset) \
+	bcma_read32((cc)->pmu.core, offset)
+#define bcma_pmu_write32(cc, offset, val) \
+	bcma_write32((cc)->pmu.core, offset, val)
+
+#define bcma_pmu_mask32(cc, offset, mask) \
+	bcma_pmu_write32(cc, offset, bcma_pmu_read32(cc, offset) & (mask))
+#define bcma_pmu_set32(cc, offset, set) \
+	bcma_pmu_write32(cc, offset, bcma_pmu_read32(cc, offset) | (set))
+#define bcma_pmu_maskset32(cc, offset, mask, set) \
+	bcma_pmu_write32(cc, offset, (bcma_pmu_read32(cc, offset) & (mask)) | (set))
+
 extern u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks);
 
 extern u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc);
-- 
cgit v1.2.3


From 61dba73cdbba8ec5c01b31beaf9e2debc2d2f273 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sun, 24 Jan 2016 16:37:33 +0100
Subject: bcma: add support for BCM47094
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's another SoC with 32 GPIOs and simplified watchdog handling. It was
tested on D-Link DIR-885L.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/driver_chipcommon.c | 1 +
 drivers/bcma/driver_gpio.c       | 1 +
 include/linux/bcma/bcma.h        | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c
index bdb73d97da63..b0f44a2937b9 100644
--- a/drivers/bcma/driver_chipcommon.c
+++ b/drivers/bcma/driver_chipcommon.c
@@ -197,6 +197,7 @@ u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks)
 		struct bcma_bus *bus = cc->core->bus;
 
 		if (bus->chipinfo.id != BCMA_CHIP_ID_BCM4707 &&
+		    bus->chipinfo.id != BCMA_CHIP_ID_BCM47094 &&
 		    bus->chipinfo.id != BCMA_CHIP_ID_BCM53018)
 			bcma_core_set_clockmode(cc->core,
 						ticks ? BCMA_CLKMODE_FAST : BCMA_CLKMODE_DYNAMIC);
diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c
index 504899a72966..77b0738fbe1b 100644
--- a/drivers/bcma/driver_gpio.c
+++ b/drivers/bcma/driver_gpio.c
@@ -197,6 +197,7 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	case BCMA_CHIP_ID_BCM4707:
 	case BCMA_CHIP_ID_BCM5357:
 	case BCMA_CHIP_ID_BCM53572:
+	case BCMA_CHIP_ID_BCM47094:
 		chip->ngpio	= 32;
 		break;
 	default:
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 991ebb4c2015..0367c63f5960 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -201,6 +201,7 @@ struct bcma_host_ops {
 #define  BCMA_PKG_ID_BCM4707	1
 #define  BCMA_PKG_ID_BCM4708	2
 #define  BCMA_PKG_ID_BCM4709	0
+#define BCMA_CHIP_ID_BCM47094	53030
 #define BCMA_CHIP_ID_BCM53018	53018
 
 /* Board types (on PCI usually equals to the subsystem dev id) */
-- 
cgit v1.2.3


From e3e17b773bfe45462b7f3fae20c550025975cb13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Feb 2016 11:16:28 -0800
Subject: tcp: fastopen: call tcp_fin() if FIN present in SYNACK

When we acknowledge a FIN, it is not enough to ack the sequence number
and queue the skb into receive queue. We also have to call tcp_fin()
to properly update socket state and send proper poll() notifications.

It seems we also had the problem if we received a SYN packet with the
FIN flag set, but it does not seem an urgent issue, as no known
implementation can do that.

Fixes: 61d2bcae99f6 ("tcp: fastopen: accept data/FIN present in SYNACK message")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       | 1 +
 net/ipv4/tcp_fastopen.c | 3 +++
 net/ipv4/tcp_input.c    | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 27f4c733116d..479d535609fd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -568,6 +568,7 @@ void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
+void tcp_fin(struct sock *sk);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 6a6e11e54bae..fdb286ddba04 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -154,6 +154,9 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 	 * as we certainly are not changing upper 32bit value (0)
 	 */
 	tp->bytes_received = skb->len;
+
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		tcp_fin(sk);
 }
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4add3eb40e58..8194a250a01e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3995,7 +3995,7 @@ void tcp_reset(struct sock *sk)
  *
  *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
  */
-static void tcp_fin(struct sock *sk)
+void tcp_fin(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-- 
cgit v1.2.3


From 0e715d6fbd2a4a1dcd215d6d51091346e6a3d3fa Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 2 Feb 2016 18:09:11 +0100
Subject: vxlan: cleanup types

include/net/vxlan.h is a kernel header, no need to prefix fixed size types
with double underscore.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 0fb86442544b..5c64250619c5 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -30,15 +30,15 @@
  * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
  */
 struct vxlanhdr_gbp {
-	__u8	vx_flags;
+	u8	vx_flags;
 #ifdef __LITTLE_ENDIAN_BITFIELD
-	__u8	reserved_flags1:3,
+	u8	reserved_flags1:3,
 		policy_applied:1,
 		reserved_flags2:2,
 		dont_learn:1,
 		reserved_flags3:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
-	__u8	reserved_flags1:1,
+	u8	reserved_flags1:1,
 		dont_learn:1,
 		reserved_flags2:2,
 		policy_applied:1,
@@ -138,10 +138,10 @@ struct vxlan_config {
 	int			remote_ifindex;
 	int			mtu;
 	__be16			dst_port;
-	__u16			port_min;
-	__u16			port_max;
-	__u8			tos;
-	__u8			ttl;
+	u16			port_min;
+	u16			port_max;
+	u8			tos;
+	u8			ttl;
 	u32			flags;
 	unsigned long		age_interval;
 	unsigned int		addrmax;
-- 
cgit v1.2.3


From 427bc465bf9fcdab749f6997ff7a4eecaef4ca40 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 2 Feb 2016 18:09:12 +0100
Subject: vxlan: remove duplicated macros

VNI_HASH_BITS and VNI_HASH_SIZE are defined twice. Remove the extra
definitions.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 5c64250619c5..234bf1ef2737 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -9,9 +9,6 @@
 #include <linux/udp.h>
 #include <net/dst_metadata.h>
 
-#define VNI_HASH_BITS	10
-#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
-
 /*
  * VXLAN Group Based Policy Extension:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-- 
cgit v1.2.3


From 828788ac99d5de6bae10b333d1e8ddf25928ac12 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 2 Feb 2016 18:09:13 +0100
Subject: vxlan: restructure vxlan.h definitions

RCO and GBP are VXLAN extensions, not specified in RFC 7348. Because of
that, they need to be explicitly enabled when creating vxlan interface. By
default, those extensions are not used and plain VXLAN header is sent and
received.

Reflect this in vxlan.h: first, the plain VXLAN header is defined. Following
it, RCO is documented and defined, and likewise for GBP.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 104 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 234bf1ef2737..25bd919c9ef0 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -9,14 +9,71 @@
 #include <linux/udp.h>
 #include <net/dst_metadata.h>
 
+/* VXLAN protocol (RFC 7348) header:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|R|R|R|               Reserved                        |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                VXLAN Network Identifier (VNI) |   Reserved    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * I = VXLAN Network Identifier (VNI) present.
+ */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+/* VXLAN header flags. */
+#define VXLAN_HF_VNI BIT(27)
+
+#define VXLAN_N_VID     (1u << 24)
+#define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
+#define VXLAN_VNI_MASK  (VXLAN_VID_MASK << 8)
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+/* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X):
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|R|R|R|R|R|C|              Reserved                   |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |           VXLAN Network Identifier (VNI)      |O| Csum start  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * C = Remote checksum offload bit. When set indicates that the
+ *     remote checksum offload data is present.
+ *
+ * O = Offset bit. Indicates the checksum offset relative to
+ *     checksum start.
+ *
+ * Csum start = Checksum start divided by two.
+ *
+ * http://tools.ietf.org/html/draft-herbert-vxlan-rco
+ */
+
+/* VXLAN-RCO header flags. */
+#define VXLAN_HF_RCO BIT(21)
+
+/* Remote checksum offload header option */
+#define VXLAN_RCO_MASK  0x7f    /* Last byte of vni field */
+#define VXLAN_RCO_UDP   0x80    /* Indicate UDP RCO (TCP when not set *) */
+#define VXLAN_RCO_SHIFT 1       /* Left shift of start */
+#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1)
+#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT)
+
 /*
- * VXLAN Group Based Policy Extension:
+ * VXLAN Group Based Policy Extension (VXLAN_F_GBP):
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R|        Group Policy ID        |
+ * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R|        Group Policy ID        |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |                VXLAN Network Identifier (VNI) |   Reserved    |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
+ * G = Group Policy ID present.
+ *
  * D = Don't Learn bit. When set, this bit indicates that the egress
  *     VTEP MUST NOT learn the source address of the encapsulated frame.
  *
@@ -24,7 +81,7 @@
  *     this packet. Policies MUST NOT be applied by devices when the
  *     A bit is set.
  *
- * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
+ * https://tools.ietf.org/html/draft-smith-vxlan-group-policy
  */
 struct vxlanhdr_gbp {
 	u8	vx_flags;
@@ -47,6 +104,9 @@ struct vxlanhdr_gbp {
 	__be32	vx_vni;
 };
 
+/* VXLAN-GBP header flags. */
+#define VXLAN_HF_GBP BIT(31)
+
 #define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF)
 
 /* skb->mark mapping
@@ -59,44 +119,6 @@ struct vxlanhdr_gbp {
 #define VXLAN_GBP_POLICY_APPLIED	(BIT(3) << 16)
 #define VXLAN_GBP_ID_MASK		(0xFFFF)
 
-/* VXLAN protocol header:
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |G|R|R|R|I|R|R|C|               Reserved                        |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |                VXLAN Network Identifier (VNI) |   Reserved    |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * G = 1	Group Policy (VXLAN-GBP)
- * I = 1	VXLAN Network Identifier (VNI) present
- * C = 1	Remote checksum offload (RCO)
- */
-struct vxlanhdr {
-	__be32 vx_flags;
-	__be32 vx_vni;
-};
-
-/* VXLAN header flags. */
-#define VXLAN_HF_RCO BIT(21)
-#define VXLAN_HF_VNI BIT(27)
-#define VXLAN_HF_GBP BIT(31)
-
-/* Remote checksum offload header option */
-#define VXLAN_RCO_MASK  0x7f    /* Last byte of vni field */
-#define VXLAN_RCO_UDP   0x80    /* Indicate UDP RCO (TCP when not set *) */
-#define VXLAN_RCO_SHIFT 1       /* Left shift of start */
-#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1)
-#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT)
-
-#define VXLAN_N_VID     (1u << 24)
-#define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
-#define VXLAN_VNI_MASK  (VXLAN_VID_MASK << 8)
-#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
-
-#define VNI_HASH_BITS	10
-#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
-#define FDB_HASH_BITS	8
-#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
-
 struct vxlan_metadata {
 	u32		gbp;
 };
-- 
cgit v1.2.3


From 67eb03318bc5fe170ae832423fda7a23b0d801cf Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 2 Feb 2016 07:43:45 -0800
Subject: net: Add support for fill_slave_info to VRF device

Allows userspace to have direct access to VRF table association
versus looking up master device and its table.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c            | 21 +++++++++++++++++++++
 include/uapi/linux/if_link.h |  8 ++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 66addb7a7911..76e1fc9d8748 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -877,6 +877,24 @@ static int vrf_fillinfo(struct sk_buff *skb,
 	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
 }
 
+static size_t vrf_get_slave_size(const struct net_device *bond_dev,
+				 const struct net_device *slave_dev)
+{
+	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
+}
+
+static int vrf_fill_slave_info(struct sk_buff *skb,
+			       const struct net_device *vrf_dev,
+			       const struct net_device *slave_dev)
+{
+	struct net_vrf *vrf = netdev_priv(vrf_dev);
+
+	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
 	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
 };
@@ -890,6 +908,9 @@ static struct rtnl_link_ops vrf_link_ops __read_mostly = {
 	.validate	= vrf_validate,
 	.fill_info	= vrf_fillinfo,
 
+	.get_slave_size  = vrf_get_slave_size,
+	.fill_slave_info = vrf_fill_slave_info,
+
 	.newlink	= vrf_newlink,
 	.dellink	= vrf_dellink,
 	.setup		= vrf_setup,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d3e90b91e07e..d452cea59020 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -405,6 +405,14 @@ enum {
 
 #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
 
+enum {
+	IFLA_VRF_PORT_UNSPEC,
+	IFLA_VRF_PORT_TABLE,
+	__IFLA_VRF_PORT_MAX
+};
+
+#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From ddf1af6fa00e772fdb67a7d22cb83fac2b8968a8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:06 -0800
Subject: tcp: new delivery accounting

This patch changes the accounting of how many packets are
newly acked or sacked when the sender receives an ACK.

The current approach basically computes

   newly_acked_sacked = (prior_packets - prior_sacked) -
                        (tp->packets_out - tp->sacked_out)

   where prior_packets and prior_sacked are snapshots taken
   at the beginning of the ACK processing.

The new approach tracks the delivery information via a new
TCP state variable "delivered" which monotonically increases
as new packets are delivered in order or out-of-order.

The reason for this change is that the current approach is
brittle and produces negative or inaccurate estimates.

   1) For non-SACK connections, an ACK that advances the SND.UNA
   could reset the DUPACK counters (tp->sacked_out) in
   tcp_process_loss() or tcp_fastretrans_alert(). This inflates
   the inflight suddenly and causes under-estimate or even
   negative estimate. Here is a real example:

                   before   after (processing ACK)
   packets_out     75       73
   sacked_out      23        0
   ca state        Loss     Open

   The old approach computes (75-23) - (73 - 0) = -21 delivered
   while the new approach computes 1 delivered since it
   considers the 2nd-24th packets are delivered OOO.

   2) MSS change would re-count packets_out and sacked_out so
   the estimate is inaccurate and can even become negative.
   E.g., the inflight is doubled when MSS is halved.

   3) Spurious retransmission signaled by DSACK is not accounted for

The new approach is simpler and more robust. For SACK connections,
tp->delivered increments as packets are being acked or sacked in
SACK and ACK processing.

For non-sack connections, it's done in tcp_remove_reno_sacks() and
tcp_add_reno_sack(). When an ACK advances the SND.UNA, tp->delivered
is incremented by the number of packets ACKed (less the current
number of DUPACKs received plus one packet hole).  Upon receiving
a DUPACK, tp->delivered is incremented assuming one out-of-order
packet is delivered.

Upon receiving a DSACK, tp->delivered is incremented assuming one
retransmission is delivered in tcp_sacktag_write_queue().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  1 +
 net/ipv4/tcp_input.c | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361ba3e8..d909feeeaea2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -256,6 +256,7 @@ struct tcp_sock {
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
 	u32	prr_out;	/* Total number of pkts sent during Recovery. */
+	u32	delivered;	/* Total data packets delivered incl. rexmits */
 
  	u32	rcv_wnd;	/* Current receiver window		*/
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc810df53e90..2d690b3f0a7b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1214,6 +1214,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		sacked |= TCPCB_SACKED_ACKED;
 		state->flag |= FLAG_DATA_SACKED;
 		tp->sacked_out += pcount;
+		tp->delivered += pcount;  /* Out-of-order packets delivered */
 
 		fack_count += pcount;
 
@@ -1825,8 +1826,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 static void tcp_add_reno_sack(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_sacked = tp->sacked_out;
+
 	tp->sacked_out++;
 	tcp_check_reno_reordering(sk, 0);
+	if (tp->sacked_out > prior_sacked)
+		tp->delivered++; /* Some out-of-order packet is delivered */
 	tcp_verify_left_out(tp);
 }
 
@@ -1838,6 +1843,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
 
 	if (acked > 0) {
 		/* One ACK acked hole. The rest eat duplicate ACKs. */
+		tp->delivered += max_t(int, acked - tp->sacked_out, 1);
 		if (acked - 1 >= tp->sacked_out)
 			tp->sacked_out = 0;
 		else
@@ -3156,10 +3162,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				flag |= FLAG_ORIG_SACK_ACKED;
 		}
 
-		if (sacked & TCPCB_SACKED_ACKED)
+		if (sacked & TCPCB_SACKED_ACKED) {
 			tp->sacked_out -= acked_pcount;
-		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
-			tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+		} else if (tcp_is_sack(tp)) {
+			tp->delivered += acked_pcount;
+			if (!tcp_skb_spurious_retrans(tp, skb))
+				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
@@ -3541,9 +3550,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	bool is_dupack = false;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
-	const int prior_unsacked = tp->packets_out - tp->sacked_out;
+	u32 prior_delivered = tp->delivered;
 	int acked = 0; /* Number of packets newly acked */
-	int acked_sacked; /* Number of packets newly acked or sacked */
+	u32 acked_sacked; /* Number of packets newly acked or sacked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
 	sack_state.first_sackt.v64 = 0;
@@ -3645,7 +3654,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	acked_sacked = prior_unsacked - (tp->packets_out - tp->sacked_out);
+	acked_sacked = tp->delivered - prior_delivered;
 	/* Advance cwnd if state allows */
 	if (tcp_in_cwnd_reduction(sk)) {
 		/* Reduce cwnd if state mandates */
-- 
cgit v1.2.3


From 46fcc6ef9d39eb7b1becaa5ef5cba64d230f7c3f Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 2 Feb 2016 10:41:55 -0800
Subject: sunvnet: Add support for perf LDC event tracing

Add perf event macros to support tracing and instrumentation
of the LDC state machine.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/sunvnet.h | 139 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 include/trace/events/sunvnet.h

(limited to 'include')

diff --git a/include/trace/events/sunvnet.h b/include/trace/events/sunvnet.h
new file mode 100644
index 000000000000..eb080b267e55
--- /dev/null
+++ b/include/trace/events/sunvnet.h
@@ -0,0 +1,139 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sunvnet
+
+#if !defined(_TRACE_SUNVNET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SUNVNET_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vnet_rx_one,
+
+	TP_PROTO(int lsid, int rsid, int index, int needs_ack),
+
+	TP_ARGS(lsid, rsid, index, needs_ack),
+
+	TP_STRUCT__entry(
+		__field(int, lsid)
+		__field(int, rsid)
+		__field(int, index)
+		__field(int, needs_ack)
+	),
+
+	TP_fast_assign(
+		__entry->lsid = lsid;
+		__entry->rsid = rsid;
+		__entry->index = index;
+		__entry->needs_ack = needs_ack;
+	),
+
+	TP_printk("(%x:%x) walk_rx_one index %d; needs_ack %d",
+		__entry->lsid, __entry->rsid,
+		__entry->index, __entry->needs_ack)
+);
+
+DECLARE_EVENT_CLASS(vnet_tx_stopped_ack_template,
+
+	TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+
+	TP_ARGS(lsid, rsid, ack_end, npkts),
+
+	TP_STRUCT__entry(
+		__field(int, lsid)
+		__field(int, rsid)
+		__field(int, ack_end)
+		__field(int, npkts)
+	),
+
+	TP_fast_assign(
+		__entry->lsid = lsid;
+		__entry->rsid = rsid;
+		__entry->ack_end = ack_end;
+		__entry->npkts = npkts;
+	),
+
+	TP_printk("(%x:%x) stopped ack for %d; npkts %d",
+		__entry->lsid, __entry->rsid,
+		__entry->ack_end, __entry->npkts)
+);
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_send_stopped_ack,
+	     TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+	     TP_ARGS(lsid, rsid, ack_end, npkts));
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_defer_stopped_ack,
+	     TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+	     TP_ARGS(lsid, rsid, ack_end, npkts));
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_pending_stopped_ack,
+	     TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+	     TP_ARGS(lsid, rsid, ack_end, npkts));
+
+TRACE_EVENT(vnet_rx_stopped_ack,
+
+	TP_PROTO(int lsid, int rsid, int end),
+
+	TP_ARGS(lsid, rsid, end),
+
+	TP_STRUCT__entry(
+		__field(int, lsid)
+		__field(int, rsid)
+		__field(int, end)
+	),
+
+	TP_fast_assign(
+		__entry->lsid = lsid;
+		__entry->rsid = rsid;
+		__entry->end = end;
+	),
+
+	TP_printk("(%x:%x) stopped ack for index %d",
+		__entry->lsid, __entry->rsid, __entry->end)
+);
+
+TRACE_EVENT(vnet_tx_trigger,
+
+	TP_PROTO(int lsid, int rsid, int start, int err),
+
+	TP_ARGS(lsid, rsid, start, err),
+
+	TP_STRUCT__entry(
+		__field(int, lsid)
+		__field(int, rsid)
+		__field(int, start)
+		__field(int, err)
+	),
+
+	TP_fast_assign(
+		__entry->lsid = lsid;
+		__entry->rsid = rsid;
+		__entry->start = start;
+		__entry->err = err;
+	),
+
+	TP_printk("(%x:%x) Tx trigger for %d sent with err %d %s",
+		__entry->lsid, __entry->rsid, __entry->start,
+		__entry->err, __entry->err > 0 ? "(ok)" : " ")
+);
+
+TRACE_EVENT(vnet_skip_tx_trigger,
+
+	TP_PROTO(int lsid, int rsid, int last),
+
+	TP_ARGS(lsid, rsid, last),
+
+	TP_STRUCT__entry(
+		__field(int, lsid)
+		__field(int, rsid)
+		__field(int, last)
+	),
+
+	TP_fast_assign(
+		__entry->lsid = lsid;
+		__entry->rsid = rsid;
+		__entry->last = last;
+	),
+
+	TP_printk("(%x:%x) Skip Tx trigger. Last trigger sent was %d",
+		__entry->lsid, __entry->rsid, __entry->last)
+);
+#endif /* _TRACE_SUNVNET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 103a8ad1fa3b261c78dfc842cb315defe9d40be0 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 3 Feb 2016 04:04:36 +0100
Subject: ethtool: add speed/duplex validation functions

Add functions which check if the speed/duplex are defined.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 57fa39005e79..b2e180181629 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1319,11 +1319,45 @@ enum ethtool_sfeatures_retval_bits {
 
 #define SPEED_UNKNOWN		-1
 
+static inline int ethtool_validate_speed(__u32 speed)
+{
+	switch (speed) {
+	case SPEED_10:
+	case SPEED_100:
+	case SPEED_1000:
+	case SPEED_2500:
+	case SPEED_5000:
+	case SPEED_10000:
+	case SPEED_20000:
+	case SPEED_25000:
+	case SPEED_40000:
+	case SPEED_50000:
+	case SPEED_56000:
+	case SPEED_100000:
+	case SPEED_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Duplex, half or full. */
 #define DUPLEX_HALF		0x00
 #define DUPLEX_FULL		0x01
 #define DUPLEX_UNKNOWN		0xff
 
+static inline int ethtool_validate_duplex(__u8 duplex)
+{
+	switch (duplex) {
+	case DUPLEX_HALF:
+	case DUPLEX_FULL:
+	case DUPLEX_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Which connector port. */
 #define PORT_TP			0x00
 #define PORT_AUI		0x01
-- 
cgit v1.2.3


From 6fa251663069e05daadd1666cbf3b658bf840ea4 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:49 +0200
Subject: ipv4: Namespaceify tcp syn retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++---------
 net/ipv4/tcp.c             |  3 ++-
 net/ipv4/tcp_ipv4.c        |  2 ++
 net/ipv4/tcp_timer.c       |  4 ++--
 6 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2b7907a35568..b7b5bd64df35 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -98,6 +98,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_keepalive_probes;
 	int sysctl_tcp_keepalive_intvl;
 
+	int sysctl_tcp_syn_retries;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 479d535609fd..825485c7cc1a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_syn_retries;
 extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4d367b4139a3..ae9dd8823134 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -291,15 +291,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &ip_ttl_min,
 		.extra2		= &ip_ttl_max,
 	},
-	{
-		.procname	= "tcp_syn_retries",
-		.data		= &sysctl_tcp_syn_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &tcp_syn_retries_min,
-		.extra2		= &tcp_syn_retries_max
-	},
 	{
 		.procname	= "tcp_synack_retries",
 		.data		= &sysctl_tcp_synack_retries,
@@ -960,6 +951,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "tcp_syn_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_syn_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_syn_retries_min,
+		.extra2		= &tcp_syn_retries_max
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c5075779e017..3dbb3637bb4b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2731,6 +2731,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	int val, len;
 
 	if (get_user(len, optlen))
@@ -2765,7 +2766,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = keepalive_probes(tp);
 		break;
 	case TCP_SYNCNT:
-		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		break;
 	case TCP_LINGER2:
 		val = tp->linger2;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a4d523709ab3..f7464852aaa1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2388,6 +2388,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
+	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a4730a28b220..c5d51f530c65 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
@@ -157,6 +156,7 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	int retry_until;
 	bool do_reset, syn_set = false;
 
@@ -169,7 +169,7 @@ static int tcp_write_timeout(struct sock *sk)
 				NET_INC_STATS_BH(sock_net(sk),
 						 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		}
-		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		syn_set = true;
 	} else {
 		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
-- 
cgit v1.2.3


From 7c083ecb3ba4583a625d5ff9655d1a819e374493 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:50 +0200
Subject: ipv4: Namespaceify tcp synack retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h        |  1 +
 include/net/tcp.h               |  1 -
 net/ipv4/inet_connection_sock.c |  7 ++-----
 net/ipv4/sysctl_net_ipv4.c      | 14 +++++++-------
 net/ipv4/tcp_ipv4.c             |  1 +
 net/ipv4/tcp_timer.c            |  3 +--
 6 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b7b5bd64df35..9e83084ab8c1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -99,6 +99,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_keepalive_intvl;
 
 	int sysctl_tcp_syn_retries;
+	int sysctl_tcp_synack_retries;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 825485c7cc1a..05659e860039 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46b9c887bede..9b17c1792dce 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -482,10 +482,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 #define AF_INET_FAMILY(fam) true
 #endif
 
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 				  const int max_retries,
@@ -557,6 +553,7 @@ static void reqsk_timer_handler(unsigned long data)
 {
 	struct request_sock *req = (struct request_sock *)data;
 	struct sock *sk_listener = req->rsk_listener;
+	struct net *net = sock_net(sk_listener);
 	struct inet_connection_sock *icsk = inet_csk(sk_listener);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	int qlen, expire = 0, resend = 0;
@@ -566,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data)
 	if (sk_state_load(sk_listener) != TCP_LISTEN)
 		goto drop;
 
-	max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
 	thresh = max_retries;
 	/* Normally all the openreqs are young and become mature
 	 * (i.e. converted to established socket) for first timeout.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ae9dd8823134..bb682e36d8b7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -291,13 +291,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &ip_ttl_min,
 		.extra2		= &ip_ttl_max,
 	},
-	{
-		.procname	= "tcp_synack_retries",
-		.data		= &sysctl_tcp_synack_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_max_orphans",
 		.data		= &sysctl_tcp_max_orphans,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &tcp_syn_retries_min,
 		.extra2		= &tcp_syn_retries_max
 	},
+	{
+		.procname	= "tcp_synack_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_synack_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f7464852aaa1..3146279695b9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2389,6 +2389,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c5d51f530c65..ca25fdf0c525 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
@@ -332,7 +331,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int max_retries = icsk->icsk_syn_retries ? :
-	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	    sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
 	struct request_sock *req;
 
 	req = tcp_sk(sk)->fastopen_rsk;
-- 
cgit v1.2.3


From 12ed8244ed8b31b023ea6d2851fd8b15f2999e9b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:51 +0200
Subject: ipv4: Namespaceify tcp syncookies sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  1 -
 net/ipv4/syncookies.c      |  4 +---
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++---------
 net/ipv4/tcp_input.c       | 10 ++++++----
 net/ipv4/tcp_ipv4.c        |  3 ++-
 net/ipv4/tcp_minisocks.c   |  3 ---
 net/ipv6/syncookies.c      |  2 +-
 8 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 9e83084ab8c1..ac000fccdf0f 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -101,6 +101,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_syn_retries;
 	int sysctl_tcp_synack_retries;
 
+	int sysctl_tcp_syncookies;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 05659e860039..1fb23b70d237 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -243,7 +243,6 @@ extern int sysctl_tcp_fin_timeout;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
-extern int sysctl_tcp_syncookies;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49020..ba0dcffada3b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -19,8 +19,6 @@
 #include <net/tcp.h>
 #include <net/route.h>
 
-extern int sysctl_tcp_syncookies;
-
 static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 
 #define COOKIEBITS 24	/* Upper bits store count */
@@ -307,7 +305,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	__u8 rcv_wscale;
 	struct flowi4 fl4;
 
-	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bb682e36d8b7..d80142570a8d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -341,15 +341,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-#ifdef CONFIG_SYN_COOKIES
-	{
-		.procname	= "tcp_syncookies",
-		.data		= &sysctl_tcp_syncookies,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-#endif
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -960,6 +951,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_SYN_COOKIES
+	{
+		.procname	= "tcp_syncookies",
+		.data		= &init_net.ipv4.sysctl_tcp_syncookies,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
 	{ }
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 596c1cb6759a..b17aba42a368 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6163,9 +6163,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const char *msg = "Dropping request";
 	bool want_cookie = false;
+	struct net *net = sock_net(sk);
 
 #ifdef CONFIG_SYN_COOKIES
-	if (sysctl_tcp_syncookies) {
+	if (net->ipv4.sysctl_tcp_syncookies) {
 		msg = "Sending cookies";
 		want_cookie = true;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
@@ -6174,7 +6175,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 
 	if (!queue->synflood_warned &&
-	    sysctl_tcp_syncookies != 2 &&
+	    net->ipv4.sysctl_tcp_syncookies != 2 &&
 	    xchg(&queue->synflood_warned, 1) == 0)
 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -6207,6 +6208,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct sock *fastopen_sk = NULL;
 	struct dst_entry *dst = NULL;
 	struct request_sock *req;
@@ -6217,7 +6219,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	 * limitations, they conserve resources and peer is
 	 * evidently real one.
 	 */
-	if ((sysctl_tcp_syncookies == 2 ||
+	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
 	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
 		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
 		if (!want_cookie)
@@ -6283,7 +6285,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			}
 		}
 		/* Kill the following clause, if you dislike this way. */
-		else if (!sysctl_tcp_syncookies &&
+		else if (!net->ipv4.sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 !tcp_peer_is_proven(req, dst, false,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3146279695b9..98313d10a2e0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -860,7 +860,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * RFC2385 MD5 checksumming requires a mapping of
@@ -2391,6 +2390,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 
+	net->ipv4.sysctl_tcp_syncookies = 0;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 75632a925824..fadd8b978951 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -27,9 +27,6 @@
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 
-int sysctl_tcp_syncookies __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_syncookies);
-
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
 struct inet_timewait_death_row tcp_death_row = {
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 2906ef20795e..0e393ff7f5d0 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -148,7 +148,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	struct dst_entry *dst;
 	__u8 rcv_wscale;
 
-	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk))
-- 
cgit v1.2.3


From 1043e25ff96a1efc7bd34d11f5f32203a28a3bd7 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:52 +0200
Subject: ipv4: Namespaceify tcp reordering sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 +-
 include/net/tcp.h          |  4 +++-
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp.c             |  2 +-
 net/ipv4/tcp_input.c       | 12 ++++++------
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_metrics.c     |  3 ++-
 7 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index ac000fccdf0f..eb4cd0a3c296 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -100,8 +100,8 @@ struct netns_ipv4 {
 
 	int sysctl_tcp_syn_retries;
 	int sysctl_tcp_synack_retries;
-
 	int sysctl_tcp_syncookies;
+	int sysctl_tcp_reordering;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1fb23b70d237..7e9a147cabae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -961,9 +961,11 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
  */
 static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 {
+	struct net *net = sock_net((struct sock *)tp);
+
 	tp->do_early_retrans = sysctl_tcp_early_retrans &&
 		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-		sysctl_tcp_reordering == 3;
+		net->ipv4.sysctl_tcp_reordering == 3;
 }
 
 static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d80142570a8d..7cd20570588f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -455,13 +455,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "tcp_reordering",
-		.data		= &sysctl_tcp_reordering,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_max_reordering",
 		.data		= &sysctl_tcp_max_reordering,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #endif
+	{
+		.procname	= "tcp_reordering",
+		.data		= &init_net.ipv4.sysctl_tcp_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3dbb3637bb4b..f4db6b04cdb4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 	u64_stats_init(&tp->syncp);
 
-	tp->reordering = sysctl_tcp_reordering;
+	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b17aba42a368..5ee6fe0d152d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
 int sysctl_tcp_max_reordering __read_mostly = 300;
-EXPORT_SYMBOL(sysctl_tcp_reordering);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -1883,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
@@ -1933,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk)
 	 * suggests that the degree of reordering is over-estimated.
 	 */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
-	    tp->sacked_out >= sysctl_tcp_reordering)
+	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
 		tp->reordering = min_t(unsigned int, tp->reordering,
-				       sysctl_tcp_reordering);
+				       net->ipv4.sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	tcp_ecn_queue_cwr(tp);
@@ -2119,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
+	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
@@ -2133,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	 */
 	packets_out = tp->packets_out;
 	if (packets_out <= tp->reordering &&
-	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
 	    !tcp_may_send_now(sk)) {
 		/* We have nothing to send. This connection is limited
 		 * either by receiver window or by application.
@@ -3317,7 +3317,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 	 * new SACK or ECE mark may first advance cwnd here and later reduce
 	 * cwnd in tcp_fastretrans_alert() based on more states.
 	 */
-	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
 		return flag & FLAG_FORWARD_PROGRESS;
 
 	return flag & FLAG_DATA_ACKED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 98313d10a2e0..10dfc8b5c0f8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2389,8 +2389,8 @@ static int __net_init tcp_sk_init(struct net *net)
 
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
-
 	net->ipv4.sysctl_tcp_syncookies = 0;
+	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c8cbc2b4b792..c26241f3057b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct tcp_metrics_block *tm;
 	unsigned long rtt;
 	u32 val;
@@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk)
 		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
 			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 			if (val < tp->reordering &&
-			    tp->reordering != sysctl_tcp_reordering)
+			    tp->reordering != net->ipv4.sysctl_tcp_reordering)
 				tcp_metric_set(tm, TCP_METRIC_REORDERING,
 					       tp->reordering);
 		}
-- 
cgit v1.2.3


From ae5c3f406cffe15ffd2aa544961b7cd027468d46 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:53 +0200
Subject: ipv4: Namespaceify tcp_retries1 sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 16 ++++++++--------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_timer.c       |  8 ++++----
 5 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index eb4cd0a3c296..dee6ba647461 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -102,6 +102,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_synack_retries;
 	int sysctl_tcp_syncookies;
 	int sysctl_tcp_reordering;
+	int sysctl_tcp_retries1;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7e9a147cabae..da96b9af3e5f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7cd20570588f..52853c6dc929 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,14 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_retries1",
-		.data		= &sysctl_tcp_retries1,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra2		= &tcp_retr1_max
-	},
 	{
 		.procname	= "tcp_retries2",
 		.data		= &sysctl_tcp_retries2,
@@ -960,6 +952,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_retries1",
+		.data		= &init_net.ipv4.sysctl_tcp_retries1,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra2		= &tcp_retr1_max
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 10dfc8b5c0f8..57fe3c6bfb30 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2391,6 +2391,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 	net->ipv4.sysctl_tcp_syncookies = 0;
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ca25fdf0c525..6694e33149b9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
@@ -171,7 +170,7 @@ static int tcp_write_timeout(struct sock *sk)
 		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		syn_set = true;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
 			/* Some middle-boxes may black-hole Fast Open _after_
 			 * the handshake. Therefore we conservatively disable
 			 * Fast Open on this path on recurring timeouts with
@@ -180,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
 			if (tp->syn_data_acked &&
 			    tp->bytes_acked <= tp->rx_opt.mss_clamp) {
 				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-				if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
 					NET_INC_STATS_BH(sock_net(sk),
 							 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 			}
@@ -359,6 +358,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (tp->fastopen_rsk) {
@@ -489,7 +489,7 @@ out_reset_timer:
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))
 		__sk_dst_reset(sk);
 
 out:;
-- 
cgit v1.2.3


From c6214a97c86c660de4f7ddb8eed925192e646161 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:54 +0200
Subject: ipv4: Namespaceify tcp_retries2 sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      |  3 ++-
 net/ipv4/tcp_timer.c       |  5 ++---
 6 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index dee6ba647461..d92c8e5d0fbc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_syncookies;
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_retries1;
+	int sysctl_tcp_retries2;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index da96b9af3e5f..a786cfa6301b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 52853c6dc929..8e339d43619c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,13 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_retries2",
-		.data		= &sysctl_tcp_retries2,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fin_timeout",
 		.data		= &sysctl_tcp_fin_timeout,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra2		= &tcp_retr1_max
 	},
+	{
+		.procname	= "tcp_retries2",
+		.data		= &init_net.ipv4.sysctl_tcp_retries2,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 57fe3c6bfb30..0710e6108a5e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2392,6 +2392,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_syncookies = 0;
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fda379cd600d..7beb3f688b7a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3476,6 +3476,7 @@ void tcp_send_probe0(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	unsigned long probe_max;
 	int err;
 
@@ -3489,7 +3490,7 @@ void tcp_send_probe0(struct sock *sk)
 	}
 
 	if (err <= 0) {
-		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
 			icsk->icsk_backoff++;
 		icsk->icsk_probes_out++;
 		probe_max = TCP_RTO_MAX;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6694e33149b9..09f4e0297e56 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
@@ -189,7 +188,7 @@ static int tcp_write_timeout(struct sock *sk)
 			dst_negative_advice(sk);
 		}
 
-		retry_until = sysctl_tcp_retries2;
+		retry_until = net->ipv4.sysctl_tcp_retries2;
 		if (sock_flag(sk, SOCK_DEAD)) {
 			const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
 
@@ -303,7 +302,7 @@ static void tcp_probe_timer(struct sock *sk)
 		 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
 		goto abort;
 
-	max_probes = sysctl_tcp_retries2;
+	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 
-- 
cgit v1.2.3


From c402d9beffb6141ab2e4d2ad8be71128803a28ca Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:55 +0200
Subject: ipv4: Namespaceify tcp_orphan_retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_timer.c       |  3 +--
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d92c8e5d0fbc..080230321985 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -104,6 +104,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_retries1;
 	int sysctl_tcp_retries2;
+	int sysctl_tcp_orphan_retries;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a786cfa6301b..71f840b89c76 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8e339d43619c..b7af6336985f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -419,13 +419,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "tcp_orphan_retries",
-		.data		= &sysctl_tcp_orphan_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fack",
 		.data		= &sysctl_tcp_fack,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_orphan_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_orphan_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0710e6108a5e..1240dd62eee1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2393,6 +2393,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+	net->ipv4.sysctl_tcp_orphan_retries = 0;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 09f4e0297e56..49bc474f8e35 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
 static void tcp_write_err(struct sock *sk)
@@ -78,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 /* Calculate maximal number or retries on an orphaned socket. */
 static int tcp_orphan_retries(struct sock *sk, bool alive)
 {
-	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+	int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
 
 	/* We know from an ICMP that something is wrong. */
 	if (sk->sk_err_soft && !alive)
-- 
cgit v1.2.3


From 1e579caa18b96f9eb18f4f5416658cd15f37c062 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:56 +0200
Subject: ipv4: Namespaceify tcp_fin_timeout sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  3 +--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp.c             |  7 +++----
 net/ipv4/tcp_ipv4.c        |  1 +
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 080230321985..de5ff4385e84 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -105,6 +105,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_retries1;
 	int sysctl_tcp_retries2;
 	int sysctl_tcp_orphan_retries;
+	int sysctl_tcp_fin_timeout;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 71f840b89c76..3f160c2e6960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -239,7 +239,6 @@ extern struct inet_timewait_death_row tcp_death_row;
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
-extern int sysctl_tcp_fin_timeout;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
@@ -1249,7 +1248,7 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
 
 static inline int tcp_fin_time(const struct sock *sk)
 {
-	int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout;
+	int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
 	const int rto = inet_csk(sk)->icsk_rto;
 
 	if (fin_timeout < (rto << 2) - (rto >> 1))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b7af6336985f..8bd335a2cba8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,13 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_fin_timeout",
-		.data		= &sysctl_tcp_fin_timeout,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &init_net.ipv4.sysctl_tcp_fin_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f4db6b04cdb4..014f18e2f7b3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,8 +282,6 @@
 #include <asm/unaligned.h>
 #include <net/busy_poll.h>
 
-int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
-
 int sysctl_tcp_min_tso_segs __read_mostly = 2;
 
 int sysctl_tcp_autocorking __read_mostly = 1;
@@ -2330,6 +2328,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
 	int val;
 	int err = 0;
 
@@ -2526,7 +2525,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_LINGER2:
 		if (val < 0)
 			tp->linger2 = -1;
-		else if (val > sysctl_tcp_fin_timeout / HZ)
+		else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
 			tp->linger2 = 0;
 		else
 			tp->linger2 = val * HZ;
@@ -2771,7 +2770,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_LINGER2:
 		val = tp->linger2;
 		if (val >= 0)
-			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
 		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1240dd62eee1..36c83c28d9c9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2394,6 +2394,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
+	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 
 	return 0;
 fail:
-- 
cgit v1.2.3


From 4979f2d9f7262b9b180bc83de8d70f7a7721c085 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:57 +0200
Subject: ipv4: Namespaceify tcp_notsent_lowat sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  4 ++--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      |  3 ---
 5 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index de5ff4385e84..4d6ec3f6fafe 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -106,6 +106,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_retries2;
 	int sysctl_tcp_orphan_retries;
 	int sysctl_tcp_fin_timeout;
+	unsigned int sysctl_tcp_notsent_lowat;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3f160c2e6960..9b2cb0c8d876 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -267,7 +267,6 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
-extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_min_rtt_wlen;
 extern int sysctl_tcp_autocorking;
@@ -1682,7 +1681,8 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
 
 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 {
-	return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat;
+	struct net *net = sock_net((struct sock *)tp);
+	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
 }
 
 static inline bool tcp_stream_memory_free(const struct sock *sk)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8bd335a2cba8..44bb59824267 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -455,13 +455,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
-	{
-		.procname	= "tcp_notsent_lowat",
-		.data		= &sysctl_tcp_notsent_lowat,
-		.maxlen		= sizeof(sysctl_tcp_notsent_lowat),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "tcp_rmem",
 		.data		= &sysctl_tcp_rmem,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "tcp_notsent_lowat",
+		.data		= &init_net.ipv4.sysctl_tcp_notsent_lowat,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 36c83c28d9c9..11ae706f53a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,6 +2395,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7beb3f688b7a..7d2c7a400456 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
-unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
-EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
-
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
-- 
cgit v1.2.3


From 157ede6784ba2837c7dc43f195418c75927f8488 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Wed, 3 Feb 2016 09:57:04 +0100
Subject: bridge: mdb: add support for offloaded mdb entries

Add a new bitmask member 'flags' to the br_mdb_entry structure, and add the
MDB_FLAGS_OFFLOAD bit, which indicates that an MDB entry is offloaded to
hardware.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 18db14477bdd..ec3547234998 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -183,6 +183,8 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
+#define MDB_FLAGS_OFFLOAD	(1 << 0)
+	__u8 flags;
 	__u16 vid;
 	struct {
 		union {
-- 
cgit v1.2.3


From 5ee14e6d336f1daacf5ba73e831029c5ab7ae329 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 3 Feb 2016 13:17:01 +0100
Subject: bonding: 3ad: apply ad_actor settings changes immediately

Currently bonding allows ad_actor_system and ad_actor_sys_prio to be set while
the bond device is down, but the new values are actually applied only if there
aren't any slaves yet (they are applied to the bond device when the first slave
shows up, and to the slaves at 3ad bind time). After this patch, changes are
applied immediately and the new values can be used/seen once the bond is
brought up, so it is no longer necessary to release all slaves and enslave
them again to see the changes.

CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_3ad.c     | 40 +++++++++++++++++++++++++++++++++++---
 drivers/net/bonding/bond_options.c |  4 ++++
 include/net/bond_3ad.h             |  1 +
 3 files changed, 42 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 4cbb8b27a891..ee94056dbb2e 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -357,6 +357,14 @@ static u8 __get_duplex(struct port *port)
 	return retval;
 }
 
+static void __ad_actor_update_port(struct port *port)
+{
+	const struct bonding *bond = bond_get_bond_by_slave(port->slave);
+
+	port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr;
+	port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority;
+}
+
 /* Conversions */
 
 /**
@@ -1963,9 +1971,7 @@ void bond_3ad_bind_slave(struct slave *slave)
 		port->actor_admin_port_key = bond->params.ad_user_port_key << 6;
 		ad_update_actor_keys(port, false);
 		/* actor system is the bond's system */
-		port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr;
-		port->actor_system_priority =
-		    BOND_AD_INFO(bond).system.sys_priority;
+		__ad_actor_update_port(port);
 		/* tx timer(to verify that no more than MAX_TX_IN_SECOND
 		 * lacpdu's are sent in one second)
 		 */
@@ -2147,6 +2153,34 @@ out:
 	spin_unlock_bh(&bond->mode_lock);
 }
 
+/**
+ * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports
+ * @bond: bonding struct to work on
+ *
+ * If an ad_actor setting gets changed we need to update the individual port
+ * settings so the bond device will use the new values when it gets upped.
+ */
+void bond_3ad_update_ad_actor_settings(struct bonding *bond)
+{
+	struct list_head *iter;
+	struct slave *slave;
+
+	ASSERT_RTNL();
+
+	BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio;
+	if (is_zero_ether_addr(bond->params.ad_actor_system))
+		BOND_AD_INFO(bond).system.sys_mac_addr =
+		    *((struct mac_addr *)bond->dev->dev_addr);
+	else
+		BOND_AD_INFO(bond).system.sys_mac_addr =
+		    *((struct mac_addr *)bond->params.ad_actor_system);
+
+	spin_lock_bh(&bond->mode_lock);
+	bond_for_each_slave(bond, slave, iter)
+		__ad_actor_update_port(&(SLAVE_AD_INFO(slave)->port));
+	spin_unlock_bh(&bond->mode_lock);
+}
+
 /**
  * bond_3ad_state_machine_handler - handle state machines timeout
  * @bond: bonding struct to work on
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 55e93b6b6d21..ed0bdae64f5e 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1392,6 +1392,8 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond,
 		    newval->value);
 
 	bond->params.ad_actor_sys_prio = newval->value;
+	bond_3ad_update_ad_actor_settings(bond);
+
 	return 0;
 }
 
@@ -1418,6 +1420,8 @@ static int bond_option_ad_actor_system_set(struct bonding *bond,
 
 	netdev_info(bond->dev, "Setting ad_actor_system to %pM\n", mac);
 	ether_addr_copy(bond->params.ad_actor_system, mac);
+	bond_3ad_update_ad_actor_settings(bond);
+
 	return 0;
 
 err:
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index f1fbc3b11962..f358ad5e4214 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -306,5 +306,6 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
 			 struct slave *slave);
 int bond_3ad_set_carrier(struct bonding *bond);
 void bond_3ad_update_lacp_rate(struct bonding *bond);
+void bond_3ad_update_ad_actor_settings(struct bonding *bond);
 #endif /* _NET_BOND_3AD_H */
 
-- 
cgit v1.2.3


From 086c653f5862591a9cfe2386f5650d03adacc33a Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:35 -0500
Subject: sock: struct proto hash function may error

In order to support fast reuseport lookups in TCP, the hash function
defined in struct proto must be capable of returning an error code.
This patch changes the function signature of all related hash functions
to return an integer and handles or propagates this return value at
all call sites.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h   |  2 +-
 include/net/phonet/phonet.h     |  2 +-
 include/net/ping.h              |  2 +-
 include/net/raw.h               |  2 +-
 include/net/sock.h              |  6 +++---
 include/net/udp.h               |  3 ++-
 net/ieee802154/socket.c         | 17 +++++++++++++----
 net/ipv4/af_inet.c              |  9 ++++++---
 net/ipv4/inet_connection_sock.c |  8 +++++---
 net/ipv4/inet_hashtables.c      |  4 +++-
 net/ipv4/ping.c                 |  4 +++-
 net/ipv4/raw.c                  |  4 +++-
 net/ipv6/af_inet6.c             |  6 +++++-
 net/phonet/socket.c             |  6 ++++--
 net/sctp/socket.c               |  3 ++-
 15 files changed, 53 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index de2e3ade6102..554440e7f83d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -208,7 +208,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
 void __inet_hash(struct sock *sk, struct sock *osk);
-void inet_hash(struct sock *sk);
+int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index 68e509750caa..039cc29cb4a8 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -51,7 +51,7 @@ void pn_sock_init(void);
 struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa);
 void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb);
 void phonet_get_local_port_range(int *min, int *max);
-void pn_sock_hash(struct sock *sk);
+int pn_sock_hash(struct sock *sk);
 void pn_sock_unhash(struct sock *sk);
 int pn_sock_get_port(struct sock *sk, unsigned short sport);
 
diff --git a/include/net/ping.h b/include/net/ping.h
index ac80cb45e630..5fd7cc244833 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -65,7 +65,7 @@ struct pingfakehdr {
 };
 
 int  ping_get_port(struct sock *sk, unsigned short ident);
-void ping_hash(struct sock *sk);
+int ping_hash(struct sock *sk);
 void ping_unhash(struct sock *sk);
 
 int  ping_init_sock(struct sock *sk);
diff --git a/include/net/raw.h b/include/net/raw.h
index 6a40c6562dd2..3e789008394d 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -57,7 +57,7 @@ int raw_seq_open(struct inode *ino, struct file *file,
 
 #endif
 
-void raw_hash_sk(struct sock *sk);
+int raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
 
 struct raw_sock {
diff --git a/include/net/sock.h b/include/net/sock.h
index f5ea148853e2..255d3e03727b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -984,7 +984,7 @@ struct proto {
 	void		(*release_cb)(struct sock *sk);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
-	void			(*hash)(struct sock *sk);
+	int			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
@@ -1194,10 +1194,10 @@ static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
 /* With per-bucket locks this operation is not-atomic, so that
  * this version is not worse.
  */
-static inline void __sk_prot_rehash(struct sock *sk)
+static inline int __sk_prot_rehash(struct sock *sk)
 {
 	sk->sk_prot->unhash(sk);
-	sk->sk_prot->hash(sk);
+	return sk->sk_prot->hash(sk);
 }
 
 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
diff --git a/include/net/udp.h b/include/net/udp.h
index 2842541e28e7..92927f729ac8 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -177,9 +177,10 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 }
 
 /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
-static inline void udp_lib_hash(struct sock *sk)
+static inline int udp_lib_hash(struct sock *sk)
 {
 	BUG();
+	return 0;
 }
 
 void udp_lib_unhash(struct sock *sk);
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a548be247e15..e0bd013a1e5e 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
 static HLIST_HEAD(raw_head);
 static DEFINE_RWLOCK(raw_lock);
 
-static void raw_hash(struct sock *sk)
+static int raw_hash(struct sock *sk)
 {
 	write_lock_bh(&raw_lock);
 	sk_add_node(sk, &raw_head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&raw_lock);
+
+	return 0;
 }
 
 static void raw_unhash(struct sock *sk)
@@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk)
 	return container_of(sk, struct dgram_sock, sk);
 }
 
-static void dgram_hash(struct sock *sk)
+static int dgram_hash(struct sock *sk)
 {
 	write_lock_bh(&dgram_lock);
 	sk_add_node(sk, &dgram_head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&dgram_lock);
+
+	return 0;
 }
 
 static void dgram_unhash(struct sock *sk)
@@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock,
 	/* Checksums on by default */
 	sock_set_flag(sk, SOCK_ZAPPED);
 
-	if (sk->sk_prot->hash)
-		sk->sk_prot->hash(sk);
+	if (sk->sk_prot->hash) {
+		rc = sk->sk_prot->hash(sk);
+		if (rc) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 
 	if (sk->sk_prot->init) {
 		rc = sk->sk_prot->init(sk);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5c5db6636704..eade66db214e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -370,7 +370,11 @@ lookup_protocol:
 		 */
 		inet->inet_sport = htons(inet->inet_num);
 		/* Add to protocol hash chains. */
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
 	}
 
 	if (sk->sk_prot->init) {
@@ -1142,8 +1146,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	 * Besides that, it does not check for connection
 	 * uniqueness. Wait for troubles.
 	 */
-	__sk_prot_rehash(sk);
-	return 0;
+	return __sk_prot_rehash(sk);
 }
 
 int inet_sk_rebuild_header(struct sock *sk)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 9b17c1792dce..12c8d389dc18 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -734,6 +734,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
+	int err = -EADDRINUSE;
 
 	reqsk_queue_alloc(&icsk->icsk_accept_queue);
 
@@ -751,13 +752,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 		inet->inet_sport = htons(inet->inet_num);
 
 		sk_dst_reset(sk);
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
 
-		return 0;
+		if (likely(!err))
+			return 0;
 	}
 
 	sk->sk_state = TCP_CLOSE;
-	return -EADDRINUSE;
+	return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980797fc..b6023b7baae0 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -468,13 +468,15 @@ void __inet_hash(struct sock *sk, struct sock *osk)
 }
 EXPORT_SYMBOL(__inet_hash);
 
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
 		__inet_hash(sk, NULL);
 		local_bh_enable();
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index c117b21b937d..f6f93fc2c61f 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -145,10 +145,12 @@ fail:
 }
 EXPORT_SYMBOL_GPL(ping_get_port);
 
-void ping_hash(struct sock *sk)
+int ping_hash(struct sock *sk)
 {
 	pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
 	BUG(); /* "Please do not press this button again." */
+
+	return 0;
 }
 
 void ping_unhash(struct sock *sk)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bc35f1842512..d6352515d738 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
 };
 
-void raw_hash_sk(struct sock *sk)
+int raw_hash_sk(struct sock *sk)
 {
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 	struct hlist_head *head;
@@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk)
 	sk_add_node(sk, head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&h->lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(raw_hash_sk);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9f5137cd604e..b11c37cfd67c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -235,7 +235,11 @@ lookup_protocol:
 		 * creation time automatically shares.
 		 */
 		inet->inet_sport = htons(inet->inet_num);
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
 	}
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index d575ef4e9aa6..ffd5f2297584 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
-void pn_sock_hash(struct sock *sk)
+int pn_sock_hash(struct sock *sk)
 {
 	struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject);
 
 	mutex_lock(&pnsocks.lock);
 	sk_add_node_rcu(sk, hlist);
 	mutex_unlock(&pnsocks.lock);
+
+	return 0;
 }
 EXPORT_SYMBOL(pn_sock_hash);
 
@@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 	pn->resource = spn->spn_resource;
 
 	/* Enable RX on the socket */
-	sk->sk_prot->hash(sk);
+	err = sk->sk_prot->hash(sk);
 out_port:
 	mutex_unlock(&port_mutex);
 out:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5ca2ebfe0be8..6427b9d1197e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6101,9 +6101,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	return retval;
 }
 
-static void sctp_hash(struct sock *sk)
+static int sctp_hash(struct sock *sk)
 {
 	/* STUB */
+	return 0;
 }
 
 static void sctp_unhash(struct sock *sk)
-- 
cgit v1.2.3


From 496611d7b5eaf59c03440c8f2def1d9988ad2459 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:36 -0500
Subject: inet: create IPv6-equivalent inet_hash function

In order to support fast lookups for TCP sockets with SO_REUSEPORT,
the function that adds sockets to the listening hash set needs
to be able to check receive address equality.  Since this equality
check is different for IPv4 and IPv6, we will need two different
socket hashing functions.

This patch adds inet6_hash, identical to the existing inet_hash function,
and updates the appropriate references.  A following patch will
differentiate the two by passing different comparison functions to
__inet_hash.

Additionally, in order to use the IPv6 address equality function from
inet6_hashtables (which is compiled as a built-in object when IPv6 is
enabled), it needs to be in a built-in object file as well.  This
moves ipv6_rcv_saddr_equal into inet6_hashtables to accomplish this.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h |  2 ++
 net/dccp/ipv6.c                |  2 +-
 net/ipv6/inet6_hashtables.c    | 56 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c            |  2 +-
 net/ipv6/udp.c                 | 44 +--------------------------------
 net/l2tp/l2tp_ip6.c            |  3 ++-
 6 files changed, 63 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 7ff588ca6817..b3c28a9dfbf1 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -96,6 +96,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif);
+
+int inet6_hash(struct sock *sk);
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9c6d0508e63a..90a8269b28d0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = {
 	.sendmsg	   = dccp_sendmsg,
 	.recvmsg	   = dccp_recvmsg,
 	.backlog_rcv	   = dccp_v6_do_rcv,
-	.hash		   = inet_hash,
+	.hash		   = inet6_hash,
 	.unhash		   = inet_unhash,
 	.accept		   = inet_csk_accept,
 	.get_port	   = inet_csk_get_port,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 21ace5a2bf7c..072653dd9c98 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -274,3 +274,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 				   __inet6_check_established);
 }
 EXPORT_SYMBOL_GPL(inet6_hash_connect);
+
+int inet6_hash(struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		__inet_hash(sk, NULL);
+		local_bh_enable();
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_hash);
+
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *                          only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *                          and 0.0.0.0 equals to 0.0.0.0 only
+ */
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard)
+{
+	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+	int sk2_ipv6only = inet_v6_ipv6only(sk2);
+	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+		if (!sk2_ipv6only) {
+			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+				return 1;
+			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+				return match_wildcard;
+		}
+		return 0;
+	}
+
+	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+		return 1;
+
+	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (sk2_rcv_saddr6 &&
+	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 006396e31cb0..d72bcfb326d8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1865,7 +1865,7 @@ struct proto tcpv6_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
-	.hash			= inet_hash,
+	.hash			= inet6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 22e28a44e3c8..ac4e7e03dded 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 
+#include <net/addrconf.h>
 #include <net/ndisc.h>
 #include <net/protocol.h>
 #include <net/transp_v6.h>
@@ -77,49 +78,6 @@ static u32 udp6_ehashfn(const struct net *net,
 			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
- */
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
-	/* if both are mapped, treat as IPv4 */
-	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
-		if (!sk2_ipv6only) {
-			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
-				return 1;
-			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
-				return match_wildcard;
-		}
-		return 0;
-	}
-
-	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
-		return 1;
-
-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
-	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
-		return 1;
-
-	return 0;
-}
-
 static u32 udp6_portaddr_hash(const struct net *net,
 			      const struct in6_addr *addr6,
 			      unsigned int port)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index a2c8747d2936..6b54ff3ff4cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -25,6 +25,7 @@
 #include <net/udp.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/tcp_states.h>
 #include <net/protocol.h>
 #include <net/xfrm.h>
@@ -718,7 +719,7 @@ static struct proto l2tp_ip6_prot = {
 	.sendmsg	   = l2tp_ip6_sendmsg,
 	.recvmsg	   = l2tp_ip6_recvmsg,
 	.backlog_rcv	   = l2tp_ip6_backlog_recv,
-	.hash		   = inet_hash,
+	.hash		   = inet6_hash,
 	.unhash		   = inet_unhash,
 	.obj_size	   = sizeof(struct l2tp_ip6_sock),
 #ifdef CONFIG_COMPAT
-- 
cgit v1.2.3


From d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:37 -0500
Subject: tcp: __tcp_hdrlen() helper

tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr.
This splits the size calculation into a helper function that can be
used if a struct tcphdr is already available.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d909feeeaea2..bcbf51da4e1e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
 	return (struct tcphdr *)skb_transport_header(skb);
 }
 
+static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
+{
+	return th->doff * 4;
+}
+
 static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
 {
-	return tcp_hdr(skb)->doff * 4;
+	return __tcp_hdrlen(tcp_hdr(skb));
 }
 
 static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
-- 
cgit v1.2.3


From a583636a83ea383fd07517e5a7a2eedbc5d90fb1 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:38 -0500
Subject: inet: refactor inet[6]_lookup functions to take skb

This is a preliminary step to allow fast socket lookup of SO_REUSEPORT
groups.  Doing so with a BPF filter will require access to the
skb in question.  This change plumbs the skb (and offset to payload
data) through the call stack to the listening socket lookup
implementations where it will be used in a following patch.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h         |  2 ++
 include/net/inet6_hashtables.h | 11 +++++++----
 include/net/inet_hashtables.h  | 18 ++++++++++++------
 net/dccp/ipv4.c                |  2 +-
 net/dccp/ipv6.c                |  2 +-
 net/ipv4/inet_diag.c           |  6 +++---
 net/ipv4/inet_hashtables.c     |  1 +
 net/ipv4/tcp_ipv4.c            | 10 ++++++----
 net/ipv6/inet6_hashtables.c    |  8 ++++++--
 net/ipv6/tcp_ipv6.c            |  8 +++++---
 net/netfilter/xt_TPROXY.c      | 31 ++++++++++++++++++++-----------
 net/netfilter/xt_socket.c      | 28 +++++++++++++++++++++-------
 12 files changed, 85 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 47f52d3cd8df..730d856683e5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -87,6 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
 		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
+int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard);
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			 bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index b3c28a9dfbf1..28332bdac333 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net,
 
 struct sock *inet6_lookup_listener(struct net *net,
 				   struct inet_hashinfo *hashinfo,
+				   struct sk_buff *skb, int doff,
 				   const struct in6_addr *saddr,
 				   const __be16 sport,
 				   const struct in6_addr *daddr,
@@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 
 static inline struct sock *__inet6_lookup(struct net *net,
 					  struct inet_hashinfo *hashinfo,
+					  struct sk_buff *skb, int doff,
 					  const struct in6_addr *saddr,
 					  const __be16 sport,
 					  const struct in6_addr *daddr,
@@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net,
 	if (sk)
 		return sk;
 
-	return inet6_lookup_listener(net, hashinfo, saddr, sport,
+	return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
 				     daddr, hnum, dif);
 }
 
 static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
-					      struct sk_buff *skb,
+					      struct sk_buff *skb, int doff,
 					      const __be16 sport,
 					      const __be16 dport,
 					      int iif)
@@ -86,13 +88,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
 	if (sk)
 		return sk;
 
-	return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
-			      &ipv6_hdr(skb)->saddr, sport,
+	return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+			      doff, &ipv6_hdr(skb)->saddr, sport,
 			      &ipv6_hdr(skb)->daddr, ntohs(dport),
 			      iif);
 }
 
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+			  struct sk_buff *skb, int doff,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 554440e7f83d..82403390af58 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -213,6 +213,7 @@ void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
+				    struct sk_buff *skb, int doff,
 				    const __be32 saddr, const __be16 sport,
 				    const __be32 daddr,
 				    const unsigned short hnum,
@@ -220,10 +221,11 @@ struct sock *__inet_lookup_listener(struct net *net,
 
 static inline struct sock *inet_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo,
+		struct sk_buff *skb, int doff,
 		__be32 saddr, __be16 sport,
 		__be32 daddr, __be16 dport, int dif)
 {
-	return __inet_lookup_listener(net, hashinfo, saddr, sport,
+	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
 				      daddr, ntohs(dport), dif);
 }
 
@@ -299,6 +301,7 @@ static inline struct sock *
 
 static inline struct sock *__inet_lookup(struct net *net,
 					 struct inet_hashinfo *hashinfo,
+					 struct sk_buff *skb, int doff,
 					 const __be32 saddr, const __be16 sport,
 					 const __be32 daddr, const __be16 dport,
 					 const int dif)
@@ -307,12 +310,13 @@ static inline struct sock *__inet_lookup(struct net *net,
 	struct sock *sk = __inet_lookup_established(net, hashinfo,
 				saddr, sport, daddr, hnum, dif);
 
-	return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
-					     daddr, hnum, dif);
+	return sk ? : __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
+					     sport, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct net *net,
 				       struct inet_hashinfo *hashinfo,
+				       struct sk_buff *skb, int doff,
 				       const __be32 saddr, const __be16 sport,
 				       const __be32 daddr, const __be16 dport,
 				       const int dif)
@@ -320,7 +324,8 @@ static inline struct sock *inet_lookup(struct net *net,
 	struct sock *sk;
 
 	local_bh_disable();
-	sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif);
+	sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+			   dport, dif);
 	local_bh_enable();
 
 	return sk;
@@ -328,6 +333,7 @@ static inline struct sock *inet_lookup(struct net *net,
 
 static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     struct sk_buff *skb,
+					     int doff,
 					     const __be16 sport,
 					     const __be16 dport)
 {
@@ -337,8 +343,8 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 	if (sk)
 		return sk;
 	else
-		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
-				     iph->saddr, sport,
+		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+				     doff, iph->saddr, sport,
 				     iph->daddr, dport, inet_iif(skb));
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 5684e14932bd..1e0c600c83ae 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -802,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
 	}
 
 lookup:
-	sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+	sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
 			       dh->dccph_sport, dh->dccph_dport);
 	if (!sk) {
 		dccp_pr_debug("failed to look up flow ID in table and "
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 90a8269b28d0..45cbe85f0940 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -668,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
 		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
 
 lookup:
-	sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+	sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
 			        dh->dccph_sport, dh->dccph_dport,
 				inet6_iif(skb));
 	if (!sk) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 6029157a19ed..50c0d96b8441 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
 	struct sock *sk;
 
 	if (req->sdiag_family == AF_INET)
-		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+		sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
 				 req->id.idiag_dport, req->id.idiag_src[0],
 				 req->id.idiag_sport, req->id.idiag_if);
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6) {
 		if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
 		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
-			sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+			sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
 					 req->id.idiag_dport, req->id.idiag_src[3],
 					 req->id.idiag_sport, req->id.idiag_if);
 		else
-			sk = inet6_lookup(net, hashinfo,
+			sk = inet6_lookup(net, hashinfo, NULL, 0,
 					  (struct in6_addr *)req->id.idiag_dst,
 					  req->id.idiag_dport,
 					  (struct in6_addr *)req->id.idiag_src,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b6023b7baae0..5e4290b83255 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -205,6 +205,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
+				    struct sk_buff *skb, int doff,
 				    const __be32 saddr, __be16 sport,
 				    const __be32 daddr, const unsigned short hnum,
 				    const int dif)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0d381fa164f8..3f872a6bc274 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -637,8 +637,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * Incoming packet is checked with md5 hash with finding key,
 		 * no RST generated if md5 hash doesn't match.
 		 */
-		sk1 = __inet_lookup_listener(net,
-					     &tcp_hashinfo, ip_hdr(skb)->saddr,
+		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
+					     ip_hdr(skb)->saddr,
 					     th->source, ip_hdr(skb)->daddr,
 					     ntohs(th->source), inet_iif(skb));
 		/* don't send rst if it can't find key */
@@ -1581,7 +1581,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
 lookup:
-	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+			       th->dest);
 	if (!sk)
 		goto no_tcp_socket;
 
@@ -1695,7 +1696,8 @@ do_time_wait:
 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 	case TCP_TW_SYN: {
 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
-							&tcp_hashinfo,
+							&tcp_hashinfo, skb,
+							__tcp_hdrlen(th),
 							iph->saddr, th->source,
 							iph->daddr, th->dest,
 							inet_iif(skb));
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 072653dd9c98..004345d26808 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -121,7 +121,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 struct sock *inet6_lookup_listener(struct net *net,
-		struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+		struct inet_hashinfo *hashinfo,
+		struct sk_buff *skb, int doff,
+		const struct in6_addr *saddr,
 		const __be16 sport, const struct in6_addr *daddr,
 		const unsigned short hnum, const int dif)
 {
@@ -177,6 +179,7 @@ begin:
 EXPORT_SYMBOL_GPL(inet6_lookup_listener);
 
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+			  struct sk_buff *skb, int doff,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif)
@@ -184,7 +187,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 	struct sock *sk;
 
 	local_bh_disable();
-	sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+	sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+			    ntohs(dport), dif);
 	local_bh_enable();
 
 	return sk;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d72bcfb326d8..9977b6f19f2a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -866,7 +866,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * no RST generated if md5 hash doesn't match.
 		 */
 		sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
-					   &tcp_hashinfo, &ipv6h->saddr,
+					   &tcp_hashinfo, NULL, 0,
+					   &ipv6h->saddr,
 					   th->source, &ipv6h->daddr,
 					   ntohs(th->source), tcp_v6_iif(skb));
 		if (!sk1)
@@ -1375,8 +1376,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	hdr = ipv6_hdr(skb);
 
 lookup:
-	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
-				inet6_iif(skb));
+	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+				th->source, th->dest, inet6_iif(skb));
 	if (!sk)
 		goto no_tcp_socket;
 
@@ -1500,6 +1501,7 @@ do_time_wait:
 		struct sock *sk2;
 
 		sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+					    skb, __tcp_hdrlen(th),
 					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
 					    ntohs(th->dest), tcp_v6_iif(skb));
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 3ab591e73ec0..7f4414d26a66 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
  * belonging to established connections going through that one.
  */
 static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
+	struct tcphdr *tcph;
 
 	switch (protocol) {
 	case IPPROTO_TCP:
 		switch (lookup_type) {
 		case NFT_LOOKUP_LISTENER:
-			sk = inet_lookup_listener(net, &tcp_hashinfo,
+			tcph = hp;
+			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+						    ip_hdrlen(skb) +
+						      __tcp_hdrlen(tcph),
 						    saddr, sport,
 						    daddr, dport,
 						    in->ifindex);
@@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 
 #ifdef XT_TPROXY_HAVE_IPV6
 static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
+	struct tcphdr *tcph;
 
 	switch (protocol) {
 	case IPPROTO_TCP:
 		switch (lookup_type) {
 		case NFT_LOOKUP_LISTENER:
-			sk = inet6_lookup_listener(net, &tcp_hashinfo,
+			tcph = hp;
+			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+						   thoff + __tcp_hdrlen(tcph),
 						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex);
@@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v4(net, iph->protocol,
+		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 					    iph->saddr, laddr ? laddr : iph->daddr,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NFT_LOOKUP_LISTENER);
@@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
 				   skb->dev, NFT_LOOKUP_ESTABLISHED);
@@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
 					   skb->dev, NFT_LOOKUP_LISTENER);
@@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(par->net, tproto,
+		sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
 					    &iph->saddr,
 					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
 					    hp->source,
@@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(par->net, tproto,
+	sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
 				   par->in, NFT_LOOKUP_ESTABLISHED);
@@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(par->net, tproto,
-					   &iph->saddr, laddr,
+		sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
 					   par->in, NFT_LOOKUP_LISTENER);
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ec08f04b816..49d14ecad444 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb,
  *     box.
  */
 static struct sock *
-xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in)
 {
 	switch (protocol) {
 	case IPPROTO_TCP:
-		return __inet_lookup(net, &tcp_hashinfo,
+		return __inet_lookup(net, &tcp_hashinfo, skb, doff,
 				     saddr, sport, daddr, dport,
 				     in->ifindex);
 	case IPPROTO_UDP:
@@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 					     const struct net_device *indev)
 {
 	const struct iphdr *iph = ip_hdr(skb);
+	struct sk_buff *data_skb = NULL;
+	int doff = 0;
 	__be32 uninitialized_var(daddr), uninitialized_var(saddr);
 	__be16 uninitialized_var(dport), uninitialized_var(sport);
 	u8 uninitialized_var(protocol);
@@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 		sport = hp->source;
 		daddr = iph->daddr;
 		dport = hp->dest;
+		data_skb = (struct sk_buff *)skb;
+		doff = iph->protocol == IPPROTO_TCP ?
+			ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+			ip_hdrlen(skb) + sizeof(*hp);
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
 		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
@@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 	}
 #endif
 
-	return xt_socket_get_sock_v4(net, protocol, saddr, daddr,
-				     sport, dport, indev);
+	return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+				     daddr, sport, dport, indev);
 }
 
 static bool
@@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb,
 }
 
 static struct sock *
-xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in)
 {
 	switch (protocol) {
 	case IPPROTO_TCP:
-		return inet6_lookup(net, &tcp_hashinfo,
+		return inet6_lookup(net, &tcp_hashinfo, skb, doff,
 				    saddr, sport, daddr, dport,
 				    in->ifindex);
 	case IPPROTO_UDP:
@@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 	__be16 uninitialized_var(dport), uninitialized_var(sport);
 	const struct in6_addr *daddr = NULL, *saddr = NULL;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct sk_buff *data_skb = NULL;
+	int doff = 0;
 	int thoff = 0, tproto;
 
 	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
@@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 		sport = hp->source;
 		daddr = &iph->daddr;
 		dport = hp->dest;
+		data_skb = (struct sk_buff *)skb;
+		doff = tproto == IPPROTO_TCP ?
+			thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+			thoff + sizeof(*hp);
 
 	} else if (tproto == IPPROTO_ICMPV6) {
 		struct ipv6hdr ipv6_var;
@@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 		return NULL;
 	}
 
-	return xt_socket_get_sock_v6(net, tproto, saddr, daddr,
+	return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
 				     sport, dport, indev);
 }
 
-- 
cgit v1.2.3


From c125e80b88687b25b321795457309eaaee4bf270 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:40 -0500
Subject: soreuseport: fast reuseport TCP socket selection

This change extends the fast SO_REUSEPORT socket lookup implemented
for UDP to TCP.  Listener sockets with SO_REUSEPORT and the same
receive address are additionally added to an array for faster
random access.  This means that only a single socket from the group
must be found in the listener list before any socket in the group can
be used to receive a packet.  Previously, every socket in the group
needed to be considered before handing off the incoming packet.

This feature also exposes the ability to use a BPF program when
selecting a socket from a reuseport group.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h    |  5 +++-
 net/ipv4/inet_connection_sock.c  | 14 ++++++---
 net/ipv4/inet_hashtables.c       | 64 +++++++++++++++++++++++++++++++++++++---
 net/ipv4/udp.c                   |  4 +--
 net/ipv6/inet6_connection_sock.c |  2 ++
 net/ipv6/inet6_hashtables.c      | 16 +++++++++-
 6 files changed, 93 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 82403390af58..50f635c2c536 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
-void __inet_hash(struct sock *sk, struct sock *osk);
+int __inet_hash(struct sock *sk, struct sock *osk,
+		int (*saddr_same)(const struct sock *sk1,
+				  const struct sock *sk2,
+				  bool match_wildcard));
 int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 12c8d389dc18..c16a2e6273d9 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 #include <net/tcp.h>
+#include <net/sock_reuseport.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			if ((!reuse || !sk2->sk_reuse ||
 			    sk2->sk_state == TCP_LISTEN) &&
 			    (!reuseport || !sk2->sk_reuseport ||
-			    (sk2->sk_state != TCP_TIME_WAIT &&
+			     rcu_access_pointer(sk->sk_reuseport_cb) ||
+			     (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
 
 				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -132,6 +134,7 @@ again:
 					      sk->sk_state != TCP_LISTEN) ||
 					     (tb->fastreuseport > 0 &&
 					      sk->sk_reuseport &&
+					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 					      uid_eq(tb->fastuid, uid))) &&
 					    (tb->num_owners < smallest_size || smallest_size == -1)) {
 						smallest_size = tb->num_owners;
@@ -193,15 +196,18 @@ tb_found:
 		if (((tb->fastreuse > 0 &&
 		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-		    smallest_size == -1) {
+		      sk->sk_reuseport &&
+		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
 			goto success;
 		} else {
 			ret = 1;
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
 				    smallest_size != -1 && --attempts >= 0) {
 					spin_unlock(&head->lock);
 					goto again;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5e4290b83255..c0f9942de924 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
 
+#include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
+#include <net/sock_reuseport.h>
 
 static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
 			const __u16 lport, const __be32 faddr,
@@ -215,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore, matches = 0, reuseport = 0;
+	bool select_ok = true;
 	u32 phash = 0;
 
 	rcu_read_lock();
@@ -230,6 +233,15 @@ begin:
 			if (reuseport) {
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
+				if (select_ok) {
+					struct sock *sk2;
+					sk2 = reuseport_select_sock(sk, phash,
+								    skb, doff);
+					if (sk2) {
+						result = sk2;
+						goto found;
+					}
+				}
 				matches = 1;
 			}
 		} else if (score == hiscore && reuseport) {
@@ -247,11 +259,13 @@ begin:
 	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
 		goto begin;
 	if (result) {
+found:
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
 		else if (unlikely(compute_score(result, net, hnum, daddr,
 				  dif) < hiscore)) {
 			sock_put(result);
+			select_ok = false;
 			goto begin;
 		}
 	}
@@ -450,34 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 }
 EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
 
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+				   struct inet_listen_hashbucket *ilb,
+				   int (*saddr_same)(const struct sock *sk1,
+						     const struct sock *sk2,
+						     bool match_wildcard))
+{
+	struct sock *sk2;
+	struct hlist_nulls_node *node;
+	kuid_t uid = sock_i_uid(sk);
+
+	sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+		if (sk2 != sk &&
+		    sk2->sk_family == sk->sk_family &&
+		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    saddr_same(sk, sk2, false))
+			return reuseport_add_sock(sk, sk2);
+	}
+
+	/* Initial allocation may have already happened via setsockopt */
+	if (!rcu_access_pointer(sk->sk_reuseport_cb))
+		return reuseport_alloc(sk);
+	return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+		 int (*saddr_same)(const struct sock *sk1,
+				   const struct sock *sk2,
+				   bool match_wildcard))
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
+	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
 		inet_ehash_nolisten(sk, osk);
-		return;
+		return 0;
 	}
 	WARN_ON(!sk_unhashed(sk));
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
 	spin_lock(&ilb->lock);
+	if (sk->sk_reuseport) {
+		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+		if (err)
+			goto unlock;
+	}
 	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
 	spin_unlock(&ilb->lock);
+
+	return err;
 }
 EXPORT_SYMBOL(__inet_hash);
 
 int inet_hash(struct sock *sk)
 {
+	int err = 0;
+
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk, NULL);
+		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
 		local_bh_enable();
 	}
 
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
@@ -496,6 +550,8 @@ void inet_unhash(struct sock *sk)
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	spin_lock_bh(lock);
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		reuseport_detach_sock(sk);
 	done = __sk_nulls_del_node_init_rcu(sk);
 	if (done)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index be0b21852b13..ac3cedb25a9f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port);
  * match_wildcard == false: addresses must be exactly the same, i.e.
  *                          0.0.0.0 only equals to 0.0.0.0
  */
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
-				bool match_wildcard)
+int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+			 bool match_wildcard)
 {
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 36c3f0155010..532c3ef282c5 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -26,6 +26,7 @@
 #include <net/ip6_route.h>
 #include <net/sock.h>
 #include <net/inet6_connection_sock.h>
+#include <net/sock_reuseport.h>
 
 int inet6_csk_bind_conflict(const struct sock *sk,
 			    const struct inet_bind_bucket *tb, bool relax)
@@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			if ((!reuse || !sk2->sk_reuse ||
 			     sk2->sk_state == TCP_LISTEN) &&
 			    (!reuseport || !sk2->sk_reuseport ||
+			     rcu_access_pointer(sk->sk_reuseport_cb) ||
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			      !uid_eq(uid,
 				      sock_i_uid((struct sock *)sk2))))) {
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 004345d26808..70f2628be6fa 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -17,11 +17,13 @@
 #include <linux/module.h>
 #include <linux/random.h>
 
+#include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
+#include <net/sock_reuseport.h>
 
 u32 inet6_ehashfn(const struct net *net,
 		  const struct in6_addr *laddr, const u16 lport,
@@ -131,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 	const struct hlist_nulls_node *node;
 	struct sock *result;
 	int score, hiscore, matches = 0, reuseport = 0;
+	bool select_ok = true;
 	u32 phash = 0;
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -148,6 +151,15 @@ begin:
 			if (reuseport) {
 				phash = inet6_ehashfn(net, daddr, hnum,
 						      saddr, sport);
+				if (select_ok) {
+					struct sock *sk2;
+					sk2 = reuseport_select_sock(sk, phash,
+								    skb, doff);
+					if (sk2) {
+						result = sk2;
+						goto found;
+					}
+				}
 				matches = 1;
 			}
 		} else if (score == hiscore && reuseport) {
@@ -165,11 +177,13 @@ begin:
 	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
 		goto begin;
 	if (result) {
+found:
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
 		else if (unlikely(compute_score(result, net, hnum, daddr,
 				  dif) < hiscore)) {
 			sock_put(result);
+			select_ok = false;
 			goto begin;
 		}
 	}
@@ -283,7 +297,7 @@ int inet6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk, NULL);
+		__inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
 		local_bh_enable();
 	}
 
-- 
cgit v1.2.3


From 12b74dfadb5a7a23baf4db941dc9fd9d371f249a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:17 +0100
Subject: ipv4: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv4 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Additionally, enabling this option provides compliance with a SHOULD
clause of RFC 1122.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/uapi/linux/ip.h                |  1 +
 net/ipv4/devinet.c                     |  2 ++
 net/ipv4/ip_input.c                    | 25 ++++++++++++++++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 73b36d7c7b0d..d5910d63214d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1216,6 +1216,13 @@ promote_secondaries - BOOLEAN
 	promote a corresponding secondary IP address instead of
 	removing all the corresponding secondary IP addresses.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IP packets that are received in link-layer
+	multicast (or broadcast) frames.
+	This behavior (for broadcast) is actually a SHOULD in RFC
+	1122, but is disabled by default for compatibility reasons.
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 08f894d2ddbd..584834f7e95c 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -165,6 +165,7 @@ enum
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebd9d31e65a..dbbab28a52a4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2192,6 +2192,8 @@ static struct devinet_sysctl_table {
 					      "promote_secondaries"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
 					      "route_localnet"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+					      "drop_unicast_in_l2_multicast"),
 	},
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d77eb0c3b684..852002f64c68 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -362,8 +362,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
-	} else if (rt->rt_type == RTN_BROADCAST)
+	} else if (rt->rt_type == RTN_BROADCAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+	} else if (skb->pkt_type == PACKET_BROADCAST ||
+		   skb->pkt_type == PACKET_MULTICAST) {
+		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+		/* RFC 1122 3.3.6:
+		 *
+		 *   When a host sends a datagram to a link-layer broadcast
+		 *   address, the IP destination address MUST be a legal IP
+		 *   broadcast or IP multicast address.
+		 *
+		 *   A host SHOULD silently discard a datagram that is received
+		 *   via a link-layer broadcast (see Section 2.4) but does not
+		 *   specify an IP multicast or broadcast destination address.
+		 *
+		 * This doesn't explicitly say L2 *multicast*, but broadcast is
+		 * in a way a form of multicast and the most common use case for
+		 * this is 802.11 protecting against cross-station spoofing (the
+		 * so-called "hole-196" attack) so do it for both.
+		 */
+		if (in_dev &&
+		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+			goto drop;
+	}
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3


From 97daf331455077645ae1f13438bebd3d1a2e94ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:18 +0100
Subject: ipv4: add option to drop gratuitous ARP packets

In certain 802.11 wireless deployments, there will be ARP proxies
that use knowledge of the network to correctly answer requests.
To prevent gratuitous ARP frames on the shared medium from being
a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_gratuitous_arp".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 6 ++++++
 include/uapi/linux/ip.h                | 1 +
 net/ipv4/arp.c                         | 8 ++++++++
 net/ipv4/devinet.c                     | 2 ++
 4 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d5910d63214d..a53bbfaff1c7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1223,6 +1223,12 @@ drop_unicast_in_l2_multicast - BOOLEAN
 	1122, but is disabled by default for compatibility reasons.
 	Default: off (0)
 
+drop_gratuitous_arp - BOOLEAN
+	Drop all gratuitous ARP frames, for example if there's a known
+	good ARP proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 584834f7e95c..f291569768dd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -166,6 +166,7 @@ enum
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8fd51..c102eb5ac55c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;
 
+ /*
+  *	For some 802.11 wireless deployments (and possibly other networks),
+  *	there will be an ARP proxy and gratuitous ARP frames are attacks
+  *	and thus should not be accepted.
+  */
+	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+		goto out;
+
 /*
  *     Special case: We must set Frame Relay source Q.922 address
  */
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dbbab28a52a4..3d835313575e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2185,6 +2185,8 @@ static struct devinet_sysctl_table {
 					"igmpv3_unsolicited_report_interval"),
 		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
 					"ignore_routes_with_linkdown"),
+		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+					"drop_gratuitous_arp"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-- 
cgit v1.2.3


From abbc30436d39dfed8ebfca338d253f211ac7b094 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:19 +0100
Subject: ipv6: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv6 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    |  8 ++++++++
 net/ipv6/ip6_input.c                   | 10 ++++++++++
 5 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a53bbfaff1c7..e0e7350a4e6a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1674,6 +1674,12 @@ stable_secret - IPv6 address
 
 	By default the stable secret is unset.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IPv6 packets that are received in link-layer
+	multicast (or broadcast) frames.
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 402753bccafa..4a4c1ae826cb 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		mc_forwarding;
 #endif
 	__s32		disable_ipv6;
+	__s32		drop_unicast_in_l2_multicast;
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 38b4fef20219..4c413570efe8 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -174,6 +174,7 @@ enum {
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 38eeddedfc21..23e325f39f8e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4711,6 +4711,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5784,6 +5785,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
+		{
+			.procname	= "drop_unicast_in_l2_multicast",
+			.data		= &ipv6_devconf.drop_unicast_in_l2_multicast,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..31ac3c56da4b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	    IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
 		goto err;
 
+	/* If enabled, drop unicast packets that were encapsulated in link-layer
+	 * multicast or broadcast to protect against the so-called "hole-196"
+	 * attack in 802.11 wireless.
+	 */
+	if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+	    (skb->pkt_type == PACKET_BROADCAST ||
+	     skb->pkt_type == PACKET_MULTICAST) &&
+	    idev->cnf.drop_unicast_in_l2_multicast)
+		goto err;
+
 	/* RFC4291 2.7
 	 * Nodes must not originate a packet to a multicast address whose scope
 	 * field contains the reserved value 0; if such a packet is received, it
-- 
cgit v1.2.3


From 7a02bf892d8f1e5298af1676f001bee410509d80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:20 +0100
Subject: ipv6: add option to drop unsolicited neighbor advertisements

In certain 802.11 wireless deployments, there will be NA proxies
that use knowledge of the network to correctly answer requests.
To prevent unsolicited advertisements on the shared medium from
being a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_unsolicited_na".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/ipv6.h                   | 1 +
 include/uapi/linux/ipv6.h              | 1 +
 net/ipv6/addrconf.c                    | 8 ++++++++
 net/ipv6/ndisc.c                       | 9 +++++++++
 5 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e0e7350a4e6a..24ce97f42d35 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1680,6 +1680,13 @@ drop_unicast_in_l2_multicast - BOOLEAN
 
 	By default this is turned off.
 
+drop_unsolicited_na - BOOLEAN
+	Drop all unsolicited neighbor advertisements, for example if there's
+	a known good NA proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4a4c1ae826cb..4b2267e1b7c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -56,6 +56,7 @@ struct ipv6_devconf {
 	__s32           ndisc_notify;
 	__s32		suppress_frag_ndisc;
 	__s32		accept_ra_mtu;
+	__s32		drop_unsolicited_na;
 	struct ipv6_stable_secret {
 		bool initialized;
 		struct in6_addr secret;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4c413570efe8..ec117b65d5a5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -175,6 +175,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23e325f39f8e..ac0ba9e4e06b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4712,6 +4712,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
+	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5792,6 +5793,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "drop_unsolicited_na",
+			.data		= &ipv6_devconf.drop_unsolicited_na,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 84afb9a77278..c245895a3d41 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 				    offsetof(struct nd_msg, opt));
 	struct ndisc_options ndopts;
 	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(dev);
 	struct inet6_ifaddr *ifp;
 	struct neighbour *neigh;
 
@@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		return;
 	}
 
+	/* For some 802.11 wireless deployments (and possibly other networks),
+	 * there will be a NA proxy and unsolicited packets are attacks
+	 * and thus should not be accepted.
+	 */
+	if (!msg->icmph.icmp6_solicited && idev &&
+	    idev->cnf.drop_unsolicited_na)
+		return;
+
 	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
-- 
cgit v1.2.3


From 72bb68721f80a1441e871b6afc9ab0b3793d5031 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 5 Feb 2016 11:16:21 +0000
Subject: ethtool: add IPv6 to the NFC API

Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index b2e180181629..5e0940dcbfe8 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -748,6 +748,56 @@ struct ethtool_usrip4_spec {
 	__u8    proto;
 };
 
+/**
+ * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc.
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow.
+ */
+struct ethtool_tcpip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be16	psrc;
+	__be16	pdst;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @spi: Security parameters index
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv6.
+ */
+struct ethtool_ah_espip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	spi;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tclass: Traffic Class
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	l4_4_bytes;
+	__u8    tclass;
+	__u8    l4_proto;
+};
+
 union ethtool_flow_union {
 	struct ethtool_tcpip4_spec		tcp_ip4_spec;
 	struct ethtool_tcpip4_spec		udp_ip4_spec;
@@ -755,6 +805,12 @@ union ethtool_flow_union {
 	struct ethtool_ah_espip4_spec		ah_ip4_spec;
 	struct ethtool_ah_espip4_spec		esp_ip4_spec;
 	struct ethtool_usrip4_spec		usr_ip4_spec;
+	struct ethtool_tcpip6_spec		tcp_ip6_spec;
+	struct ethtool_tcpip6_spec		udp_ip6_spec;
+	struct ethtool_tcpip6_spec		sctp_ip6_spec;
+	struct ethtool_ah_espip6_spec		ah_ip6_spec;
+	struct ethtool_ah_espip6_spec		esp_ip6_spec;
+	struct ethtool_usrip6_spec		usr_ip6_spec;
 	struct ethhdr				ether_spec;
 	__u8					hdata[52];
 };
@@ -1401,15 +1457,17 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
 #define	SCTP_V4_FLOW	0x03	/* hash or spec (sctp_ip4_spec) */
 #define	AH_ESP_V4_FLOW	0x04	/* hash only */
-#define	TCP_V6_FLOW	0x05	/* hash only */
-#define	UDP_V6_FLOW	0x06	/* hash only */
-#define	SCTP_V6_FLOW	0x07	/* hash only */
+#define	TCP_V6_FLOW	0x05	/* hash or spec (tcp_ip6_spec; nfc only) */
+#define	UDP_V6_FLOW	0x06	/* hash or spec (udp_ip6_spec; nfc only) */
+#define	SCTP_V6_FLOW	0x07	/* hash or spec (sctp_ip6_spec; nfc only) */
 #define	AH_ESP_V6_FLOW	0x08	/* hash only */
 #define	AH_V4_FLOW	0x09	/* hash or spec (ah_ip4_spec) */
 #define	ESP_V4_FLOW	0x0a	/* hash or spec (esp_ip4_spec) */
-#define	AH_V6_FLOW	0x0b	/* hash only */
-#define	ESP_V6_FLOW	0x0c	/* hash only */
-#define	IP_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	AH_V6_FLOW	0x0b	/* hash or spec (ah_ip6_spec; nfc only) */
+#define	ESP_V6_FLOW	0x0c	/* hash or spec (esp_ip6_spec; nfc only) */
+#define	IPV4_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	IP_USER_FLOW	IPV4_USER_FLOW
+#define	IPV6_USER_FLOW	0x0e	/* spec only (usr_ip6_spec; nfc only) */
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
-- 
cgit v1.2.3


From 76443456227097179c14826425f88a95d81a892e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:37 -0800
Subject: net: Move GSO csum into SKB_GSO_CB

This patch moves the checksum maintained by GSO out of skb->csum and into
the GSO context block in order to allow for us to work on outer checksums
while maintaining the inner checksum offsets in the case of the inner
checksum being offloaded, while the outer checksums will be computed.

While updating the code I also did a minor cleanup on gso_make_checksum.
The change is mostly to make it so that we store the values and compute the
checksum instead of computing the checksum and then storing the values we
needed to update.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 14 +++++++-------
 net/core/skbuff.c      | 16 +++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 11f935c1a090..acece7ce376f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3549,6 +3549,7 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 struct skb_gso_cb {
 	int	mac_offset;
 	int	encap_level;
+	__wsum	csum;
 	__u16	csum_start;
 };
 #define SKB_SGO_CB_OFFSET	32
@@ -3585,15 +3586,14 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
  */
 static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
 {
-	int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) -
-		   skb_transport_offset(skb);
-	__wsum partial;
+	unsigned char *csum_start = skb_transport_header(skb);
+	int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start;
+	__wsum partial = SKB_GSO_CB(skb)->csum;
 
-	partial = csum_partial(skb_transport_header(skb), plen, skb->csum);
-	skb->csum = res;
-	SKB_GSO_CB(skb)->csum_start -= plen;
+	SKB_GSO_CB(skb)->csum = res;
+	SKB_GSO_CB(skb)->csum_start = csum_start - skb->head;
 
-	return csum_fold(partial);
+	return csum_fold(csum_partial(csum_start, plen, partial));
 }
 
 static inline bool skb_is_gso(const struct sk_buff *skb)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2df375ec9c2..02c638a643ea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3100,11 +3100,12 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 		if (!sg && !nskb->remcsum_offload) {
 			nskb->ip_summed = CHECKSUM_NONE;
-			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
-							    skb_put(nskb, len),
-							    len, 0);
+			SKB_GSO_CB(nskb)->csum =
+				skb_copy_and_csum_bits(head_skb, offset,
+						       skb_put(nskb, len),
+						       len, 0);
 			SKB_GSO_CB(nskb)->csum_start =
-			    skb_headroom(nskb) + doffset;
+				skb_headroom(nskb) + doffset;
 			continue;
 		}
 
@@ -3171,11 +3172,12 @@ skip_fraglist:
 
 perform_csum_check:
 		if (!csum && !nskb->remcsum_offload) {
-			nskb->csum = skb_checksum(nskb, doffset,
-						  nskb->len - doffset, 0);
 			nskb->ip_summed = CHECKSUM_NONE;
+			SKB_GSO_CB(nskb)->csum =
+				skb_checksum(nskb, doffset,
+					     nskb->len - doffset, 0);
 			SKB_GSO_CB(nskb)->csum_start =
-			    skb_headroom(nskb) + doffset;
+				skb_headroom(nskb) + doffset;
 		}
 	} while ((offset += len) < head_skb->len);
 
-- 
cgit v1.2.3


From 08b64fcca942733413bc5ac2321d57021d3e8578 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:49 -0800
Subject: net: Store checksum result for offloaded GSO checksums

This patch makes it so that we can offload the checksums for a packet up
to a certain point and then begin computing the checksums via software.
Setting this up is fairly straightforward as all we need to do is reset
the values stored in csum and csum_start for the GSO context block.

One complication for this is remote checksum offload.  In order to allow
the inner checksums to be offloaded while computing the outer checksum
manually we needed to have some way of indicating that the offload wasn't
real.  In order to do that I replaced CHECKSUM_PARTIAL with
CHECKSUM_UNNECESSARY in the case of us computing checksums for the outer
header while skipping computing checksums for the inner headers.  We clean
up the ip_summed flag and set it to either CHECKSUM_PARTIAL or
CHECKSUM_NONE once we hand the packet off to the next lower level.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 net/ipv4/tcp_offload.c |  8 ++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index acece7ce376f..a8fc2220e8ce 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2161,6 +2161,11 @@ static inline int skb_checksum_start_offset(const struct sk_buff *skb)
 	return skb->csum_start - skb_headroom(skb);
 }
 
+static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
+{
+	return skb->head + skb->csum_start;
+}
+
 static inline int skb_transport_offset(const struct sk_buff *skb)
 {
 	return skb_transport_header(skb) - skb->data;
@@ -3576,6 +3581,16 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
 	return 0;
 }
 
+static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
+{
+	/* Do not update partial checksums if remote checksum is enabled. */
+	if (skb->remcsum_offload)
+		return;
+
+	SKB_GSO_CB(skb)->csum = res;
+	SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
+}
+
 /* Compute the checksum for a gso segment. First compute the checksum value
  * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
  * then add in skb->csum (checksum from csum_start to end of packet).
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9864a2dbadce..773083b7f1e9 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		th->fin = th->psh = 0;
 		th->check = newcheck;
 
-		if (skb->ip_summed != CHECKSUM_PARTIAL)
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(skb, ~th->check);
+		else
 			th->check = gso_make_checksum(skb, ~th->check);
 
 		seq += mss;
@@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		      skb->data_len);
 	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
 				(__force u32)delta));
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(skb, ~th->check);
+	else
 		th->check = gso_make_checksum(skb, ~th->check);
 out:
 	return segs;
-- 
cgit v1.2.3


From 4456ed04ea44b800d691b18c14a68ec9894d2aca Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 7 Feb 2016 23:27:55 +0200
Subject: ethtool: future-proof interface for speed extensions

Many virtual and not quite virtual devices allow any speed to be set
through ethtool. In particular, this applies to the virtio-net devices.
Document this fact to make sure people don't assume the enum lists all
possible values.  Reserve values greater than INT_MAX for future
extension and to avoid conflict with SPEED_UNKNOWN.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 5e0940dcbfe8..4345f80a2e33 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -31,7 +31,7 @@
  *	physical connectors and other link features that are
  *	advertised through autonegotiation or enabled for
  *	auto-detection.
- * @speed: Low bits of the speed
+ * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @duplex: Duplex mode; one of %DUPLEX_*
  * @port: Physical connector type; one of %PORT_*
  * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
@@ -47,7 +47,7 @@
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
  * @maxrxpkt: Historically used to report RX IRQ coalescing; now
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
- * @speed_hi: High bits of the speed
+ * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
  *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
  *	value will be %ETH_TP_MDI_INVALID.  Read-only.
@@ -1359,7 +1359,7 @@ enum ethtool_sfeatures_retval_bits {
  * it was forced up into this mode or autonegotiated.
  */
 
-/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */
+/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */
 #define SPEED_10		10
 #define SPEED_100		100
 #define SPEED_1000		1000
-- 
cgit v1.2.3


From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Fri, 5 Feb 2016 09:20:52 -0700
Subject: openvswitch: allow management from inside user namespaces

Operations with the GENL_ADMIN_PERM flag fail permissions checks because
this flag means we call netlink_capable, which uses the init user ns.

Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations
which should be allowed inside a user namespace.

The motivation for this is to be able to run openvswitch in unprivileged
containers. I've tested this and it seems to work, but I really have no
idea about the security consequences of this patch, so thoughts would be
much appreciated.

v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function
v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one
    massive one

Reported-by: James Page <james.page@canonical.com>
Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Pravin Shelar <pshelar@ovn.org>
CC: Justin Pettit <jpettit@nicira.com>
CC: "David S. Miller" <davem@davemloft.net>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        |  4 ++++
 net/openvswitch/datapath.c     | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c3363ba1ae05..5512c90af7e3 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -21,6 +21,7 @@ struct genlmsghdr {
 #define GENL_CMD_CAP_DO		0x02
 #define GENL_CMD_CAP_DUMP	0x04
 #define GENL_CMD_CAP_HASPOL	0x08
+#define GENL_UNS_ADMIN_PERM	0x10
 
 /*
  * List of reserved static generic netlink identifiers:
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f830326b3b1d..0ffd721126e7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -580,6 +580,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
+	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
 		int rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..d6f7fe92744a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_packet_genl_ops[] = {
 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = packet_policy,
 	  .doit = ovs_packet_cmd_execute
 	}
@@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_flow_genl_ops[] = {
 	{ .cmd = OVS_FLOW_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_new
 	},
 	{ .cmd = OVS_FLOW_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_del
 	},
@@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 	  .dumpit = ovs_flow_cmd_dump
 	},
 	{ .cmd = OVS_FLOW_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_set,
 	},
@@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_datapath_genl_ops[] = {
 	{ .cmd = OVS_DP_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_new
 	},
 	{ .cmd = OVS_DP_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_del
 	},
@@ -1793,7 +1793,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 	  .dumpit = ovs_dp_cmd_dump
 	},
 	{ .cmd = OVS_DP_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_set,
 	},
@@ -2158,12 +2158,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_vport_genl_ops[] = {
 	{ .cmd = OVS_VPORT_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_new
 	},
 	{ .cmd = OVS_VPORT_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_del
 	},
@@ -2174,7 +2174,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 	  .dumpit = ovs_vport_cmd_dump
 	},
 	{ .cmd = OVS_VPORT_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_set,
 	},
-- 
cgit v1.2.3


From 815c52700746cdcc0874a33390bac334a4b90107 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:21 +0200
Subject: igmp: Namespaceify igmp_max_memberships sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  2 ++
 net/ipv4/igmp.c            |  4 +---
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  2 ++
 5 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 9c9de11549a7..57d6d06ce0b3 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -38,7 +38,6 @@ static inline struct igmpv3_query *
 }
 
 extern int sysctl_igmp_llm_reports;
-extern int sysctl_igmp_max_memberships;
 extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 4d6ec3f6fafe..759cf624eec2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -108,6 +108,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_fin_timeout;
 	unsigned int sysctl_tcp_notsent_lowat;
 
+	int sysctl_igmp_max_memberships;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 05e4cba14162..5b86257c9d6b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,7 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-#define IP_MAX_MEMBERSHIPS	20
 #define IP_MAX_MSF		10
 
 /* IGMP reports for link-local multicast groups are enabled by default */
@@ -1727,7 +1726,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
 int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
 #ifdef CONFIG_IP_MULTICAST
 int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
@@ -2074,7 +2072,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
 		count++;
 	}
 	err = -ENOBUFS;
-	if (count >= sysctl_igmp_max_memberships)
+	if (count >= net->ipv4.sysctl_igmp_max_memberships)
 		goto done;
 	iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
 	if (!iml)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44bb59824267..6ea3dbb96db4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,13 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "igmp_max_memberships",
-		.data		= &sysctl_igmp_max_memberships,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "igmp_max_msf",
 		.data		= &sysctl_igmp_max_msf,
@@ -871,6 +864,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "igmp_max_memberships",
+		.data		= &init_net.ipv4.sysctl_igmp_max_memberships,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3f872a6bc274..4b203789900b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2399,6 +2399,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
+	net->ipv4.sysctl_igmp_max_memberships = 20;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
-- 
cgit v1.2.3


From 166b6b2d6f01be67a83b87ab5c91350a68b17115 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:22 +0200
Subject: igmp: Namespaceify igmp_max_msf sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            |  5 +----
 net/ipv4/ip_sockglue.c     |  5 +++--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 6 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 57d6d06ce0b3..a91ec9f575e7 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -38,7 +38,6 @@ static inline struct igmpv3_query *
 }
 
 extern int sysctl_igmp_llm_reports;
-extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
 
 struct ip_sf_socklist {
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 759cf624eec2..522a2cfe1ad9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -109,6 +109,7 @@ struct netns_ipv4 {
 	unsigned int sysctl_tcp_notsent_lowat;
 
 	int sysctl_igmp_max_memberships;
+	int sysctl_igmp_max_msf;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5b86257c9d6b..6da2e467b63c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,8 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-#define IP_MAX_MSF		10
-
 /* IGMP reports for link-local multicast groups are enabled by default */
 int sysctl_igmp_llm_reports __read_mostly = 1;
 
@@ -1726,7 +1724,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
 #ifdef CONFIG_IP_MULTICAST
 int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
 #endif
@@ -2244,7 +2241,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	}
 	/* else, add a new source to the filter */
 
-	if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+	if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
 		err = -ENOBUFS;
 		goto done;
 	}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5f73a7c03e27..92808f147ef5 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -571,6 +571,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			    int optname, char __user *optval, unsigned int optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
 	int val = 0, err;
 	bool needs_rtnl = setsockopt_needs_rtnl(optname);
 
@@ -910,7 +911,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		/* numsrc >= (1G-4) overflow in 32 bits */
 		if (msf->imsf_numsrc >= 0x3ffffffcU ||
-		    msf->imsf_numsrc > sysctl_igmp_max_msf) {
+		    msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
 			kfree(msf);
 			err = -ENOBUFS;
 			break;
@@ -1065,7 +1066,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
 		/* numsrc >= (4G-140)/128 overflow in 32 bits */
 		if (gsf->gf_numsrc >= 0x1ffffff ||
-		    gsf->gf_numsrc > sysctl_igmp_max_msf) {
+		    gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
 			err = -ENOBUFS;
 			goto mc_msf_out;
 		}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6ea3dbb96db4..225659a02cf2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,13 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "igmp_max_msf",
-		.data		= &sysctl_igmp_max_msf,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_qrv",
@@ -871,6 +864,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "igmp_max_msf",
+		.data		= &init_net.ipv4.sysctl_igmp_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4b203789900b..055d8a9a0c61 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2400,6 +2400,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
 	net->ipv4.sysctl_igmp_max_memberships = 20;
+	net->ipv4.sysctl_igmp_max_msf = 10;
 
 	return 0;
 fail:
-- 
cgit v1.2.3


From 87a8a2ae65b7721893c7922f963502be8fa01c94 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <n.borisov@siteground.com>
Date: Tue, 9 Feb 2016 00:13:50 +0200
Subject: igmp: Namespaceify igmp_llm_reports sysctl knob

This was initially introduced in df2cf4a78e488d26 ("IGMP: Inhibit
reports for local multicast groups") by defining the sysctl in the
ipv4_net_table array, however it was never implemented to be
namespace aware. Fix this by changing the code accordingly.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            | 26 +++++++++++++++-----------
 net/ipv4/sysctl_net_ipv4.c |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 ++
 5 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index a91ec9f575e7..c683f4bf642b 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,7 +37,6 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
-extern int sysctl_igmp_llm_reports;
 extern int sysctl_igmp_qrv;
 
 struct ip_sf_socklist {
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 522a2cfe1ad9..cbbf8115e8a7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -110,6 +110,7 @@ struct netns_ipv4 {
 
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
+	int sysctl_igmp_llm_reports;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6da2e467b63c..2e22ee0efc98 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,9 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-/* IGMP reports for link-local multicast groups are enabled by default */
-int sysctl_igmp_llm_reports __read_mostly = 1;
-
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
 
@@ -430,6 +427,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	int type, int gdeleted, int sdeleted)
 {
 	struct net_device *dev = pmc->interface->dev;
+	struct net *net = dev_net(dev);
 	struct igmpv3_report *pih;
 	struct igmpv3_grec *pgr = NULL;
 	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
@@ -437,7 +435,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
-	if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return skb;
 
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
@@ -540,6 +538,7 @@ empty_source:
 static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 {
 	struct sk_buff *skb = NULL;
+	struct net *net = dev_net(in_dev->dev);
 	int type;
 
 	if (!pmc) {
@@ -548,7 +547,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
 			if (ipv4_is_local_multicast(pmc->multiaddr) &&
-			     !sysctl_igmp_llm_reports)
+			     !net->ipv4.sysctl_igmp_llm_reports)
 				continue;
 			spin_lock_bh(&pmc->lock);
 			if (pmc->sfcount[MCAST_EXCLUDE])
@@ -684,7 +683,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
 		return igmpv3_send_report(in_dev, pmc);
 
-	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
 		return 0;
 
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
@@ -855,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
+	struct net *net = dev_net(in_dev->dev);
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
 		return false;
-	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
 		return false;
 
 	rcu_read_lock();
@@ -884,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	__be32			group = ih->group;
 	int			max_delay;
 	int			mark = 0;
+	struct net		*net = dev_net(in_dev->dev);
 
 
 	if (len == 8) {
@@ -969,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 		if (ipv4_is_local_multicast(im->multiaddr) &&
-		    !sysctl_igmp_llm_reports)
+		    !net->ipv4.sysctl_igmp_llm_reports)
 			continue;
 		spin_lock_bh(&im->lock);
 		if (im->tm_running)
@@ -1184,6 +1185,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 {
 	struct in_device *in_dev = im->interface;
 #ifdef CONFIG_IP_MULTICAST
+	struct net *net = dev_net(in_dev->dev);
 	int reporter;
 #endif
 
@@ -1195,7 +1197,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
-	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return;
 
 	reporter = im->reporter;
@@ -1220,6 +1222,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 static void igmp_group_added(struct ip_mc_list *im)
 {
 	struct in_device *in_dev = im->interface;
+	struct net *net = dev_net(in_dev->dev);
 
 	if (im->loaded == 0) {
 		im->loaded = 1;
@@ -1229,7 +1232,7 @@ static void igmp_group_added(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
-	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return;
 
 	if (in_dev->dead)
@@ -1530,6 +1533,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 	struct ip_mc_list *im;
 	int type;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
@@ -1537,7 +1541,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 		if (ipv4_is_local_multicast(im->multiaddr) &&
-		    !sysctl_igmp_llm_reports)
+		    !net->ipv4.sysctl_igmp_llm_reports)
 			continue;
 
 		/* a failover is happening and switches
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 225659a02cf2..fc40fa1303d3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -852,7 +852,7 @@ static struct ctl_table ipv4_net_table[] = {
 	},
 	{
 		.procname	= "igmp_link_local_mcast_reports",
-		.data		= &sysctl_igmp_llm_reports,
+		.data		= &init_net.ipv4.sysctl_igmp_llm_reports,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 055d8a9a0c61..6c3c1d5232c6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2401,6 +2401,8 @@ static int __net_init tcp_sk_init(struct net *net)
 
 	net->ipv4.sysctl_igmp_max_memberships = 20;
 	net->ipv4.sysctl_igmp_max_msf = 10;
+	/* IGMP reports for link-local multicast groups are enabled by default */
+	net->ipv4.sysctl_igmp_llm_reports = 1;
 
 	return 0;
 fail:
-- 
cgit v1.2.3


From 165094afcee79e4d5b6e94032a5d3be157460b4a Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:24 +0200
Subject: igmp: Namespacify igmp_qrv sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  2 --
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            | 29 +++++++++++++++++------------
 net/ipv4/sysctl_net_ipv4.c | 20 ++++++++++----------
 net/ipv4/tcp_ipv4.c        |  1 +
 5 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index c683f4bf642b..12f6fba6d21a 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,8 +37,6 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
-extern int sysctl_igmp_qrv;
-
 struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index cbbf8115e8a7..848fe8056534 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -111,6 +111,7 @@ struct netns_ipv4 {
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
 	int sysctl_igmp_llm_reports;
+	int sysctl_igmp_qrv;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2e22ee0efc98..7c95335bf85e 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -762,9 +762,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
 
 static void igmp_ifc_event(struct in_device *in_dev)
 {
+	struct net *net = dev_net(in_dev->dev);
 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
 		return;
-	in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	igmp_ifc_start_timer(in_dev, 1);
 }
 
@@ -1086,6 +1087,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 {
 	struct ip_mc_list *pmc;
+	struct net *net = dev_net(in_dev->dev);
 
 	/* this is an "ip_mc_list" for convenience; only the fields below
 	 * are actually used. In particular, the refcnt and users are not
@@ -1100,7 +1102,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	pmc->interface = im->interface;
 	in_dev_hold(in_dev);
 	pmc->multiaddr = im->multiaddr;
-	pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	pmc->sfmode = im->sfmode;
 	if (pmc->sfmode == MCAST_INCLUDE) {
 		struct ip_sf_list *psf;
@@ -1245,7 +1247,7 @@ static void igmp_group_added(struct ip_mc_list *im)
 	}
 	/* else, v3 */
 
-	im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	igmp_ifc_event(in_dev);
 #endif
 }
@@ -1314,6 +1316,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 {
 	struct ip_mc_list *im;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
@@ -1340,7 +1343,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
 	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
-	im->unsolicit_count = sysctl_igmp_qrv;
+	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
 #endif
 
 	im->next_rcu = in_dev->mc_list;
@@ -1640,6 +1643,7 @@ void ip_mc_down(struct in_device *in_dev)
 
 void ip_mc_init_dev(struct in_device *in_dev)
 {
+	struct net *net = dev_net(in_dev->dev);
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
@@ -1647,7 +1651,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
 			(unsigned long)in_dev);
 	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
 			(unsigned long)in_dev);
-	in_dev->mr_qrv = sysctl_igmp_qrv;
+	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 
 	spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1658,11 +1662,12 @@ void ip_mc_init_dev(struct in_device *in_dev)
 void ip_mc_up(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
-	in_dev->mr_qrv = sysctl_igmp_qrv;
+	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
@@ -1728,9 +1733,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-#ifdef CONFIG_IP_MULTICAST
-int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
-#endif
 
 static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 	__be32 *psfsrc)
@@ -1755,6 +1757,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
 #ifdef CONFIG_IP_MULTICAST
 		struct in_device *in_dev = pmc->interface;
+		struct net *net = dev_net(in_dev->dev);
 #endif
 
 		/* no more filters for this source */
@@ -1765,7 +1768,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 #ifdef CONFIG_IP_MULTICAST
 		if (psf->sf_oldin &&
 		    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
-			psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+			psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 			psf->sf_next = pmc->tomb;
 			pmc->tomb = psf;
 			rv = 1;
@@ -1823,12 +1826,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 	    pmc->sfcount[MCAST_INCLUDE]) {
 #ifdef CONFIG_IP_MULTICAST
 		struct ip_sf_list *psf;
+		struct net *net = dev_net(in_dev->dev);
 #endif
 
 		/* filter mode change */
 		pmc->sfmode = MCAST_INCLUDE;
 #ifdef CONFIG_IP_MULTICAST
-		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+		pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		in_dev->mr_ifc_count = pmc->crcount;
 		for (psf = pmc->sources; psf; psf = psf->sf_next)
 			psf->sf_crcount = 0;
@@ -1995,6 +1999,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
 #ifdef CONFIG_IP_MULTICAST
 		struct ip_sf_list *psf;
+		struct net *net = dev_net(pmc->interface->dev);
 		in_dev = pmc->interface;
 #endif
 
@@ -2006,7 +2011,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 #ifdef CONFIG_IP_MULTICAST
 		/* else no filters; keep old mode for reports */
 
-		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+		pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		in_dev->mr_ifc_count = pmc->crcount;
 		for (psf = pmc->sources; psf; psf = psf->sf_next)
 			psf->sf_crcount = 0;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fc40fa1303d3..b537338f5c97 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,16 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-#ifdef CONFIG_IP_MULTICAST
-	{
-		.procname	= "igmp_qrv",
-		.data		= &sysctl_igmp_qrv,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
-	},
-#endif
 	{
 		.procname	= "inet_peer_threshold",
 		.data		= &inet_peer_threshold,
@@ -871,6 +861,16 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_IP_MULTICAST
+	{
+		.procname	= "igmp_qrv",
+		.data		= &init_net.ipv4.sysctl_igmp_qrv,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one
+	},
+#endif
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6c3c1d5232c6..ba5d0146e3f0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2403,6 +2403,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_igmp_max_msf = 10;
 	/* IGMP reports for link-local multicast groups are enabled by default */
 	net->ipv4.sysctl_igmp_llm_reports = 1;
+	net->ipv4.sysctl_igmp_qrv = 2;
 
 	return 0;
 fail:
-- 
cgit v1.2.3


From e02564ee334a7ae46b71fc18576391cb9455433e Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sun, 7 Feb 2016 21:52:23 +0100
Subject: ethtool: make validate_speed accept all speeds between 0 and INT_MAX

Devices these days can have any speed and as was recently pointed out
any speed from 0 to INT_MAX is valid so adjust speed validation to
accept such values.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 4345f80a2e33..190aea0faaf4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1377,24 +1377,7 @@ enum ethtool_sfeatures_retval_bits {
 
 static inline int ethtool_validate_speed(__u32 speed)
 {
-	switch (speed) {
-	case SPEED_10:
-	case SPEED_100:
-	case SPEED_1000:
-	case SPEED_2500:
-	case SPEED_5000:
-	case SPEED_10000:
-	case SPEED_20000:
-	case SPEED_25000:
-	case SPEED_40000:
-	case SPEED_50000:
-	case SPEED_56000:
-	case SPEED_100000:
-	case SPEED_UNKNOWN:
-		return 1;
-	}
-
-	return 0;
+	return speed <= INT_MAX || speed == SPEED_UNKNOWN;
 }
 
 /* Duplex, half or full. */
-- 
cgit v1.2.3


From 795bb1c00dd338aa0d12f9a7f1f4776fb3160416 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 8 Feb 2016 13:14:59 +0100
Subject: net: bulk free infrastructure for NAPI context, use napi_consume_skb

Discovered that the network stack was hitting the kmem_cache/SLUB
slowpath when freeing SKBs.  Doing bulk free with kmem_cache_free_bulk
can speed up this slowpath.

NAPI context is a bit special; let's take advantage of that for bulk
freeing SKBs.

In NAPI context we are running in softirq, which gives us certain
protection.  A softirq can run on several CPUs at once.  BUT the
important part is a softirq will never preempt another softirq running
on the same CPU.  This gives us the opportunity to access per-cpu
variables in softirq context.

Extend napi_alloc_cache (which previously contained only a
page_frag_cache) to be a struct with a small array-based stack for
holding SKBs.  Introduce an SKB defer and flush API for accessing this.

Introduce napi_consume_skb() as replacement for e.g. dev_consume_skb_any()
when running in NAPI context.  A small trick to handle/detect if we
are called from netpoll is to see if budget is 0.  In that case, we
need to invoke dev_consume_skb_irq().

Joint work with Alexander Duyck.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 ++
 net/core/dev.c         |  1 +
 net/core/skbuff.c      | 83 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a8fc2220e8ce..b56c0103fa15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2404,6 +2404,9 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 {
 	return __napi_alloc_skb(napi, length, GFP_ATOMIC);
 }
+void napi_consume_skb(struct sk_buff *skb, int budget);
+
+void __kfree_skb_flush(void);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index f1284835b8c9..9b2c7a999e71 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5155,6 +5155,7 @@ static void net_rx_action(struct softirq_action *h)
 		}
 	}
 
+	__kfree_skb_flush();
 	local_irq_disable();
 
 	list_splice_tail_init(&sd->poll_list, &list);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b0cce744e2a0..b64187b87773 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,8 +347,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
+#define NAPI_SKB_CACHE_SIZE	64
+
+struct napi_alloc_cache {
+	struct page_frag_cache page;
+	size_t skb_count;
+	void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -378,9 +386,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
-	return __alloc_page_frag(nc, fragsz, gfp_mask);
+	return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -474,7 +482,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 				 gfp_t gfp_mask)
 {
-	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	struct sk_buff *skb;
 	void *data;
 
@@ -494,7 +502,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	data = __alloc_page_frag(nc, len, gfp_mask);
+	data = __alloc_page_frag(&nc->page, len, gfp_mask);
 	if (unlikely(!data))
 		return NULL;
 
@@ -505,7 +513,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	/* use OR instead of assignment to avoid clearing of bits in mask */
-	if (nc->pfmemalloc)
+	if (nc->page.pfmemalloc)
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
@@ -747,6 +755,69 @@ void consume_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(consume_skb);
 
+void __kfree_skb_flush(void)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	/* flush skb_cache if containing objects */
+	if (nc->skb_count) {
+		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+				     nc->skb_cache);
+		nc->skb_count = 0;
+	}
+}
+
+static void __kfree_skb_defer(struct sk_buff *skb)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	/* drop skb->head and call any destructors for packet */
+	skb_release_all(skb);
+
+	/* record skb to CPU local list */
+	nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+	/* SLUB writes into objects when freeing */
+	prefetchw(skb);
+#endif
+
+	/* flush skb_cache if it is filled */
+	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+				     nc->skb_cache);
+		nc->skb_count = 0;
+	}
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+	if (unlikely(!skb))
+		return;
+
+	/* if budget is 0 assume netpoll w/ IRQs disabled */
+	if (unlikely(!budget)) {
+		dev_consume_skb_irq(skb);
+		return;
+	}
+
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+	/* if reaching here SKB is ready to free */
+	trace_consume_skb(skb);
+
+	/* if SKB is a clone, don't handle this case */
+	if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+		__kfree_skb(skb);
+		return;
+	}
+
+	__kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
 /* Make sure a field is enclosed inside headers_start/headers_end section */
 #define CHECK_SKB_FIELD(field) \
 	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
-- 
cgit v1.2.3


From 15fad714be86eab13e7568fecaf475b2a9730d3e Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 8 Feb 2016 13:15:04 +0100
Subject: net: bulk free SKBs that were delay free'ed due to IRQ context

The network stack defers freeing of SKBs in case the free happens in
IRQ context or while IRQs are disabled. This happens in
__dev_kfree_skb_irq(), which writes SKBs that were freed during IRQ to
the softirq completion queue (softnet_data.completion_queue).

These SKBs are naturally delayed, and cleaned up during NET_TX_SOFTIRQ
in function net_tx_action().  Take advantage of this and use the skb
defer and flush API, as we are already in softirq context.

For modern drivers this rarely happens, although most drivers do call
dev_kfree_skb_any(), which detects the situation and calls
__dev_kfree_skb_irq() when needed.  This is because netpoll can call
from IRQ context.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 net/core/dev.c         | 8 +++++++-
 net/core/skbuff.c      | 8 ++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b56c0103fa15..6ec86f1a2ed9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2407,6 +2407,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void __kfree_skb_flush(void);
+void __kfree_skb_defer(struct sk_buff *skb);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 9b2c7a999e71..3f4071a84a03 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h)
 				trace_consume_skb(skb);
 			else
 				trace_kfree_skb(skb, net_tx_action);
-			__kfree_skb(skb);
+
+			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+				__kfree_skb(skb);
+			else
+				__kfree_skb_defer(skb);
 		}
+
+		__kfree_skb_flush();
 	}
 
 	if (sd->output_queue) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b64187b87773..a5bd067ec1a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -767,7 +767,7 @@ void __kfree_skb_flush(void)
 	}
 }
 
-static void __kfree_skb_defer(struct sk_buff *skb)
+static inline void _kfree_skb_defer(struct sk_buff *skb)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
@@ -789,6 +789,10 @@ static void __kfree_skb_defer(struct sk_buff *skb)
 		nc->skb_count = 0;
 	}
 }
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+	_kfree_skb_defer(skb);
+}
 
 void napi_consume_skb(struct sk_buff *skb, int budget)
 {
@@ -814,7 +818,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	__kfree_skb_defer(skb);
+	_kfree_skb_defer(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
 
-- 
cgit v1.2.3


From 179bc67f69b6cb53ad68cfdec5a917c2a2248355 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 20:48:04 +0000
Subject: net: local checksum offload for encapsulation

The arithmetic properties of the ones-complement checksum mean that a
 correctly checksummed inner packet, including its checksum, has a ones
 complement sum depending only on whatever value was used to initialise
 the checksum field before checksumming (in the case of TCP and UDP,
 this is the ones complement sum of the pseudo header, complemented).
Consequently, if we are going to offload the inner checksum with
 CHECKSUM_PARTIAL, we can compute the outer checksum based only on the
 packet data not covered by the inner checksum, and the initial value of
 the inner checksum field.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 24 ++++++++++++++++++++++++
 net/ipv4/ip_tunnel_core.c | 10 +++++-----
 net/ipv4/udp.c            | 20 ++++++++++----------
 net/ipv6/ip6_checksum.c   | 14 +++++++-------
 4 files changed, 46 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6ec86f1a2ed9..cf906d1ce8a7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3702,5 +3702,29 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 	return hdr_len + skb_gso_transport_seglen(skb);
 }
 
+/* Local Checksum Offload.
+ * Compute outer checksum based on the assumption that the
+ * inner checksum will be offloaded later.
+ * Fill in outer checksum adjustment (e.g. with sum of outer
+ * pseudo-header) before calling.
+ * Also ensure that inner checksum is in linear data area.
+ */
+static inline __wsum lco_csum(struct sk_buff *skb)
+{
+	char *inner_csum_field;
+	__wsum csum;
+
+	/* Start with complement of inner checksum adjustment */
+	inner_csum_field = skb->data + skb_checksum_start_offset(skb) +
+				skb->csum_offset;
+	csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field);
+	/* Add in checksum of our headers (incl. outer checksum
+	 * adjustment filled in by caller)
+	 */
+	csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum);
+	/* The result is the checksum from skb->data to end of packet */
+	return csum;
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 859d415c0b2d..d74ce93de1fe 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -166,20 +166,20 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
 		return skb;
 	}
 
-	/* If packet is not gso and we are resolving any partial checksum,
+	/* If packet is not gso and we are not offloading inner checksum,
 	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
 	 * on the outer header without confusing devices that implement
 	 * NETIF_F_IP_CSUM with encapsulation.
 	 */
-	if (csum_help)
-		skb->encapsulation = 0;
-
 	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+		skb->encapsulation = 0;
 		err = skb_checksum_help(skb);
 		if (unlikely(err))
 			goto error;
-	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
+	} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		skb->ip_summed = CHECKSUM_NONE;
+		skb->encapsulation = 0;
+	}
 
 	return skb;
 error:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ac3cedb25a9f..a59341cf483e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -848,16 +848,18 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 {
 	struct udphdr *uh = udp_hdr(skb);
 
-	if (nocheck)
+	if (nocheck) {
 		uh->check = 0;
-	else if (skb_is_gso(skb))
+	} else if (skb_is_gso(skb)) {
 		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
-	else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		 (skb_dst(skb)->dev->features &
-		  (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		uh->check = 0;
+		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
+		   (skb_dst(skb)->dev->features &
+		    (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
@@ -865,8 +867,6 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 	} else {
 		__wsum csum;
 
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
 		uh->check = 0;
 		csum = skb_checksum(skb, 0, len, 0);
 		uh->check = udp_v4_check(len, saddr, daddr, csum);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 9a4d7322fb22..4924bd704e89 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -98,11 +98,13 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
 		uh->check = 0;
 	else if (skb_is_gso(skb))
 		uh->check = ~udp_v6_check(len, saddr, daddr, 0);
-	else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		 (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
-
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+	else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		uh->check = 0;
+		uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
+		   (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
@@ -110,8 +112,6 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
 	} else {
 		__wsum csum;
 
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
 		uh->check = 0;
 		csum = skb_checksum(skb, 0, len, 0);
 		uh->check = udp_v6_check(len, saddr, daddr, csum);
-- 
cgit v1.2.3


From 21e2e7f9b5fefdbf94a107a9b24d74baa5148ef3 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 20:50:44 +0000
Subject: net: enable LCO for udp_tunnel_handle_offloads() users

The only protocol affected at present is Geneve.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp_tunnel.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index cca2ad3082c3..734c15662ea9 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -103,7 +103,8 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
 {
 	int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
-	return iptunnel_handle_offloads(skb, udp_csum, type);
+	/* As we're a UDP tunnel, we support LCO, so don't need csum_help */
+	return iptunnel_handle_offloads(skb, false, type);
 }
 
 static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff)
-- 
cgit v1.2.3


From 6fa79666e24d32be1b709f5269af41ed9e829e7e Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 21:02:31 +0000
Subject: net: ip_tunnel: remove 'csum_help' argument to
 iptunnel_handle_offloads

All users now pass false, so we can remove it, and remove the code that
 was conditional upon it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c             |  2 +-
 include/net/ip_tunnels.h        |  3 +--
 include/net/udp_tunnel.h        |  3 +--
 net/ipv4/fou.c                  |  4 ++--
 net/ipv4/ip_gre.c               |  3 +--
 net/ipv4/ip_tunnel_core.c       | 18 ++++++------------
 net/ipv4/ipip.c                 |  2 +-
 net/ipv6/sit.c                  |  4 ++--
 net/netfilter/ipvs/ip_vs_xmit.c |  6 ++----
 9 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 9f52203ac860..0a23c64379d6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1721,7 +1721,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 	if (WARN_ON(!skb))
 		return -ENOMEM;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6db96ea0144f..bc439f32baa9 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -279,8 +279,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 					     gfp_t flags);
 
-struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum,
-					 int gso_type_mask);
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
 
 static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 {
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 734c15662ea9..97f5adb121a6 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -103,8 +103,7 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
 {
 	int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
-	/* As we're a UDP tunnel, we support LCO, so don't need csum_help */
-	return iptunnel_handle_offloads(skb, false, type);
+	return iptunnel_handle_offloads(skb, type);
 }
 
 static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index dac1874a5911..88dab0c1670c 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -787,7 +787,7 @@ int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 						       SKB_GSO_UDP_TUNNEL;
 	__be16 sport;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
@@ -820,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 
 	optlen += need_priv ? GUE_LEN_PRIV : 0;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9b31532d95f4..65748db44285 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -503,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
 					   bool csum)
 {
-	return iptunnel_handle_offloads(skb, false,
-					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
 static struct rtable *gre_get_rt(struct sk_buff *skb,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d74ce93de1fe..a6e58b6141cd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -148,7 +148,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 
 struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
-					 bool csum_help,
 					 int gso_type_mask)
 {
 	int err;
@@ -166,18 +165,13 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
 		return skb;
 	}
 
-	/* If packet is not gso and we are not offloading inner checksum,
-	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
-	 * on the outer header without confusing devices that implement
-	 * NETIF_F_IP_CSUM with encapsulation.
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
-		skb->encapsulation = 0;
-		err = skb_checksum_help(skb);
-		if (unlikely(err))
-			goto error;
-	} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		skb->ip_summed = CHECKSUM_NONE;
+		/* We clear encapsulation here to prevent badly-written
+		 * drivers potentially deciding to offload an inner checksum
+		 * if we set CHECKSUM_PARTIAL on the outer header.
+		 * This should go away when the drivers are all fixed.
+		 */
 		skb->encapsulation = 0;
 	}
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4044da61e747..6ec5b42fd172 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
 	if (IS_ERR(skb))
 		goto out;
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 2066d1c25a11..9a6b407f5840 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -911,7 +911,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		goto tx_error;
 	}
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT);
 	if (IS_ERR(skb)) {
 		ip_rt_put(rt);
 		goto out;
@@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
 	if (IS_ERR(skb))
 		goto out;
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3264cb49b333..a3f5cd9b3c4c 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -1019,8 +1019,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (IS_ERR(skb))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(
-		skb, false, __tun_gso_type_mask(AF_INET, cp->af));
+	skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af));
 	if (IS_ERR(skb))
 		goto tx_error;
 
@@ -1112,8 +1111,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (IS_ERR(skb))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(
-		skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
+	skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af));
 	if (IS_ERR(skb))
 		goto tx_error;
 
-- 
cgit v1.2.3


From e8ae7b000e64cf76283c72cae5e3ecd246618ef4 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 21:03:37 +0000
Subject: Documentation/networking: add checksum-offloads.txt to explain LCO

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/00-INDEX              |   2 +
 Documentation/networking/checksum-offloads.txt | 119 +++++++++++++++++++++++++
 include/linux/skbuff.h                         |   2 +
 3 files changed, 123 insertions(+)
 create mode 100644 Documentation/networking/checksum-offloads.txt

(limited to 'include')

diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index df27a1a50776..415154a487d0 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -44,6 +44,8 @@ can.txt
 	- documentation on CAN protocol family.
 cdc_mbim.txt
 	- 3G/LTE USB modem (Mobile Broadband Interface Model)
+checksum-offloads.txt
+	- Explanation of checksum offloads; LCO, RCO
 cops.txt
 	- info on the COPS LocalTalk Linux driver
 cs89x0.txt
diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt
new file mode 100644
index 000000000000..de2a327766a7
--- /dev/null
+++ b/Documentation/networking/checksum-offloads.txt
@@ -0,0 +1,119 @@
+Checksum Offloads in the Linux Networking Stack
+
+
+Introduction
+============
+
+This document describes a set of techniques in the Linux networking stack
+ to take advantage of checksum offload capabilities of various NICs.
+
+The following technologies are described:
+ * TX Checksum Offload
+ * LCO: Local Checksum Offload
+ * RCO: Remote Checksum Offload
+
+Things that should be documented here but aren't yet:
+ * RX Checksum Offload
+ * CHECKSUM_UNNECESSARY conversion
+
+
+TX Checksum Offload
+===================
+
+The interface for offloading a transmit checksum to a device is explained
+ in detail in comments near the top of include/linux/skbuff.h.
+In brief, it allows requesting that the device fill in a single ones-complement
+ checksum defined by the sk_buff fields skb->csum_start and
+ skb->csum_offset.  The device should compute the 16-bit ones-complement
+ checksum (i.e. the 'IP-style' checksum) from csum_start to the end of the
+ packet, and fill in the result at (csum_start + csum_offset).
+Because csum_offset cannot be negative, this ensures that the previous
+ value of the checksum field is included in the checksum computation, thus
+ it can be used to supply any needed corrections to the checksum (such as
+ the sum of the pseudo-header for UDP or TCP).
+This interface only allows a single checksum to be offloaded.  Where
+ encapsulation is used, the packet may have multiple checksum fields in
+ different header layers, and the rest will have to be handled by another
+ mechanism such as LCO or RCO.
+No offloading of the IP header checksum is performed; it is always done in
+ software.  This is OK because when we build the IP header, we obviously
+ have it in cache, so summing it isn't expensive.  It's also rather short.
+The requirements for GSO are more complicated, because when segmenting an
+ encapsulated packet both the inner and outer checksums may need to be
+ edited or recomputed for each resulting segment.  See the skbuff.h comment
+ (section 'E') for more details.
+
+A driver declares its offload capabilities in netdev->hw_features; see
+ Documentation/networking/netdev-features.txt for more.  Note that a device
+ which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start
+ and csum_offset given in the SKB; if it tries to deduce these itself in
+ hardware (as some NICs do) the driver should check that the values in the
+ SKB match those which the hardware will deduce, and if not, fall back to
+ checksumming in software instead (with skb_checksum_help or one of the
+ skb_csum_off_chk* functions as mentioned in include/linux/skbuff.h).  This
+ is a pain, but that's what you get when hardware tries to be clever.
+
+The stack should, for the most part, assume that checksum offload is
+ supported by the underlying device.  The only place that should check is
+ validate_xmit_skb(), and the functions it calls directly or indirectly.
+ That function compares the offload features requested by the SKB (which
+ may include other offloads besides TX Checksum Offload) and, if they are
+ not supported or enabled on the device (determined by netdev->features),
+ performs the corresponding offload in software.  In the case of TX
+ Checksum Offload, that means calling skb_checksum_help(skb).
+
+
+LCO: Local Checksum Offload
+===========================
+
+LCO is a technique for efficiently computing the outer checksum of an
+ encapsulated datagram when the inner checksum is due to be offloaded.
+The ones-complement sum of a correctly checksummed TCP or UDP packet is
+ equal to the sum of the pseudo header, because everything else gets
+ 'cancelled out' by the checksum field.  This is because the sum was
+ complemented before being written to the checksum field.
+More generally, this holds in any case where the 'IP-style' ones complement
+ checksum is used, and thus any checksum that TX Checksum Offload supports.
+That is, if we have set up TX Checksum Offload with a start/offset pair, we
+ know that _after the device has filled in that checksum_, the ones
+ complement sum from csum_start to the end of the packet will be equal to
+ _whatever value we put in the checksum field beforehand_.  This allows us
+ to compute the outer checksum without looking at the payload: we simply
+ stop summing when we get to csum_start, then add the 16-bit word at
+ (csum_start + csum_offset).
+Then, when the true inner checksum is filled in (either by hardware or by
+ skb_checksum_help()), the outer checksum will become correct by virtue of
+ the arithmetic.
+
+LCO is performed by the stack when constructing an outer UDP header for an
+ encapsulation such as VXLAN or GENEVE, in udp_set_csum().  Similarly for
+ the IPv6 equivalents, in udp6_set_csum().
+It is also performed when constructing an IPv4 GRE header, in
+ net/ipv4/ip_gre.c:build_header().  It is *not* currently performed when
+ constructing an IPv6 GRE header; the GRE checksum is computed over the
+ whole packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be
+ possible to use LCO here as IPv6 GRE still uses an IP-style checksum.
+All of the LCO implementations use a helper function lco_csum(), in
+ include/linux/skbuff.h.
+
+LCO can safely be used for nested encapsulations; in this case, the outer
+ encapsulation layer will sum over both its own header and the 'middle'
+ header.  This does mean that the 'middle' header will get summed multiple
+ times, but there doesn't seem to be a way to avoid that without incurring
+ bigger costs (e.g. in SKB bloat).
+
+
+RCO: Remote Checksum Offload
+============================
+
+RCO is a technique for eliding the inner checksum of an encapsulated
+ datagram, allowing the outer checksum to be offloaded.  It does, however,
+ involve a change to the encapsulation protocols, which the receiver must
+ also support.  For this reason, it is disabled by default.
+RCO is detailed in the following Internet-Drafts:
+https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00
+https://tools.ietf.org/html/draft-herbert-vxlan-rco-00
+In Linux, RCO is implemented individually in each encapsulation protocol,
+ and most tunnel types have flags controlling its use.  For instance, VXLAN
+ has the flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that
+ RCO should be used when transmitting to a given remote destination.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf906d1ce8a7..39206751463e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3705,6 +3705,8 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 /* Local Checksum Offload.
  * Compute outer checksum based on the assumption that the
  * inner checksum will be offloaded later.
+ * See Documentation/networking/checksum-offloads.txt for
+ * explanation of how this works.
  * Fill in outer checksum adjustment (e.g. with sum of outer
  * pseudo-header) before calling.
  * Also ensure that inner checksum is in linear data area.
-- 
cgit v1.2.3


From d4ab4286276fcd6c155bafdf4422b712068d2516 Mon Sep 17 00:00:00 2001
From: "Keller, Jacob E" <jacob.e.keller@intel.com>
Date: Mon, 8 Feb 2016 16:05:03 -0800
Subject: ethtool: correctly ensure {GS}CHANNELS doesn't conflict with GS{RXFH}

Ethernet drivers implementing both {GS}RXFH and {GS}CHANNELS ethtool ops
incorrectly allow SCHANNELS when it would conflict with the settings
from SRXFH. This occurs because it is not possible for drivers to
understand whether their Rx flow indirection table has been configured
or is in the default state. In addition, drivers currently behave in
various ways when increasing the number of Rx channels.

Some drivers will always destroy the Rx flow indirection table when this
occurs, whether it has been set by the user or not. Other drivers will
attempt to preserve the table even if the user has never modified it
from the default driver settings. Neither of these situations is
desirable because it leads to unexpected behavior or loss of user
configuration.

The correct behavior is to simply return -EINVAL when SCHANNELS would
conflict with the current Rx flow table settings. However, it should
only do so if the current settings were modified by the user. If we
required that the new settings never conflict with the current (default)
Rx flow settings, we would force users to first reduce their Rx flow
settings and then reduce the number of Rx channels.

This patch proposes a solution implemented in net/core/ethtool.c which
ensures that all drivers behave correctly. It checks whether the RXFH
table has been configured to non-default settings, and stores this
information in a private netdev flag. When the number of channels is
requested to change, it first ensures that the current Rx flow table is
not going to assign flows to now disabled channels.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  8 +++++++
 net/core/ethtool.c        | 55 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 219f53c30cb3..0499569c256d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1291,6 +1291,7 @@ struct net_device_ops {
  * @IFF_OPENVSWITCH: device is a Open vSwitch master
  * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
  * @IFF_TEAM: device is a team device
+ * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1318,6 +1319,7 @@ enum netdev_priv_flags {
 	IFF_OPENVSWITCH			= 1<<22,
 	IFF_L3MDEV_SLAVE		= 1<<23,
 	IFF_TEAM			= 1<<24,
+	IFF_RXFH_CONFIGURED		= 1<<25,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1345,6 +1347,7 @@ enum netdev_priv_flags {
 #define IFF_OPENVSWITCH			IFF_OPENVSWITCH
 #define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
 #define IFF_TEAM			IFF_TEAM
+#define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -4048,6 +4051,11 @@ static inline bool netif_is_lag_port(const struct net_device *dev)
 	return netif_is_bond_slave(dev) || netif_is_team_port(dev);
 }
 
+static inline bool netif_is_rxfh_configured(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_RXFH_CONFIGURED; /* see net/core/ethtool.c */
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 453c803f1c87..379bdc59b1c8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -642,6 +642,37 @@ void netdev_rss_key_fill(void *buffer, size_t len)
 }
 EXPORT_SYMBOL(netdev_rss_key_fill);
 
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+	u32 dev_size, current_max = 0;
+	u32 *indir;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir_size ||
+	    !dev->ethtool_ops->get_rxfh)
+		return -EOPNOTSUPP;
+	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+	if (dev_size == 0)
+		return -EOPNOTSUPP;	/* driver reports an empty table */
+
+	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+	if (ret)
+		goto out;	/* *max is left untouched on failure */
+
+	while (dev_size--)	/* visit every entry, last to first */
+		current_max = max(current_max, indir[dev_size]);
+
+	*max = current_max;	/* largest value stored in the table */
+
+out:
+	kfree(indir);
+	return ret;
+}
+
 static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 						     void __user *useraddr)
 {
@@ -738,6 +769,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 	}
 
 	ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+	if (ret)
+		goto out;
+
+	/* indicate whether rxfh was set to default */
+	if (user_size == 0)
+		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+	else
+		dev->priv_flags |= IFF_RXFH_CONFIGURED;
 
 out:
 	kfree(indir);
@@ -897,6 +936,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	}
 
 	ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+	if (ret)
+		goto out;
+
+	/* indicate whether rxfh was set to default */
+	if (rxfh.indir_size == 0)
+		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+	else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+		dev->priv_flags |= IFF_RXFH_CONFIGURED;
 
 out:
 	kfree(rss_config);
@@ -1228,6 +1275,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
 	struct ethtool_channels channels;
+	u32 max_rx_in_use = 0;
 
 	if (!dev->ethtool_ops->set_channels)
 		return -EOPNOTSUPP;
@@ -1235,6 +1283,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	if (copy_from_user(&channels, useraddr, sizeof(channels)))
 		return -EFAULT;
 
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table settings */
+	if (netif_is_rxfh_configured(dev) &&
+	    !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+	    (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+	    return -EINVAL;
+
 	return dev->ethtool_ops->set_channels(dev, &channels);
 }
 
-- 
cgit v1.2.3


From 911362c70df5b766c243dc297fadeaced786ffd8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:53 +0100
Subject: net: add dst_cache support

This patch adds a generic, lockless dst cache implementation.
The need for a lock is avoided by updating the dst cache fields
only in per-cpu scope, and by requiring that the cache manipulation
functions are invoked with local BH disabled.

The refresh_ts and reset_ts fields are used to ensure cache
consistency in case of concurrent cache update (dst_cache_set*) and
reset (dst_cache_reset) operations.

Consider the following scenario:

CPU1:                                   	CPU2:
  <cache lookup with empty cache: it fails>
  <get dst via uncached route lookup>
						<related configuration changes>
                                        	dst_cache_reset()
  dst_cache_set()

The dst entry passed to dst_cache_set() should not be used
for later dst cache lookups, because it was obtained using old
configuration values.

Since the refresh_ts is updated only on dst_cache lookup, the
cached value in the above scenario will be discarded on the next
lookup.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_cache.h |  97 ++++++++++++++++++++++++++++
 net/Kconfig             |   4 ++
 net/core/Makefile       |   1 +
 net/core/dst_cache.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 270 insertions(+)
 create mode 100644 include/net/dst_cache.h
 create mode 100644 net/core/dst_cache.c

(limited to 'include')

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
new file mode 100644
index 000000000000..151accae708b
--- /dev/null
+++ b/include/net/dst_cache.h
@@ -0,0 +1,97 @@
+#ifndef _NET_DST_CACHE_H
+#define _NET_DST_CACHE_H
+
+#include <linux/jiffies.h>
+#include <net/dst.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+
+struct dst_cache {
+	struct dst_cache_pcpu __percpu *cache;	/* per-cpu entries */
+	unsigned long reset_ts;			/* jiffies at last reset */
+};
+
+/**
+ *	dst_cache_get - perform cache lookup
+ *	@dst_cache: the cache
+ *
+ *	The caller should use dst_cache_get_ip4() if it needs to retrieve the
+ *	source address to be used when xmitting to the cached dst.
+ *	local BH must be disabled.
+ */
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache);
+
+/**
+ *	dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address
+ *	@dst_cache: the cache
+ *	@saddr: return value for the retrieved source address
+ *
+ *	local BH must be disabled.
+ */
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr);
+
+/**
+ *	dst_cache_set_ip4 - store the ipv4 dst into the cache
+ *	@dst_cache: the cache
+ *	@dst: the entry to be cached
+ *	@saddr: the source address to be stored inside the cache
+ *
+ *	local BH must be disabled.
+ */
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       __be32 saddr);
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+/**
+ *	dst_cache_set_ip6 - store the ipv6 dst into the cache
+ *	@dst_cache: the cache
+ *	@dst: the entry to be cached
+ *	@addr: the source address to be stored inside the cache
+ *
+ *	local BH must be disabled.
+ */
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       const struct in6_addr *addr);
+
+/**
+ *	dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address
+ *	@dst_cache: the cache
+ *	@saddr: return value for the retrieved source address
+ *
+ *	local BH must be disabled.
+ */
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+				    struct in6_addr *saddr);
+#endif
+
+/**
+ *	dst_cache_reset - invalidate the cache contents
+ *	@dst_cache: the cache
+ *
+ *	This does not free the cached dst, to avoid races and contention.
+ *	The dst will be freed on a later cache lookup.
+ */
+static inline void dst_cache_reset(struct dst_cache *dst_cache)
+{
+	dst_cache->reset_ts = jiffies;
+}
+
+/**
+ *	dst_cache_init - initialize the cache, allocating the required storage
+ *	@dst_cache: the cache
+ *	@gfp: allocation flags
+ */
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
+
+/**
+ *	dst_cache_destroy - empty the cache and free the allocated storage
+ *	@dst_cache: the cache
+ *
+ *	No synchronization is enforced: it must be called only when the cache
+ *	is unused.
+ */
+void dst_cache_destroy(struct dst_cache *dst_cache);
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index 174354618f8a..b80efecfc1a0 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -392,6 +392,10 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config DST_CACHE
+	bool "dst cache"
+	default n
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 0b835de04de3..7a8fb8aef992 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000000..3938f3f38d69
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+	unsigned long refresh_ts;	/* jiffies of the last failed lookup */
+	struct dst_entry *dst;		/* cached entry; a reference is held */
+	u32 cookie;			/* handed to dst->ops->check() */
+	union {				/* source address saved with the dst */
+		struct in_addr in_saddr;
+		struct in6_addr in6_saddr;
+	};
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+				      struct dst_entry *dst, u32 cookie)
+{
+	dst_release(dst_cache->dst);	/* drop the ref held on the old entry */
+	if (dst)
+		dst_hold(dst);		/* the slot keeps its own reference */
+
+	dst_cache->cookie = cookie;
+	dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+					       struct dst_cache_pcpu *idst)
+{
+	struct dst_entry *dst;
+
+	dst = idst->dst;
+	if (!dst)
+		goto fail;
+
+	/* the cache already holds a dst reference; it can't go away */
+	dst_hold(dst);
+
+	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+		dst_cache_per_cpu_dst_set(idst, NULL, 0); /* empty the slot */
+		dst_release(dst);	/* undo the dst_hold() above */
+		goto fail;
+	}
+	return dst;
+
+fail:
+	idst->refresh_ts = jiffies;	/* record the time of this miss */
+	return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+	if (!dst_cache->cache)
+		return NULL;	/* no per-cpu storage to look in */
+
+	return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+	struct dst_cache_pcpu *idst;
+	struct dst_entry *dst;
+
+	if (!dst_cache->cache)
+		return NULL;	/* no per-cpu storage to look in */
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst = dst_cache_per_cpu_get(dst_cache, idst);
+	if (!dst)
+		return NULL;	/* miss: *saddr is left untouched */
+
+	*saddr = idst->in_saddr.s_addr;	/* written only on a hit */
+	return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       __be32 saddr)
+{
+	struct dst_cache_pcpu *idst;
+
+	if (!dst_cache->cache)
+		return;		/* no per-cpu storage: nothing is cached */
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(idst, dst, 0);	/* cookie is always 0 here */
+	idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       const struct in6_addr *addr)
+{
+	struct dst_cache_pcpu *idst;
+
+	if (!dst_cache->cache)
+		return;		/* no per-cpu storage: nothing is cached */
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(idst, dst,	/* reuse idst, as in the ip4 path */
+				  rt6_get_cookie((struct rt6_info *)dst));
+	idst->in6_saddr = *addr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+				    struct in6_addr *saddr)
+{
+	struct dst_cache_pcpu *idst;
+	struct dst_entry *dst;
+
+	if (!dst_cache->cache)
+		return NULL;	/* no per-cpu storage to look in */
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst = dst_cache_per_cpu_get(dst_cache, idst);
+	if (!dst)
+		return NULL;	/* miss: *saddr is left untouched */
+
+	*saddr = idst->in6_saddr;	/* written only on a hit */
+	return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+	dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+					    gfp | __GFP_ZERO);
+	if (!dst_cache->cache)
+		return -ENOMEM;
+
+	dst_cache_reset(dst_cache);	/* sets reset_ts to the current jiffies */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+	int i;
+
+	if (!dst_cache->cache)
+		return;		/* no per-cpu storage: nothing to free */
+
+	for_each_possible_cpu(i)	/* drop the ref held by each per-cpu slot */
+		dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+	free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
-- 
cgit v1.2.3


From 607f725f6f7d5ec3759fbc16224afb60e2152a5b Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:54 +0100
Subject: net: replace dst_cache ip6_tunnel implementation with the generic one

This also fixes a potential race in the existing tunnel code, which
could lead to the wrong dst being permanently cached:

CPU1:					CPU2:
  <xmit on ip6_tunnel>
  <cache lookup fails>
  dst = ip6_route_output(...)
					<tunnel params are changed via nl>
					dst_cache_reset() // no effect,
							// the cache is empty
  dst_cache_set() // the wrong dst
	// is permanently stored
	// into the cache

With the new dst implementation the above race is not possible
since the first cache lookup after dst_cache_reset will fail due
to the timestamp check

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |  14 +------
 net/ipv6/Kconfig         |   1 +
 net/ipv6/ip6_gre.c       |  12 +++---
 net/ipv6/ip6_tunnel.c    | 103 +++--------------------------------------------
 net/ipv6/ip6_vti.c       |   2 +-
 5 files changed, 16 insertions(+), 116 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 0d0ce0b2d870..499a707765ea 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -6,6 +6,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/ip6_tunnel.h>
 #include <net/ip_tunnels.h>
+#include <net/dst_cache.h>
 
 #define IP6TUNNEL_ERR_TIMEO (30*HZ)
 
@@ -33,12 +34,6 @@ struct __ip6_tnl_parm {
 	__be32			o_key;
 };
 
-struct ip6_tnl_dst {
-	seqlock_t lock;
-	struct dst_entry __rcu *dst;
-	u32 cookie;
-};
-
 /* IPv6 tunnel */
 struct ip6_tnl {
 	struct ip6_tnl __rcu *next;	/* next tunnel in list */
@@ -46,7 +41,7 @@ struct ip6_tnl {
 	struct net *net;	/* netns for packet i/o */
 	struct __ip6_tnl_parm parms;	/* tunnel configuration parameters */
 	struct flowi fl;	/* flowi template for xmit */
-	struct ip6_tnl_dst __percpu *dst_cache;	/* cached dst */
+	struct dst_cache dst_cache;	/* cached dst */
 
 	int err_count;
 	unsigned long err_time;
@@ -66,11 +61,6 @@ struct ipv6_tlv_tnl_enc_lim {
 	__u8 encap_limit;	/* tunnel encapsulation limit   */
 } __packed;
 
-struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t);
-int ip6_tnl_dst_init(struct ip6_tnl *t);
-void ip6_tnl_dst_destroy(struct ip6_tnl *t);
-void ip6_tnl_dst_reset(struct ip6_tnl *t);
-void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst);
 int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
 		const struct in6_addr *raddr);
 int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 40c897515ddc..11e875ffd7ac 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE
 config IPV6_TUNNEL
 	tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
 	select INET6_TUNNEL
+	select DST_CACHE
 	---help---
 	  Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
 	  RFC 2473.
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f37f18b6b40c..a94e50602813 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
 
 	ip6gre_tunnel_unlink(ign, t);
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
 }
 
@@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark)
-		dst = ip6_tnl_dst_get(tunnel);
+		dst = dst_cache_get(&tunnel->dst_cache);
 
 	if (!dst) {
 		dst = ip6_route_output(net, NULL, fl6);
@@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark && ndst)
-		ip6_tnl_dst_set(tunnel, ndst);
+		dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr);
 	skb_dst_set(skb, dst);
 
 	proto = NEXTHDR_GRE;
@@ -1009,7 +1009,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t,
 	t->parms.o_key = p->o_key;
 	t->parms.i_flags = p->i_flags;
 	t->parms.o_flags = p->o_flags;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	ip6gre_tnl_link_config(t, set_mtu);
 	return 0;
 }
@@ -1219,7 +1219,7 @@ static void ip6gre_dev_free(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 
-	ip6_tnl_dst_destroy(t);
+	dst_cache_destroy(&t->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1257,7 +1257,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	ret = ip6_tnl_dst_init(tunnel);
+	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
 	if (ret) {
 		free_percpu(dev->tstats);
 		dev->tstats = NULL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 137fca42aaa6..3f3aabd2f07b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
 	return &dev->stats;
 }
 
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
-				    struct dst_entry *dst)
-{
-	write_seqlock_bh(&idst->lock);
-	dst_release(rcu_dereference_protected(
-			    idst->dst,
-			    lockdep_is_held(&idst->lock.lock)));
-	if (dst) {
-		dst_hold(dst);
-		idst->cookie = rt6_get_cookie((struct rt6_info *)dst);
-	} else {
-		idst->cookie = 0;
-	}
-	rcu_assign_pointer(idst->dst, dst);
-	write_sequnlock_bh(&idst->lock);
-}
-
-struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t)
-{
-	struct ip6_tnl_dst *idst;
-	struct dst_entry *dst;
-	unsigned int seq;
-	u32 cookie;
-
-	idst = raw_cpu_ptr(t->dst_cache);
-
-	rcu_read_lock();
-	do {
-		seq = read_seqbegin(&idst->lock);
-		dst = rcu_dereference(idst->dst);
-		cookie = idst->cookie;
-	} while (read_seqretry(&idst->lock, seq));
-
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-		dst = NULL;
-	rcu_read_unlock();
-
-	if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) {
-		ip6_tnl_per_cpu_dst_set(idst, NULL);
-		dst_release(dst);
-		dst = NULL;
-	}
-	return dst;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);
-
-void ip6_tnl_dst_reset(struct ip6_tnl *t)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
-
-void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst)
-{
-	ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst);
-
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_set);
-
-void ip6_tnl_dst_destroy(struct ip6_tnl *t)
-{
-	if (!t->dst_cache)
-		return;
-
-	ip6_tnl_dst_reset(t);
-	free_percpu(t->dst_cache);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy);
-
-int ip6_tnl_dst_init(struct ip6_tnl *t)
-{
-	int i;
-
-	t->dst_cache = alloc_percpu(struct ip6_tnl_dst);
-	if (!t->dst_cache)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i)
-		seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
-
 /**
  * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
  *   @remote: the address of the tunnel exit-point
@@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 
-	ip6_tnl_dst_destroy(t);
+	dst_cache_destroy(&t->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev)
 		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
 	else
 		ip6_tnl_unlink(ip6n, t);
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
 }
 
@@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
 		neigh_release(neigh);
 	} else if (!fl6->flowi6_mark)
-		dst = ip6_tnl_dst_get(t);
+		dst = dst_cache_get(&t->dst_cache);
 
 	if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
 		goto tx_err_link_failure;
@@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark && ndst)
-		ip6_tnl_dst_set(t, ndst);
+		dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
 	skb_dst_set(skb, dst);
 
 	skb->transport_header = skb->network_header;
@@ -1366,7 +1275,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 	t->parms.flowinfo = p->flowinfo;
 	t->parms.link = p->link;
 	t->parms.proto = p->proto;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	ip6_tnl_link_config(t);
 	return 0;
 }
@@ -1637,7 +1546,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	ret = ip6_tnl_dst_init(t);
+	ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
 	if (ret) {
 		free_percpu(dev->tstats);
 		dev->tstats = NULL;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 0a8610b33d79..d90a11f14040 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 	t->parms.i_key = p->i_key;
 	t->parms.o_key = p->o_key;
 	t->parms.proto = p->proto;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	vti6_link_config(t);
 	return 0;
 }
-- 
cgit v1.2.3


From e09acddf873bf775b208b452a4c3a3fd26fa9427 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:55 +0100
Subject: ip_tunnel: replace dst_cache with generic implementation

The current ip_tunnel cache implementation is prone to a race
that will cause the wrong dst to be cached on concurrent dst cache
miss and ip tunnel update via netlink.

Replacing it with the generic implementation fixes the issue.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  9 ++----
 net/ipv4/Kconfig         |  1 +
 net/ipv4/ip_tunnel.c     | 78 ++++++++----------------------------------------
 net/ipv6/sit.c           | 17 ++++++-----
 4 files changed, 25 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index bc439f32baa9..fd36936d85a6 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -13,6 +13,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/lwtunnel.h>
+#include <net/dst_cache.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -85,11 +86,6 @@ struct ip_tunnel_prl_entry {
 	struct rcu_head			rcu_head;
 };
 
-struct ip_tunnel_dst {
-	struct dst_entry __rcu 		*dst;
-	__be32				 saddr;
-};
-
 struct metadata_dst;
 
 struct ip_tunnel {
@@ -108,7 +104,7 @@ struct ip_tunnel {
 	int		tun_hlen;	/* Precalculated header length */
 	int		mlink;
 
-	struct ip_tunnel_dst __percpu *dst_cache;
+	struct dst_cache dst_cache;
 
 	struct ip_tunnel_parm parms;
 
@@ -247,7 +243,6 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p);
 void ip_tunnel_setup(struct net_device *dev, int net_id);
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t);
 int ip_tunnel_encap_setup(struct ip_tunnel *t,
 			  struct ip_tunnel_encap *ipencap);
 
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 775824720b6b..395d82754626 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX
 
 config NET_IP_TUNNEL
 	tristate
+	select DST_CACHE
 	default n
 
 config NET_IPGRE
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c7bd72e9b544..4569da7dfa88 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
 			 IP_TNL_HASH_BITS);
 }
 
-static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
-			     struct dst_entry *dst, __be32 saddr)
-{
-	struct dst_entry *old_dst;
-
-	dst_clone(dst);
-	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
-	dst_release(old_dst);
-	idst->saddr = saddr;
-}
-
-static noinline void tunnel_dst_set(struct ip_tunnel *t,
-			   struct dst_entry *dst, __be32 saddr)
-{
-	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
-}
-
-static void tunnel_dst_reset(struct ip_tunnel *t)
-{
-	tunnel_dst_set(t, NULL, 0);
-}
-
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
-}
-EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
-					u32 cookie, __be32 *saddr)
-{
-	struct ip_tunnel_dst *idst;
-	struct dst_entry *dst;
-
-	rcu_read_lock();
-	idst = raw_cpu_ptr(t->dst_cache);
-	dst = rcu_dereference(idst->dst);
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-		dst = NULL;
-	if (dst) {
-		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
-			*saddr = idst->saddr;
-		} else {
-			tunnel_dst_reset(t);
-			dst_release(dst);
-			dst = NULL;
-		}
-	}
-	rcu_read_unlock();
-	return (struct rtable *)dst;
-}
-
 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
 				__be16 flags, __be32 key)
 {
@@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
-			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
 			ip_rt_put(rt);
 		}
 		if (dev->type != ARPHRD_ETHER)
@@ -729,7 +675,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 		goto tx_error;
 
-	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
+			 NULL;
 
 	if (!rt) {
 		rt = ip_route_output_key(tunnel->net, &fl4);
@@ -739,7 +686,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 		if (connected)
-			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
 	}
 
 	if (rt->dst.dev == dev) {
@@ -836,7 +784,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 		if (set_mtu)
 			dev->mtu = mtu;
 	}
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(dev);
 }
 
@@ -961,7 +909,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	gro_cells_destroy(&tunnel->gro_cells);
-	free_percpu(tunnel->dst_cache);
+	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1155,15 +1103,15 @@ int ip_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
-	if (!tunnel->dst_cache) {
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
 		free_percpu(dev->tstats);
-		return -ENOMEM;
+		return err;
 	}
 
 	err = gro_cells_init(&tunnel->gro_cells, dev);
 	if (err) {
-		free_percpu(tunnel->dst_cache);
+		dst_cache_destroy(&tunnel->dst_cache);
 		free_percpu(dev->tstats);
 		return err;
 	}
@@ -1193,7 +1141,7 @@ void ip_tunnel_uninit(struct net_device *dev)
 	if (itn->fb_tunnel_dev != dev)
 		ip_tunnel_del(itn, netdev_priv(dev));
 
-	ip_tunnel_dst_reset_all(tunnel);
+	dst_cache_reset(&tunnel->dst_cache);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 9a6b407f5840..0625ac6356b5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 		ipip6_tunnel_unlink(sitn, tunnel);
 		ipip6_tunnel_del_prl(tunnel, NULL);
 	}
-	ip_tunnel_dst_reset_all(tunnel);
+	dst_cache_reset(&tunnel->dst_cache);
 	dev_put(dev);
 }
 
@@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
 		t->parms.link = p->link;
 		ipip6_tunnel_bind_dev(t->dev);
 	}
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(t->dev);
 }
 
@@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
 	t->ip6rd.relay_prefix = relay_prefix;
 	t->ip6rd.prefixlen = ip6rd->prefixlen;
 	t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(t->dev);
 	return 0;
 }
@@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
 			break;
 		}
-		ip_tunnel_dst_reset_all(t);
+		dst_cache_reset(&t->dst_cache);
 		netdev_state_change(dev);
 		break;
 
@@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
-	free_percpu(tunnel->dst_cache);
+	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
 static int ipip6_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int err;
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
@@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
-	if (!tunnel->dst_cache) {
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
 		free_percpu(dev->tstats);
-		return -ENOMEM;
+		return err;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 0c1d70af924b966cc71e9e48920b2b635441aa50 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:56 +0100
Subject: net: use dst_cache for vxlan device

In case of UDP traffic with datagram length
below MTU this gives about 3% performance increase
when tunneling over ipv4 and about 70% when
tunneling over ipv6.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 55 +++++++++++++++++++++++++++++++++++++++++++++--------
 include/net/vxlan.h |  1 +
 2 files changed, 48 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0a23c64379d6..ad673037bd73 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -480,6 +480,8 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
 	if (!rd)
 		return 0;
+
+	dst_cache_reset(&rd->dst_cache);
 	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
@@ -501,6 +503,12 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
 		return -ENOBUFS;
+
+	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
+		kfree(rd);
+		return -ENOBUFS;
+	}
+
 	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
@@ -749,8 +757,10 @@ static void vxlan_fdb_free(struct rcu_head *head)
 	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
 	struct vxlan_rdst *rd, *nd;
 
-	list_for_each_entry_safe(rd, nd, &f->remotes, list)
+	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
+		dst_cache_destroy(&rd->dst_cache);
 		kfree(rd);
+	}
 	kfree(f);
 }
 
@@ -1754,11 +1764,24 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 
 static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 				      struct sk_buff *skb, int oif, u8 tos,
-				      __be32 daddr, __be32 *saddr)
+				      __be32 daddr, __be32 *saddr,
+				      struct dst_cache *dst_cache,
+				      struct ip_tunnel_info *info)
 {
 	struct rtable *rt = NULL;
+	bool use_cache = false;
 	struct flowi4 fl4;
 
+	/* when the ip_tunnel_info is availble, the tos used for lookup is
+	 * packet independent, so we can use the cache
+	 */
+	if (dst_cache && !skb->mark && (!tos || info)) {
+		use_cache = true;
+		rt = dst_cache_get_ip4(dst_cache, saddr);
+		if (rt)
+			return rt;
+	}
+
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.flowi4_oif = oif;
 	fl4.flowi4_tos = RT_TOS(tos);
@@ -1768,8 +1791,11 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 	fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;
 
 	rt = ip_route_output_key(vxlan->net, &fl4);
-	if (!IS_ERR(rt))
+	if (!IS_ERR(rt)) {
 		*saddr = fl4.saddr;
+		if (use_cache)
+			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+	}
 	return rt;
 }
 
@@ -1777,12 +1803,21 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif,
 					  const struct in6_addr *daddr,
-					  struct in6_addr *saddr)
+					  struct in6_addr *saddr,
+					  struct dst_cache *dst_cache)
 {
+	bool use_cache = false;
 	struct dst_entry *ndst;
 	struct flowi6 fl6;
 	int err;
 
+	if (dst_cache && !skb->mark) {
+		use_cache = true;
+		ndst = dst_cache_get_ip6(dst_cache, saddr);
+		if (ndst)
+			return ndst;
+	}
+
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_oif = oif;
 	fl6.daddr = *daddr;
@@ -1797,6 +1832,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 		return ERR_PTR(err);
 
 	*saddr = fl6.saddr;
+	if (use_cache)
+		dst_cache_set_ip6(dst_cache, ndst, saddr);
 	return ndst;
 }
 #endif
@@ -1938,7 +1975,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		rt = vxlan_get_route(vxlan, skb,
 				     rdst ? rdst->remote_ifindex : 0, tos,
-				     dst->sin.sin_addr.s_addr, &saddr);
+				     dst->sin.sin_addr.s_addr, &saddr,
+				     rdst ? &rdst->dst_cache : NULL, info);
 		if (IS_ERR(rt)) {
 			netdev_dbg(dev, "no route to %pI4\n",
 				   &dst->sin.sin_addr.s_addr);
@@ -1990,7 +2028,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0,
-					&dst->sin6.sin6_addr, &saddr);
+					&dst->sin6.sin6_addr, &saddr,
+					rdst ? &rdst->dst_cache : NULL);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
@@ -2331,7 +2370,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 			return -EINVAL;
 		rt = vxlan_get_route(vxlan, skb, 0, info->key.tos,
 				     info->key.u.ipv4.dst,
-				     &info->key.u.ipv4.src);
+				     &info->key.u.ipv4.src, NULL, info);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 		ip_rt_put(rt);
@@ -2343,7 +2382,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0,
 					&info->key.u.ipv6.dst,
-					&info->key.u.ipv6.src);
+					&info->key.u.ipv6.src, NULL);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
 		dst_release(ndst);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 25bd919c9ef0..b314e4af89c5 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -148,6 +148,7 @@ struct vxlan_rdst {
 	u32			 remote_ifindex;
 	struct list_head	 list;
 	struct rcu_head		 rcu;
+	struct dst_cache	 dst_cache;
 };
 
 struct vxlan_config {
-- 
cgit v1.2.3


From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:57 +0100
Subject: net: add dst_cache to ovs vxlan lwtunnel

In case of UDP traffic with datagram length
below MTU this gives about 2% performance increase
when tunneling over ipv4 and about 60% when tunneling
over ipv6.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            | 15 ++++++++-------
 include/net/dst_metadata.h     |  1 +
 include/net/ip_tunnels.h       |  3 +++
 net/core/dst.c                 | 10 +++++++++-
 net/openvswitch/Kconfig        |  1 +
 net/openvswitch/flow_netlink.c |  6 ++++++
 6 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ad673037bd73..ee1206d9f8df 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1775,7 +1775,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 	/* when the ip_tunnel_info is availble, the tos used for lookup is
 	 * packet independent, so we can use the cache
 	 */
-	if (dst_cache && !skb->mark && (!tos || info)) {
+	if (!skb->mark && (!tos || info)) {
 		use_cache = true;
 		rt = dst_cache_get_ip4(dst_cache, saddr);
 		if (rt)
@@ -1806,13 +1806,11 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct in6_addr *saddr,
 					  struct dst_cache *dst_cache)
 {
-	bool use_cache = false;
 	struct dst_entry *ndst;
 	struct flowi6 fl6;
 	int err;
 
-	if (dst_cache && !skb->mark) {
-		use_cache = true;
+	if (!skb->mark) {
 		ndst = dst_cache_get_ip6(dst_cache, saddr);
 		if (ndst)
 			return ndst;
@@ -1832,7 +1830,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 		return ERR_PTR(err);
 
 	*saddr = fl6.saddr;
-	if (use_cache)
+	if (!skb->mark)
 		dst_cache_set_ip6(dst_cache, ndst, saddr);
 	return ndst;
 }
@@ -1886,6 +1884,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
+	struct dst_cache *dst_cache;
 	struct ip_tunnel_info *info;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk;
@@ -1910,6 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
 		vni = rdst->remote_vni;
 		dst = &rdst->remote_ip;
+		dst_cache = &rdst->dst_cache;
 	} else {
 		if (!info) {
 			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
@@ -1924,6 +1924,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		else
 			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
 		dst = &remote_ip;
+		dst_cache = &info->dst_cache;
 	}
 
 	if (vxlan_addr_any(dst)) {
@@ -1976,7 +1977,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		rt = vxlan_get_route(vxlan, skb,
 				     rdst ? rdst->remote_ifindex : 0, tos,
 				     dst->sin.sin_addr.s_addr, &saddr,
-				     rdst ? &rdst->dst_cache : NULL, info);
+				     dst_cache, info);
 		if (IS_ERR(rt)) {
 			netdev_dbg(dev, "no route to %pI4\n",
 				   &dst->sin.sin_addr.s_addr);
@@ -2029,7 +2030,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0,
 					&dst->sin6.sin6_addr, &saddr,
-					rdst ? &rdst->dst_cache : NULL);
+					dst_cache);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 30a56ab2ccfb..84b833af6882 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 		      sizeof(a->u.tun_info) + a->u.tun_info.options_len);
 }
 
+void metadata_dst_free(struct metadata_dst *);
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fd36936d85a6..87408ab80856 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -58,6 +58,9 @@ struct ip_tunnel_key {
 
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache	dst_cache;
+#endif
 	u8			options_len;
 	u8			mode;
 };
diff --git a/net/core/dst.c b/net/core/dst.c
index a1656e3b8d72..b5cbbe07f786 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,7 +265,7 @@ again:
 	lwtstate_put(dst->lwtstate);
 
 	if (dst->flags & DST_METADATA)
-		kfree(dst);
+		metadata_dst_free((struct metadata_dst *)dst);
 	else
 		kmem_cache_free(dst->ops->kmem_cachep, dst);
 
@@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
 }
 EXPORT_SYMBOL_GPL(metadata_dst_alloc);
 
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+	dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+	kfree(md_dst);
+}
+
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
 {
 	int cpu;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d143aa9f6654..cd5fd9d728a7 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -10,6 +10,7 @@ config OPENVSWITCH
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
+	select DST_CACHE
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d1bd4a45ca2d..58b8efc23668 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (!tun_dst)
 		return -ENOMEM;
 
+	err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
+	if (err) {
+		dst_release((struct dst_entry *)tun_dst);
+		return err;
+	}
+
 	a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
 			 sizeof(*ovs_tun), log);
 	if (IS_ERR(a)) {
-- 
cgit v1.2.3


From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 22:02:53 -0800
Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info

tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow,
in usec unit. Might be ~0U if not yet known.

tcpi_notsent_bytes reports the amount of bytes in the write queue that
were not yet sent.

This is done in a single patch to not add a temporary 32bit padding hole
in tcp_info.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp.c           | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 65a77b071e22..fe95446e9abf 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -196,6 +196,9 @@ struct tcp_info {
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	__u32	tcpi_notsent_bytes;
+	__u32	tcpi_min_rtt;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 014f18e2f7b3..f93150d15199 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2642,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 	unsigned int start;
+	int notsent_bytes;
 	u64 rate64;
 	u32 rate;
 
@@ -2722,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
+
+	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+	info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+	info->tcpi_min_rtt = tcp_min_rtt(tp);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-- 
cgit v1.2.3


From fa50d974d104113630d68b7d03233a6686230d0c Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:27 +0200
Subject: ipv4: Namespaceify ip_default_ttl sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h                 |  1 +
 include/net/route.h                      |  5 ++---
 net/bridge/netfilter/nft_reject_bridge.c |  8 +++++---
 net/ipv4/ip_output.c                     |  3 ---
 net/ipv4/ip_sockglue.c                   |  5 ++++-
 net/ipv4/netfilter/ipt_SYNPROXY.c        |  3 ++-
 net/ipv4/proc.c                          |  2 +-
 net/ipv4/sysctl_net_ipv4.c               | 20 +++++++++++---------
 8 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 848fe8056534..bc8f7f94abcb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -80,6 +80,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_ecn;
 	int sysctl_tcp_ecn_fallback;
 
+	int sysctl_ip_default_ttl;
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 	int sysctl_ip_nonlocal_bind;
diff --git a/include/net/route.h b/include/net/route.h
index a3b9ef74a389..9b0a523bb428 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -329,14 +329,13 @@ static inline int inet_iif(const struct sk_buff *skb)
 	return skb->skb_iif;
 }
 
-extern int sysctl_ip_default_ttl;
-
 static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 {
 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+	struct net *net = dev_net(dst->dev);
 
 	if (hoplimit == 0)
-		hoplimit = sysctl_ip_default_ttl;
+		hoplimit = net->ipv4.sysctl_ip_default_ttl;
 	return hoplimit;
 }
 
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index fdba3d9fbff3..adc8d7221dbb 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -48,6 +48,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
 	struct iphdr *niph;
 	const struct tcphdr *oth;
 	struct tcphdr _oth;
+	struct net *net = sock_net(oldskb->sk);
 
 	if (!nft_bridge_iphdr_validate(oldskb))
 		return;
@@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
-				   sysctl_ip_default_ttl);
+				   net->ipv4.sysctl_ip_default_ttl);
 	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
-	niph->ttl	= sysctl_ip_default_ttl;
+	niph->ttl	= net->ipv4.sysctl_ip_default_ttl;
 	niph->tot_len	= htons(nskb->len);
 	ip_send_check(niph);
 
@@ -85,6 +86,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
 	void *payload;
 	__wsum csum;
 	u8 proto;
+	struct net *net = sock_net(oldskb->sk);
 
 	if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb))
 		return;
@@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
-				   sysctl_ip_default_ttl);
+				   net->ipv4.sysctl_ip_default_ttl);
 
 	skb_reset_transport_header(nskb);
 	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 64878efa045c..f734c42acdaf 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -79,9 +79,6 @@
 #include <linux/netlink.h>
 #include <linux/tcp.h>
 
-int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-
 static int
 ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	    unsigned int mtu,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 92808f147ef5..3f1befc4e17b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1341,10 +1341,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		val = inet->tos;
 		break;
 	case IP_TTL:
+	{
+		struct net *net = sock_net(sk);
 		val = (inet->uc_ttl == -1 ?
-		       sysctl_ip_default_ttl :
+		       net->ipv4.sysctl_ip_default_ttl :
 		       inet->uc_ttl);
 		break;
+	}
 	case IP_HDRINCL:
 		val = inet->hdrincl;
 		break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 5fdc556514ba..7b8fbb352877 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -21,6 +21,7 @@ static struct iphdr *
 synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 {
 	struct iphdr *iph;
+	struct net *net = sock_net(skb->sk);
 
 	skb_reset_network_header(skb);
 	iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
@@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	iph->tos	= 0;
 	iph->id		= 0;
 	iph->frag_off	= htons(IP_DF);
-	iph->ttl	= sysctl_ip_default_ttl;
+	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;
 	iph->protocol	= IPPROTO_TCP;
 	iph->check	= 0;
 	iph->saddr	= saddr;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3abd9d7a3adf..9f665b63a927 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq, "\nIp: %d %d",
 		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
-		   sysctl_ip_default_ttl);
+		   net->ipv4.sysctl_ip_default_ttl);
 
 	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b537338f5c97..a833a9f9e4cd 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -282,15 +282,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_default_ttl",
-		.data		= &sysctl_ip_default_ttl,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &ip_ttl_min,
-		.extra2		= &ip_ttl_max,
-	},
 	{
 		.procname	= "tcp_max_orphans",
 		.data		= &sysctl_tcp_max_orphans,
@@ -752,6 +743,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_default_ttl",
+		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &ip_ttl_min,
+		.extra2		= &ip_ttl_max,
+	},
 	{
 		.procname	= "ip_local_port_range",
 		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
@@ -988,6 +988,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	if (!net->ipv4.sysctl_local_reserved_ports)
 		goto err_ports;
 
+	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+
 	return 0;
 
 err_ports:
-- 
cgit v1.2.3


From 287b7f38fd6842e534db1783cead3843f7677b79 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:29 +0200
Subject: ipv4: Namespacify ip_dynaddr sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           |  3 ---
 include/net/netns/ipv4.h   |  2 ++
 net/ipv4/af_inet.c         | 10 ++--------
 net/ipv4/sysctl_net_ipv4.c | 15 ++++++++-------
 4 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 1a98f1ca1638..e3fb25d76421 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -248,9 +248,6 @@ extern int inet_peer_maxttl;
 /* From ip_input.c */
 extern int sysctl_ip_early_demux;
 
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
 void ipfrag_init(void);
 
 void ip_static_sysctl_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index bc8f7f94abcb..b7e3fb2587da 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -84,6 +84,8 @@ struct netns_ipv4 {
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 	int sysctl_ip_nonlocal_bind;
+	/* Shall we try to damage output packets if routing dev changes? */
+	int sysctl_ip_dynaddr;
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eade66db214e..209d1ed28954 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1095,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p)
 }
 EXPORT_SYMBOL(inet_unregister_protosw);
 
-/*
- *      Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr __read_mostly;
-
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -1131,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	if (new_saddr == old_saddr)
 		return 0;
 
-	if (sysctl_ip_dynaddr > 1) {
+	if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
 		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
 			__func__, &old_saddr, &new_saddr);
 	}
@@ -1186,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
 		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
 		 */
-		if (!sysctl_ip_dynaddr ||
+		if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
 		    sk->sk_state != TCP_SYN_SENT ||
 		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
 		    (err = inet_sk_reselect_saddr(sk)) != 0)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a833a9f9e4cd..04ac5b763385 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -303,13 +303,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_dynaddr",
-		.data		= &sysctl_ip_dynaddr,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -743,6 +736,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_dynaddr",
+		.data		= &init_net.ipv4.sysctl_ip_dynaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
@@ -989,6 +989,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 		goto err_ports;
 
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+	net->ipv4.sysctl_ip_dynaddr = 0;
 
 	return 0;
 
-- 
cgit v1.2.3


From e21145a9871aa5ae07e01926105bb8e523d64095 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:30 +0200
Subject: ipv4: namespacify ip_early_demux sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           |  3 ---
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/ip_input.c        |  5 +----
 net/ipv4/sysctl_net_ipv4.c | 15 ++++++++-------
 net/ipv6/ip6_input.c       |  2 +-
 5 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index e3fb25d76421..cbb134b2f0e4 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -245,9 +245,6 @@ extern int inet_peer_threshold;
 extern int inet_peer_minttl;
 extern int inet_peer_maxttl;
 
-/* From ip_input.c */
-extern int sysctl_ip_early_demux;
-
 void ipfrag_init(void);
 
 void ip_static_sysctl_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b7e3fb2587da..a69cde3ce460 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -86,6 +86,7 @@ struct netns_ipv4 {
 	int sysctl_ip_nonlocal_bind;
 	/* Shall we try to damage output packets if routing dev changes? */
 	int sysctl_ip_dynaddr;
+	int sysctl_ip_early_demux;
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 852002f64c68..e3d782746d9d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -308,15 +308,12 @@ drop:
 	return true;
 }
 
-int sysctl_ip_early_demux __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_ip_early_demux);
-
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
-	if (sysctl_ip_early_demux &&
+	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
 	    !ip_is_fragment(iph)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 04ac5b763385..1e1fe6086dd9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -296,13 +296,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_early_demux",
-		.data		= &sysctl_ip_early_demux,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -743,6 +736,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_early_demux",
+		.data		= &init_net.ipv4.sysctl_ip_early_demux,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
@@ -990,6 +990,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
 	net->ipv4.sysctl_ip_dynaddr = 0;
+	net->ipv4.sysctl_ip_early_demux = 1;
 
 	return 0;
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 31ac3c56da4b..c05c425c2389 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,7 +49,7 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
 		const struct inet6_protocol *ipprot;
 
 		ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-- 
cgit v1.2.3


From 0fbf4cb27e061204c8cee8e7eb2870416bdf30fd Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:31 +0200
Subject: ipv4: namespacify ip fragment max dist sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h |  1 +
 net/ipv4/ip_fragment.c  | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 12aac0fd6ee7..909972aa3acd 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -13,6 +13,7 @@ struct netns_frags {
 	int			timeout;
 	int			high_thresh;
 	int			low_thresh;
+	int			max_dist;
 };
 
 /**
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 187c6fcc3027..957161413335 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -54,8 +54,6 @@
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
  * as well. Or notify me, at least. --ANK
  */
-
-static int sysctl_ipfrag_max_dist __read_mostly = 64;
 static const char ip_frag_cache_name[] = "ip4-frags";
 
 struct ipfrag_skb_cb
@@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 	qp->daddr = arg->iph->daddr;
 	qp->vif = arg->vif;
 	qp->user = arg->user;
-	qp->peer = sysctl_ipfrag_max_dist ?
+	qp->peer = q->net->max_dist ?
 		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
 		NULL;
 }
@@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 static int ip_frag_too_far(struct ipq *qp)
 {
 	struct inet_peer *peer = qp->peer;
-	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int max = qp->q.net->max_dist;
 	unsigned int start, end;
 
 	int rc;
@@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "ipfrag_max_dist",
+		.data		= &init_net.ipv4.frags.max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero
+	},
 	{ }
 };
 
@@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "ipfrag_max_dist",
-		.data		= &sysctl_ipfrag_max_dist,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
-	},
 	{ }
 };
 
@@ -790,6 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 		table[1].data = &net->ipv4.frags.low_thresh;
 		table[1].extra2 = &net->ipv4.frags.high_thresh;
 		table[2].data = &net->ipv4.frags.timeout;
+		table[3].data = &net->ipv4.frags.max_dist;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -865,6 +864,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	 */
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
+	net->ipv4.frags.max_dist = 64;
+
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res)
 		return res;
-- 
cgit v1.2.3


From e4c6734eaab90695db0ea8456307790cb0c1ccb5 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:16:15 -0800
Subject: net: rework ndo tc op to consume additional qdisc handle parameter

The ndo_setup_tc() op was added to support drivers offloading tx
qdiscs; however, only support for mqprio was ever added. So we
only ever added support for passing the number of traffic classes
to the driver.

This patch generalizes the ndo_setup_tc op so that a handle can
be provided to indicate if the offload is for ingress or egress
or potentially even child qdiscs.

CC: Murali Karicheri <m-karicheri2@ti.com>
CC: Shradha Shah <sshah@solarflare.com>
CC: Or Gerlitz <ogerlitz@mellanox.com>
CC: Ariel Elior <ariel.elior@qlogic.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
CC: Bruce Allan <bruce.w.allan@intel.com>
CC: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Don Skidmore <donald.c.skidmore@intel.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c         |  5 ++++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  7 +++++++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h  |  1 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c        |  5 ++++-
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c  | 10 +++++++++-
 drivers/net/ethernet/intel/i40e/i40e.h           |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c      |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c      | 17 ++++++++++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    | 11 ++++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c   | 12 ++++++++++--
 drivers/net/ethernet/sfc/efx.h                   |  2 +-
 drivers/net/ethernet/sfc/tx.c                    |  5 ++++-
 drivers/net/ethernet/ti/netcp_core.c             |  5 ++++-
 include/linux/netdevice.h                        |  3 ++-
 net/sched/sch_mqprio.c                           |  5 +++--
 16 files changed, 74 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 8a9b493566c9..9955cae3cabc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1626,12 +1626,15 @@ static void xgbe_poll_controller(struct net_device *netdev)
 }
 #endif /* End CONFIG_NET_POLL_CONTROLLER */
 
-static int xgbe_setup_tc(struct net_device *netdev, u8 tc)
+static int xgbe_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int offset, queue;
 	u8 i;
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (tc && (tc != pdata->hw_feat.tc_cnt))
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 9e42bcaf9917..b262cba34dfa 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4272,6 +4272,13 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
 	return 0;
 }
 
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+	return bnx2x_setup_tc(dev, num_tc);
+}
+
 /* called with rtnl_lock */
 int bnx2x_change_mac_addr(struct net_device *dev, void *p)
 {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 37369865ca6d..60a4109dcdeb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -486,6 +486,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
 /* setup_tc callback */
 int bnx2x_setup_tc(struct net_device *dev, u8 num_tc);
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc);
 
 int bnx2x_get_vf_config(struct net_device *dev, int vf,
 			struct ifla_vf_info *ivi);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c5845252c920..81fc51c4ec2b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13061,7 +13061,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= poll_bnx2x,
 #endif
-	.ndo_setup_tc		= bnx2x_setup_tc,
+	.ndo_setup_tc		= __bnx2x_setup_tc,
 #ifdef CONFIG_BNX2X_SRIOV
 	.ndo_set_vf_mac		= bnx2x_set_vf_mac,
 	.ndo_set_vf_vlan	= bnx2x_set_vf_vlan,
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5dc89e527e7d..ff08faf44ee5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5370,10 +5370,13 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static int bnxt_setup_tc(struct net_device *dev, u8 tc)
+static int bnxt_setup_tc(struct net_device *dev, u32 handle, u8 tc)
 {
 	struct bnxt *bp = netdev_priv(dev);
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (tc > bp->max_tc) {
 		netdev_err(dev, "too many traffic classes requested: %d Max supported is %d\n",
 			   tc, bp->max_tc);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 662569d5b7c0..12701a492325 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1204,6 +1204,14 @@ err_queueing_scheme:
 	return err;
 }
 
+static int __fm10k_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return fm10k_setup_tc(dev, tc);
+}
+
 static int fm10k_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 {
 	switch (cmd) {
@@ -1386,7 +1394,7 @@ static const struct net_device_ops fm10k_netdev_ops = {
 	.ndo_vlan_rx_kill_vid	= fm10k_vlan_rx_kill_vid,
 	.ndo_set_rx_mode	= fm10k_set_rx_mode,
 	.ndo_get_stats64	= fm10k_get_stats64,
-	.ndo_setup_tc		= fm10k_setup_tc,
+	.ndo_setup_tc		= __fm10k_setup_tc,
 	.ndo_set_vf_mac		= fm10k_ndo_set_vf_mac,
 	.ndo_set_vf_vlan	= fm10k_ndo_set_vf_vlan,
 	.ndo_set_vf_rate	= fm10k_ndo_set_vf_bw,
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 53ed3bdd8363..ef9ca075d5e5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -788,7 +788,7 @@ struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr,
 				      bool is_vf, bool is_netdev);
 #ifdef I40E_FCOE
 int i40e_close(struct net_device *netdev);
-int i40e_setup_tc(struct net_device *netdev, u8 tc);
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc);
 void i40e_netpoll(struct net_device *netdev);
 int i40e_fcoe_enable(struct net_device *netdev);
 int i40e_fcoe_disable(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
index 579a46ca82df..7c66ce416ec7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
@@ -1457,7 +1457,7 @@ static const struct net_device_ops i40e_fcoe_netdev_ops = {
 	.ndo_tx_timeout		= i40e_tx_timeout,
 	.ndo_vlan_rx_add_vid	= i40e_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= i40e_vlan_rx_kill_vid,
-	.ndo_setup_tc		= i40e_setup_tc,
+	.ndo_setup_tc		= __i40e_setup_tc,
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= i40e_netpoll,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 320b0491abd9..abcb6c152186 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5253,11 +5253,7 @@ void i40e_down(struct i40e_vsi *vsi)
  * @netdev: net device to configure
  * @tc: number of traffic classes to enable
  **/
-#ifdef I40E_FCOE
-int i40e_setup_tc(struct net_device *netdev, u8 tc)
-#else
 static int i40e_setup_tc(struct net_device *netdev, u8 tc)
-#endif
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -5310,6 +5306,17 @@ exit:
 	return ret;
 }
 
+#ifdef I40E_FCOE
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+#else
+static int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+#endif
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+	return i40e_setup_tc(netdev, tc);
+}
+
 /**
  * i40e_open - Called when a network interface is made active
  * @netdev: network interface device structure
@@ -8951,7 +8958,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= i40e_netpoll,
 #endif
-	.ndo_setup_tc		= i40e_setup_tc,
+	.ndo_setup_tc		= __i40e_setup_tc,
 #ifdef I40E_FCOE
 	.ndo_fcoe_enable	= i40e_fcoe_enable,
 	.ndo_fcoe_disable	= i40e_fcoe_disable,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0c701b8438b6..1ba714efd78c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8200,6 +8200,15 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	return 0;
 }
 
+int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+{
+	/* Only support egress tc setup for now */
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return ixgbe_setup_tc(dev, tc);
+}
+
 #ifdef CONFIG_PCI_IOV
 void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter)
 {
@@ -8658,7 +8667,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
 	.ndo_get_stats64	= ixgbe_get_stats64,
 #ifdef CONFIG_IXGBE_DCB
-	.ndo_setup_tc		= ixgbe_setup_tc,
+	.ndo_setup_tc		= __ixgbe_setup_tc,
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 0c7e3f69a73b..d5c6c16b9457 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -69,6 +69,14 @@ int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 	return 0;
 }
 
+static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, u8 up)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return mlx4_en_setup_tc(dev, up);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 struct mlx4_en_filter {
@@ -2466,7 +2474,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
 	.ndo_fix_features	= mlx4_en_fix_features,
-	.ndo_setup_tc		= mlx4_en_setup_tc,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
@@ -2504,7 +2512,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
 	.ndo_fix_features	= mlx4_en_fix_features,
-	.ndo_setup_tc		= mlx4_en_setup_tc,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 10827476bc0b..7815fa09b15d 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -32,7 +32,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 				struct net_device *net_dev);
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
-int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
+int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc);
 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 extern unsigned int efx_piobuf_size;
 extern bool efx_separate_tx_channels;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index f7a0ec1bca97..8f1d53e2aca7 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -562,7 +562,7 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 				     efx->n_tx_channels : 0));
 }
 
-int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
+int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_channel *channel;
@@ -570,6 +570,9 @@ int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
 	unsigned tc;
 	int rc;
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index c61d66d38634..40cde814608b 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1835,13 +1835,16 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
-static int netcp_setup_tc(struct net_device *dev, u8 num_tc)
+static int netcp_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
 {
 	int i;
 
 	/* setup tc must be called under rtnl lock */
 	ASSERT_RTNL();
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	/* Sanity-check the number of traffic classes requested */
 	if ((dev->real_num_tx_queues <= 1) ||
 	    (dev->real_num_tx_queues < num_tc))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0499569c256d..48928b6f9cb6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -51,6 +51,7 @@
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
 #include <uapi/linux/if_bonding.h>
+#include <uapi/linux/pkt_cls.h>
 
 struct netpoll_info;
 struct device;
@@ -1150,7 +1151,7 @@ struct net_device_ops {
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
-	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc);
+	int			(*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc);
 #if IS_ENABLED(CONFIG_FCOE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index ad70ecf57ce7..f5a0e8a4dbd7 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -39,7 +39,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 	}
 
 	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, 0);
+		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0);
 	else
 		netdev_set_num_tc(dev, 0);
 }
@@ -141,7 +141,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 */
 	if (qopt->hw) {
 		priv->hw_owned = 1;
-		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle,
+						    qopt->num_tc);
 		if (err)
 			goto err;
 	} else {
-- 
cgit v1.2.3


From 16e5cc647173a97e33b3e3ba81f73eb455561794 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:16:43 -0800
Subject: net: rework setup_tc ndo op to consume general tc operand

This patch updates setup_tc so we can pass additional parameters into
the ndo op in a generic way. To do this we provide a structured union
and a type flag.

This lets each classifier and qdisc provide its own set of attributes
without having to add new ndo ops or grow the signature of the
callback.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c        |  9 ++++++---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  7 ++++---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  8 ++++++--
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c |  7 ++++---
 drivers/net/ethernet/intel/i40e/i40e.h          |  3 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c     | 10 ++++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  7 ++++---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |  7 ++++---
 drivers/net/ethernet/sfc/efx.h                  |  3 ++-
 drivers/net/ethernet/sfc/tx.c                   |  9 ++++++---
 drivers/net/ethernet/ti/netcp_core.c            | 13 +++++++------
 include/linux/netdevice.h                       | 20 +++++++++++++++++++-
 net/sched/sch_mqprio.c                          |  9 ++++++---
 14 files changed, 78 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 9955cae3cabc..cfd3f7efda1c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1626,15 +1626,18 @@ static void xgbe_poll_controller(struct net_device *netdev)
 }
 #endif /* End CONFIG_NET_POLL_CONTROLLER */
 
-static int xgbe_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+static int xgbe_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+			 struct tc_to_netdev *tc_to_netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int offset, queue;
-	u8 i;
+	u8 i, tc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc_to_netdev->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	tc = tc_to_netdev->tc;
+
 	if (tc && (tc != pdata->hw_feat.tc_cnt))
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index b262cba34dfa..45843d150868 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4272,11 +4272,12 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
 	return 0;
 }
 
-int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return bnx2x_setup_tc(dev, num_tc);
+	return bnx2x_setup_tc(dev, tc->tc);
 }
 
 /* called with rtnl_lock */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 60a4109dcdeb..0e68fadecfdb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -486,7 +486,8 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
 /* setup_tc callback */
 int bnx2x_setup_tc(struct net_device *dev, u8 num_tc);
-int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc);
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc);
 
 int bnx2x_get_vf_config(struct net_device *dev, int vf,
 			struct ifla_vf_info *ivi);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index ff08faf44ee5..169920aa39f3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5370,13 +5370,17 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static int bnxt_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			 struct tc_to_netdev *ntc)
 {
 	struct bnxt *bp = netdev_priv(dev);
+	u8 tc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	tc = ntc->tc;
+
 	if (tc > bp->max_tc) {
 		netdev_err(dev, "too many traffic classes requested: %d Max supported is %d\n",
 			   tc, bp->max_tc);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 12701a492325..dc1a82148ff0 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1204,12 +1204,13 @@ err_queueing_scheme:
 	return err;
 }
 
-static int __fm10k_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+static int __fm10k_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			    struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return fm10k_setup_tc(dev, tc);
+	return fm10k_setup_tc(dev, tc->tc);
 }
 
 static int fm10k_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ef9ca075d5e5..933c4b3d92c8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -788,7 +788,8 @@ struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr,
 				      bool is_vf, bool is_netdev);
 #ifdef I40E_FCOE
 int i40e_close(struct net_device *netdev);
-int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc);
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+		    struct tc_to_netdev *tc);
 void i40e_netpoll(struct net_device *netdev);
 int i40e_fcoe_enable(struct net_device *netdev);
 int i40e_fcoe_disable(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index abcb6c152186..257d16207976 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5307,14 +5307,16 @@ exit:
 }
 
 #ifdef I40E_FCOE
-int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+		    struct tc_to_netdev *tc)
 #else
-static int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+static int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+			   struct tc_to_netdev *tc)
 #endif
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return i40e_setup_tc(netdev, tc);
+	return i40e_setup_tc(netdev, tc->tc);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1ba714efd78c..dca2298f4c36 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8200,13 +8200,14 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	return 0;
 }
 
-int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+int __ixgbe_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc)
 {
 	/* Only support egress tc setup for now */
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return ixgbe_setup_tc(dev, tc);
+	return ixgbe_setup_tc(dev, tc->tc);
 }
 
 #ifdef CONFIG_PCI_IOV
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index d5c6c16b9457..01d6a9695586 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -69,12 +69,13 @@ int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 	return 0;
 }
 
-static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, u8 up)
+static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			      struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return mlx4_en_setup_tc(dev, up);
+	return mlx4_en_setup_tc(dev, tc->tc);
 }
 
 #ifdef CONFIG_RFS_ACCEL
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 7815fa09b15d..5e3f93f04e62 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -32,7 +32,8 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 				struct net_device *net_dev);
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
-int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc);
+int efx_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
+		 struct tc_to_netdev *tc);
 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 extern unsigned int efx_piobuf_size;
 extern bool efx_separate_tx_channels;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 8f1d53e2aca7..2cdb5718ed66 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -562,17 +562,20 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 				     efx->n_tx_channels : 0));
 }
 
-int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc)
+int efx_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
+		 struct tc_to_netdev *ntc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_channel *channel;
 	struct efx_tx_queue *tx_queue;
-	unsigned tc;
+	unsigned tc, num_tc;
 	int rc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	num_tc = ntc->tc;
+
 	if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 40cde814608b..8586a2034019 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1835,25 +1835,26 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
-static int netcp_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+static int netcp_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			  struct tc_to_netdev *tc)
 {
 	int i;
 
 	/* setup tc must be called under rtnl lock */
 	ASSERT_RTNL();
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
 	/* Sanity-check the number of traffic classes requested */
 	if ((dev->real_num_tx_queues <= 1) ||
-	    (dev->real_num_tx_queues < num_tc))
+	    (dev->real_num_tx_queues < tc->tc))
 		return -EINVAL;
 
 	/* Configure traffic class to queue mappings */
-	if (num_tc) {
-		netdev_set_num_tc(dev, num_tc);
-		for (i = 0; i < num_tc; i++)
+	if (tc->tc) {
+		netdev_set_num_tc(dev, tc->tc);
+		for (i = 0; i < tc->tc; i++)
 			netdev_set_tc_queue(dev, i, 1, i);
 	} else {
 		netdev_reset_tc(dev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48928b6f9cb6..e396060f815f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,6 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
+/* This structure holds attributes of qdisc and classifiers
+ * that are being passed to the netdevice through the setup_tc op.
+ */
+enum {
+	TC_SETUP_MQPRIO,
+};
+
+struct tc_to_netdev {
+	unsigned int type;
+	union {
+		u8 tc;
+	};
+};
+
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1151,7 +1166,10 @@ struct net_device_ops {
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
-	int			(*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc);
+	int			(*ndo_setup_tc)(struct net_device *dev,
+						u32 handle,
+						__be16 protocol,
+						struct tc_to_netdev *tc);
 #if IS_ENABLED(CONFIG_FCOE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index f5a0e8a4dbd7..f9947d1f4952 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO};
 	unsigned int ntx;
 
 	if (priv->qdiscs) {
@@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 	}
 
 	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0);
+		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 	else
 		netdev_set_num_tc(dev, 0);
 }
@@ -140,9 +141,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 * supplied and verified mapping
 	 */
 	if (qopt->hw) {
+		struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
+					  .tc = qopt->num_tc};
+
 		priv->hw_owned = 1;
-		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle,
-						    qopt->num_tc);
+		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 		if (err)
 			goto err;
 	} else {
-- 
cgit v1.2.3


From a1b7c5fd7fe98f51fbbc393ee1fc4c1cdb2f0119 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:17:09 -0800
Subject: net: sched: add cls_u32 offload hooks for netdevs

This patch allows netdev drivers to consume cls_u32 offloads via
the ndo_setup_tc ndo op.

This work aligns with how network drivers have been doing qdisc
offloads for mqprio.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++-
 include/net/pkt_cls.h     | 34 ++++++++++++++++
 net/sched/cls_u32.c       | 99 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e396060f815f..47671ce04ac4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,17 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
-/* This structure holds attributes of qdisc and classifiers
+/* These structures hold the attributes of qdisc and classifiers
  * that are being passed to the netdevice through the setup_tc op.
  */
 enum {
 	TC_SETUP_MQPRIO,
+	TC_SETUP_CLSU32,
 };
 
+struct tc_cls_u32_offload;
+
 struct tc_to_netdev {
 	unsigned int type;
 	union {
 		u8 tc;
+		struct tc_cls_u32_offload *cls_u32;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bc49967e1a68..59789ca6e2c8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -358,4 +358,38 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 }
 #endif /* CONFIG_NET_CLS_IND */
 
+struct tc_cls_u32_knode {
+	struct tcf_exts *exts;
+	u8 fshift;
+	u32 handle;
+	u32 val;
+	u32 mask;
+	u32 link_handle;
+	struct tc_u32_sel *sel;
+};
+
+struct tc_cls_u32_hnode {
+	u32 handle;
+	u32 prio;
+	unsigned int divisor;
+};
+
+enum tc_clsu32_command {
+	TC_CLSU32_NEW_KNODE,
+	TC_CLSU32_REPLACE_KNODE,
+	TC_CLSU32_DELETE_KNODE,
+	TC_CLSU32_NEW_HNODE,
+	TC_CLSU32_REPLACE_HNODE,
+	TC_CLSU32_DELETE_HNODE,
+};
+
+struct tc_cls_u32_offload {
+	/* command selects which union member (knode or hnode) is used */
+	enum tc_clsu32_command command;
+	union {
+		struct tc_cls_u32_knode knode;
+		struct tc_cls_u32_hnode hnode;
+	};
+};
+
 #endif
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4fbb67430ce4..d54bc942ea87 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -43,6 +43,7 @@
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
+#include <linux/netdevice.h>
 
 struct tc_u_knode {
 	struct tc_u_knode __rcu	*next;
@@ -424,6 +425,93 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 	return 0;
 }
 
+static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
+		offload.cls_u32->knode.handle = handle;
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+		offload.cls_u32->hnode.divisor = h->divisor;
+		offload.cls_u32->hnode.handle = h->handle;
+		offload.cls_u32->hnode.prio = h->prio;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
+		offload.cls_u32->hnode.divisor = h->divisor;
+		offload.cls_u32->hnode.handle = h->handle;
+		offload.cls_u32->hnode.prio = h->prio;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+		offload.cls_u32->knode.handle = n->handle;
+		offload.cls_u32->knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+		offload.cls_u32->knode.val = n->val;
+		offload.cls_u32->knode.mask = n->mask;
+#else
+		offload.cls_u32->knode.val = 0;
+		offload.cls_u32->knode.mask = 0;
+#endif
+		offload.cls_u32->knode.sel = &n->sel;
+		offload.cls_u32->knode.exts = &n->exts;
+		if (n->ht_down)
+			offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 {
 	struct tc_u_knode *n;
@@ -434,6 +522,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
+			u32_remove_hw_knode(tp, n->handle);
 			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
 		}
 	}
@@ -454,6 +543,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 	     phn;
 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
 		if (phn == ht) {
+			u32_clear_hw_hnode(tp, ht);
 			RCU_INIT_POINTER(*hn, ht->next);
 			kfree_rcu(ht, rcu);
 			return 0;
@@ -540,8 +630,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
 	if (ht == NULL)
 		return 0;
 
-	if (TC_U32_KEY(ht->handle))
+	if (TC_U32_KEY(ht->handle)) {
+		u32_remove_hw_knode(tp, ht->handle);
 		return u32_delete_key(tp, (struct tc_u_knode *)ht);
+	}
 
 	if (root_ht == ht)
 		return -EINVAL;
@@ -769,6 +861,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
+		u32_replace_hw_knode(tp, new);
 		return 0;
 	}
 
@@ -795,6 +888,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		RCU_INIT_POINTER(ht->next, tp_c->hlist);
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
+
+		u32_replace_hw_hnode(tp, ht);
 		return 0;
 	}
 
@@ -877,7 +972,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-
+		u32_replace_hw_knode(tp, n);
 		*arg = (unsigned long)n;
 		return 0;
 	}
-- 
cgit v1.2.3


From 1c78c64e9c6f43a490427d55cd2d213b7c6795c1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:17:37 -0800
Subject: net: add tc offload feature flag

It's useful to turn off the qdisc offload feature at a per device
level. This gives us a big hammer to enable/disable offloading.
More fine grained control (i.e. per rule) may be supported later.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 3 +++
 net/core/ethtool.c              | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index d9654f0eecb3..a734bf43d190 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -67,6 +67,8 @@ enum {
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
 
+	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
+
 	/*
 	 * Add your fresh new feature above and remember to update
 	 * netdev_features_strings[] in net/core/ethtool.c and maybe
@@ -124,6 +126,7 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
+#define NETIF_F_HW_TC		__NETIF_F(HW_TC)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 65f907aea777..c2d3118b1395 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
+	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
 };
 
 static const char
-- 
cgit v1.2.3


From 3b01cf56daf96acf9b155d6201d94bc8b4de218e Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:18:03 -0800
Subject: net: tc: helper functions to query action types

This is a helper function drivers can use to learn if the
action type is a drop action.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_gact.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 592a6bc02b0b..04a31830711b 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -2,6 +2,7 @@
 #define __NET_TC_GACT_H
 
 #include <net/act_api.h>
+#include <linux/tc_act/tc_gact.h>
 
 struct tcf_gact {
 	struct tcf_common	common;
@@ -15,4 +16,19 @@ struct tcf_gact {
 #define to_gact(a) \
 	container_of(a->priv, struct tcf_gact, common)
 
+#ifdef CONFIG_NET_CLS_ACT
+static inline bool is_tcf_gact_shot(const struct tc_action *a)
+{
+	struct tcf_gact *gact;
+
+	if (!a->ops || a->ops->type != TCA_ACT_GACT)
+		return false;
+
+	gact = a->priv;
+	if (gact->tcf_action == TC_ACT_SHOT)
+		return true;
+
+	return false;
+}
+#endif
 #endif /* __NET_TC_GACT_H */
-- 
cgit v1.2.3


From 1cd4d5c4326a7ed3bb0e346bd7d20f5057a80ae6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Feb 2016 14:28:05 +0800
Subject: sctp: remove the unused sctp_datamsg_free()

Since commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg
after sending a msg") used sctp_datamsg_put in sctp_sendmsg, instead of
sctp_datamsg_free, this function has no use in sctp.

So we will remove it.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 -
 net/sctp/chunk.c           | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 205630bb5010..d05b56641abc 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -535,7 +535,6 @@ struct sctp_datamsg {
 struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
 					    struct sctp_sndrcvinfo *,
 					    struct iov_iter *);
-void sctp_datamsg_free(struct sctp_datamsg *);
 void sctp_datamsg_put(struct sctp_datamsg *);
 void sctp_chunk_fail(struct sctp_chunk *, int error);
 int sctp_chunk_abandoned(struct sctp_chunk *);
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index a3380917f197..3aa43073e0b9 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
 	return msg;
 }
 
-void sctp_datamsg_free(struct sctp_datamsg *msg)
-{
-	struct sctp_chunk *chunk;
-
-	/* This doesn't have to be a _safe vairant because
-	 * sctp_chunk_free() only drops the refs.
-	 */
-	list_for_each_entry(chunk, &msg->chunks, frag_list)
-		sctp_chunk_free(chunk);
-
-	sctp_datamsg_put(msg);
-}
-
 /* Final destructruction of datamsg memory. */
 static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 {
-- 
cgit v1.2.3


From fc48b7a6148af974b49db145812a8b060324a503 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Mon, 15 Feb 2016 13:22:35 -0500
Subject: qed/qede: use 8.7.3.0 FW.

This patch moves the qed* driver into utilizing the 8.7.3.0 FW.
This new FW is required for a lot of new SW features, including:
  - Vlan filtering offload
  - Encapsulation offload support
  - HW ingress aggregations
As well as paving the way for the possibility of adding storage protocols
in the future.

V2:
 - Fix kbuild test robot error/warnings.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@qlogic.com>
Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h              |   43 +-
 drivers/net/ethernet/qlogic/qed/qed_cxt.c          |    3 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c          |   88 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h          | 2690 +++++++++-----------
 .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c    |   22 +-
 drivers/net/ethernet/qlogic/qed/qed_init_ops.c     |  155 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c           |   13 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c         |    2 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c          |   37 +-
 drivers/net/ethernet/qlogic/qed/qed_sp.h           |    2 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c  |   17 +-
 drivers/net/ethernet/qlogic/qede/qede.h            |    8 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c    |    6 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  264 +-
 include/linux/qed/common_hsi.h                     |   36 +-
 include/linux/qed/eth_common.h                     |  171 +-
 include/linux/qed/qed_if.h                         |    8 +-
 17 files changed, 1795 insertions(+), 1770 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 1292c360390c..d34da638b5d5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -70,8 +70,8 @@ struct qed_sb_sp_info;
 struct qed_mcp_info;
 
 struct qed_rt_data {
-	u32 init_val;
-	bool b_valid;
+	u32	*init_val;
+	bool	*b_valid;
 };
 
 /* The PCI personality is not quite synonymous to protocol ID:
@@ -120,6 +120,10 @@ enum QED_PORT_MODE {
 	QED_PORT_MODE_DE_1X25G
 };
 
+enum qed_dev_cap {
+	QED_DEV_CAP_ETH,
+};
+
 struct qed_hw_info {
 	/* PCI personality */
 	enum qed_pci_personality	personality;
@@ -151,6 +155,7 @@ struct qed_hw_info {
 
 	u32				port_mode;
 	u32				hw_mode;
+	unsigned long		device_capabilities;
 };
 
 struct qed_hw_cid_data {
@@ -267,7 +272,7 @@ struct qed_hwfn {
 	struct qed_hw_info		hw_info;
 
 	/* rt_array (for init-tool) */
-	struct qed_rt_data		*rt_data;
+	struct qed_rt_data		rt_data;
 
 	/* SPQ */
 	struct qed_spq			*p_spq;
@@ -350,9 +355,20 @@ struct qed_dev {
 	char	name[NAME_SIZE];
 
 	u8	type;
-#define QED_DEV_TYPE_BB_A0      (0 << 0)
-#define QED_DEV_TYPE_MASK       (0x3)
-#define QED_DEV_TYPE_SHIFT      (0)
+#define QED_DEV_TYPE_BB (0 << 0)
+#define QED_DEV_TYPE_AH BIT(0)
+/* Translate type/revision combo into the proper conditions */
+#define QED_IS_BB(dev)  ((dev)->type == QED_DEV_TYPE_BB)
+#define QED_IS_BB_A0(dev)       (QED_IS_BB(dev) && \
+				 CHIP_REV_IS_A0(dev))
+#define QED_IS_BB_B0(dev)       (QED_IS_BB(dev) && \
+				 CHIP_REV_IS_B0(dev))
+
+#define QED_GET_TYPE(dev)       (QED_IS_BB_A0(dev) ? CHIP_BB_A0 : \
+				 QED_IS_BB_B0(dev) ? CHIP_BB_B0 : CHIP_K2)
+
+	u16	vendor_id;
+	u16	device_id;
 
 	u16	chip_num;
 #define CHIP_NUM_MASK                   0xffff
@@ -361,6 +377,8 @@ struct qed_dev {
 	u16	chip_rev;
 #define CHIP_REV_MASK                   0xf
 #define CHIP_REV_SHIFT                  12
+#define CHIP_REV_IS_A0(_cdev)   (!(_cdev)->chip_rev)
+#define CHIP_REV_IS_B0(_cdev)   ((_cdev)->chip_rev == 1)
 
 	u16				chip_metal;
 #define CHIP_METAL_MASK                 0xff
@@ -375,10 +393,10 @@ struct qed_dev {
 	u8				num_funcs_in_port;
 
 	u8				path_id;
-	enum mf_mode			mf_mode;
-#define IS_MF(_p_hwfn)          (((_p_hwfn)->cdev)->mf_mode != SF)
-#define IS_MF_SI(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == MF_NPAR)
-#define IS_MF_SD(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == MF_OVLAN)
+	enum qed_mf_mode		mf_mode;
+#define IS_MF_DEFAULT(_p_hwfn)  (((_p_hwfn)->cdev)->mf_mode == QED_MF_DEFAULT)
+#define IS_MF_SI(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == QED_MF_NPAR)
+#define IS_MF_SD(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == QED_MF_OVLAN)
 
 	int				pcie_width;
 	int				pcie_speed;
@@ -441,11 +459,6 @@ struct qed_dev {
 	const struct firmware		*firmware;
 };
 
-#define QED_GET_TYPE(dev)       (((dev)->type & QED_DEV_TYPE_MASK) >> \
-				 QED_DEV_TYPE_SHIFT)
-#define QED_IS_BB_A0(dev)       (QED_GET_TYPE(dev) == QED_DEV_TYPE_BB_A0)
-#define QED_IS_BB(dev)  (QED_IS_BB_A0(dev))
-
 #define NUM_OF_SBS(dev)         MAX_SB_PER_PATH_BB
 #define NUM_OF_ENG_PFS(dev)     MAX_NUM_PFS_BB
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 7ccdb46c6764..d3f7a0215e7e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -581,7 +581,8 @@ void qed_qm_init_pf(struct qed_hwfn *p_hwfn)
 	params.num_pf_cids = iids.cids;
 	params.start_pq = qm_info->start_pq;
 	params.num_pf_pqs = qm_info->num_pqs;
-	params.start_vport = qm_info->num_vports;
+	params.start_vport = qm_info->start_vport;
+	params.num_vports = qm_info->num_vports;
 	params.pf_wfq = qm_info->pf_wfq;
 	params.pf_rl = qm_info->pf_rl;
 	params.pq_params = qm_info->qm_pq_params;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 817bbd5476ff..bc17ed2c9cac 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -341,11 +341,6 @@ void qed_resc_setup(struct qed_dev *cdev)
 	}
 }
 
-#define FINAL_CLEANUP_CMD_OFFSET        (0)
-#define FINAL_CLEANUP_CMD (0x1)
-#define FINAL_CLEANUP_VALID_OFFSET      (6)
-#define FINAL_CLEANUP_VFPF_ID_SHIFT     (7)
-#define FINAL_CLEANUP_COMP (0x2)
 #define FINAL_CLEANUP_POLL_CNT          (100)
 #define FINAL_CLEANUP_POLL_TIME         (10)
 int qed_final_cleanup(struct qed_hwfn *p_hwfn,
@@ -355,12 +350,14 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn,
 	u32 command = 0, addr, count = FINAL_CLEANUP_POLL_CNT;
 	int rc = -EBUSY;
 
-	addr = GTT_BAR0_MAP_REG_USDM_RAM + USTORM_FLR_FINAL_ACK_OFFSET;
+	addr = GTT_BAR0_MAP_REG_USDM_RAM +
+		USTORM_FLR_FINAL_ACK_OFFSET(p_hwfn->rel_pf_id);
 
-	command |= FINAL_CLEANUP_CMD << FINAL_CLEANUP_CMD_OFFSET;
-	command |= 1 << FINAL_CLEANUP_VALID_OFFSET;
-	command |= id << FINAL_CLEANUP_VFPF_ID_SHIFT;
-	command |= FINAL_CLEANUP_COMP << SDM_OP_GEN_COMP_TYPE_SHIFT;
+	command |= X_FINAL_CLEANUP_AGG_INT <<
+		SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT;
+	command |= 1 << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT;
+	command |= id << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_SHIFT;
+	command |= SDM_COMP_TYPE_AGG_INT << SDM_OP_GEN_COMP_TYPE_SHIFT;
 
 	/* Make sure notification is not set before initiating final cleanup */
 	if (REG_RD(p_hwfn, addr)) {
@@ -415,18 +412,16 @@ static void qed_calc_hw_mode(struct qed_hwfn *p_hwfn)
 	}
 
 	switch (p_hwfn->cdev->mf_mode) {
-	case SF:
-		hw_mode |= 1 << MODE_SF;
+	case QED_MF_DEFAULT:
+	case QED_MF_NPAR:
+		hw_mode |= 1 << MODE_MF_SI;
 		break;
-	case MF_OVLAN:
+	case QED_MF_OVLAN:
 		hw_mode |= 1 << MODE_MF_SD;
 		break;
-	case MF_NPAR:
-		hw_mode |= 1 << MODE_MF_SI;
-		break;
 	default:
-		DP_NOTICE(p_hwfn, "Unsupported MF mode, init as SF\n");
-		hw_mode |= 1 << MODE_SF;
+		DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n");
+		hw_mode |= 1 << MODE_MF_SI;
 	}
 
 	hw_mode |= 1 << MODE_ASIC;
@@ -1018,8 +1013,7 @@ static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	int num_funcs, i;
 
-	num_funcs = IS_MF(p_hwfn) ? MAX_NUM_PFS_BB
-				  : p_hwfn->cdev->num_ports_in_engines;
+	num_funcs = MAX_NUM_PFS_BB;
 
 	resc_num[QED_SB] = min_t(u32,
 				 (MAX_SB_PER_PATH_BB / num_funcs),
@@ -1071,7 +1065,7 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn,
 			       struct qed_ptt *p_ptt)
 {
 	u32 nvm_cfg1_offset, mf_mode, addr, generic_cont0, core_cfg;
-	u32 port_cfg_addr, link_temp, val, nvm_cfg_addr;
+	u32 port_cfg_addr, link_temp, nvm_cfg_addr, device_capabilities;
 	struct qed_mcp_link_params *link;
 
 	/* Read global nvm_cfg address */
@@ -1134,21 +1128,6 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn,
 		break;
 	}
 
-	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
-	       offsetof(struct nvm_cfg1, func[MCP_PF_ID(p_hwfn)]) +
-	       offsetof(struct nvm_cfg1_func, device_id);
-	val = qed_rd(p_hwfn, p_ptt, addr);
-
-	if (IS_MF(p_hwfn)) {
-		p_hwfn->hw_info.device_id =
-			(val & NVM_CFG1_FUNC_MF_VENDOR_DEVICE_ID_MASK) >>
-			NVM_CFG1_FUNC_MF_VENDOR_DEVICE_ID_OFFSET;
-	} else {
-		p_hwfn->hw_info.device_id =
-			(val & NVM_CFG1_FUNC_VENDOR_DEVICE_ID_MASK) >>
-			NVM_CFG1_FUNC_VENDOR_DEVICE_ID_OFFSET;
-	}
-
 	/* Read default link configuration */
 	link = &p_hwfn->mcp_info->link_input;
 	port_cfg_addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
@@ -1220,18 +1199,28 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn,
 
 	switch (mf_mode) {
 	case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
-		p_hwfn->cdev->mf_mode = MF_OVLAN;
+		p_hwfn->cdev->mf_mode = QED_MF_OVLAN;
 		break;
 	case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
-		p_hwfn->cdev->mf_mode = MF_NPAR;
+		p_hwfn->cdev->mf_mode = QED_MF_NPAR;
 		break;
-	case NVM_CFG1_GLOB_MF_MODE_FORCED_SF:
-		p_hwfn->cdev->mf_mode = SF;
+	case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
+		p_hwfn->cdev->mf_mode = QED_MF_DEFAULT;
 		break;
 	}
 	DP_INFO(p_hwfn, "Multi function mode is %08x\n",
 		p_hwfn->cdev->mf_mode);
 
+	/* Read Multi-function information from shmem */
+	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
+		offsetof(struct nvm_cfg1, glob) +
+		offsetof(struct nvm_cfg1_glob, device_capabilities);
+
+	device_capabilities = qed_rd(p_hwfn, p_ptt, addr);
+	if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET)
+		__set_bit(QED_DEV_CAP_ETH,
+			  &p_hwfn->hw_info.device_capabilities);
+
 	return qed_mcp_fill_shmem_func_info(p_hwfn, p_ptt);
 }
 
@@ -1293,29 +1282,36 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn,
 
 static void qed_get_dev_info(struct qed_dev *cdev)
 {
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
 	u32 tmp;
 
-	cdev->chip_num = (u16)qed_rd(cdev->hwfns, cdev->hwfns[0].p_main_ptt,
+	/* Read Vendor Id / Device Id */
+	pci_read_config_word(cdev->pdev, PCI_VENDOR_ID,
+			     &cdev->vendor_id);
+	pci_read_config_word(cdev->pdev, PCI_DEVICE_ID,
+			     &cdev->device_id);
+	cdev->chip_num = (u16)qed_rd(p_hwfn, p_hwfn->p_main_ptt,
 				     MISCS_REG_CHIP_NUM);
-	cdev->chip_rev = (u16)qed_rd(cdev->hwfns, cdev->hwfns[0].p_main_ptt,
+	cdev->chip_rev = (u16)qed_rd(p_hwfn, p_hwfn->p_main_ptt,
 				     MISCS_REG_CHIP_REV);
 	MASK_FIELD(CHIP_REV, cdev->chip_rev);
 
+	cdev->type = QED_DEV_TYPE_BB;
 	/* Learn number of HW-functions */
-	tmp = qed_rd(cdev->hwfns, cdev->hwfns[0].p_main_ptt,
+	tmp = qed_rd(p_hwfn, p_hwfn->p_main_ptt,
 		     MISCS_REG_CMT_ENABLED_FOR_PAIR);
 
-	if (tmp & (1 << cdev->hwfns[0].rel_pf_id)) {
+	if (tmp & (1 << p_hwfn->rel_pf_id)) {
 		DP_NOTICE(cdev->hwfns, "device in CMT mode\n");
 		cdev->num_hwfns = 2;
 	} else {
 		cdev->num_hwfns = 1;
 	}
 
-	cdev->chip_bond_id = qed_rd(cdev->hwfns, cdev->hwfns[0].p_main_ptt,
+	cdev->chip_bond_id = qed_rd(p_hwfn, p_hwfn->p_main_ptt,
 				    MISCS_REG_CHIP_TEST_REG) >> 4;
 	MASK_FIELD(CHIP_BOND_ID, cdev->chip_bond_id);
-	cdev->chip_metal = (u16)qed_rd(cdev->hwfns, cdev->hwfns[0].p_main_ptt,
+	cdev->chip_metal = (u16)qed_rd(p_hwfn, p_hwfn->p_main_ptt,
 				       MISCS_REG_CHIP_METAL);
 	MASK_FIELD(CHIP_METAL, cdev->chip_metal);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 264e954675d1..49bbf696a16d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -34,6 +34,8 @@ enum common_event_opcode {
 	COMMON_EVENT_RESERVED3,
 	COMMON_EVENT_RESERVED4,
 	COMMON_EVENT_RESERVED5,
+	COMMON_EVENT_RESERVED6,
+	COMMON_EVENT_EMPTY,
 	MAX_COMMON_EVENT_OPCODE
 };
 
@@ -45,6 +47,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_RESERVED,
 	COMMON_RAMROD_RESERVED2,
 	COMMON_RAMROD_RESERVED3,
+	COMMON_RAMROD_EMPTY,
 	MAX_COMMON_RAMROD_CMD_ID
 };
 
@@ -331,6 +334,179 @@ struct xstorm_core_conn_ag_ctx {
 	__le16	word15 /* word15 */;
 };
 
+struct tstorm_core_conn_ag_ctx {
+	u8	byte0 /* cdu_validation */;
+	u8	byte1 /* state */;
+	u8	flags0;
+#define TSTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
+#define TSTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
+#define TSTORM_CORE_CONN_AG_CTX_BIT2_MASK     0x1       /* bit2 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT2_SHIFT    2
+#define TSTORM_CORE_CONN_AG_CTX_BIT3_MASK     0x1       /* bit3 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT3_SHIFT    3
+#define TSTORM_CORE_CONN_AG_CTX_BIT4_MASK     0x1       /* bit4 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT4_SHIFT    4
+#define TSTORM_CORE_CONN_AG_CTX_BIT5_MASK     0x1       /* bit5 */
+#define TSTORM_CORE_CONN_AG_CTX_BIT5_SHIFT    5
+#define TSTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* timer0cf */
+#define TSTORM_CORE_CONN_AG_CTX_CF0_SHIFT     6
+	u8 flags1;
+#define TSTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* timer1cf */
+#define TSTORM_CORE_CONN_AG_CTX_CF1_SHIFT     0
+#define TSTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* timer2cf */
+#define TSTORM_CORE_CONN_AG_CTX_CF2_SHIFT     2
+#define TSTORM_CORE_CONN_AG_CTX_CF3_MASK      0x3       /* timer_stop_all */
+#define TSTORM_CORE_CONN_AG_CTX_CF3_SHIFT     4
+#define TSTORM_CORE_CONN_AG_CTX_CF4_MASK      0x3       /* cf4 */
+#define TSTORM_CORE_CONN_AG_CTX_CF4_SHIFT     6
+	u8 flags2;
+#define TSTORM_CORE_CONN_AG_CTX_CF5_MASK      0x3       /* cf5 */
+#define TSTORM_CORE_CONN_AG_CTX_CF5_SHIFT     0
+#define TSTORM_CORE_CONN_AG_CTX_CF6_MASK      0x3       /* cf6 */
+#define TSTORM_CORE_CONN_AG_CTX_CF6_SHIFT     2
+#define TSTORM_CORE_CONN_AG_CTX_CF7_MASK      0x3       /* cf7 */
+#define TSTORM_CORE_CONN_AG_CTX_CF7_SHIFT     4
+#define TSTORM_CORE_CONN_AG_CTX_CF8_MASK      0x3       /* cf8 */
+#define TSTORM_CORE_CONN_AG_CTX_CF8_SHIFT     6
+	u8 flags3;
+#define TSTORM_CORE_CONN_AG_CTX_CF9_MASK      0x3       /* cf9 */
+#define TSTORM_CORE_CONN_AG_CTX_CF9_SHIFT     0
+#define TSTORM_CORE_CONN_AG_CTX_CF10_MASK     0x3       /* cf10 */
+#define TSTORM_CORE_CONN_AG_CTX_CF10_SHIFT    2
+#define TSTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
+#define TSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   4
+#define TSTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
+#define TSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   5
+#define TSTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
+#define TSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   6
+#define TSTORM_CORE_CONN_AG_CTX_CF3EN_MASK    0x1       /* cf3en */
+#define TSTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT   7
+	u8 flags4;
+#define TSTORM_CORE_CONN_AG_CTX_CF4EN_MASK    0x1       /* cf4en */
+#define TSTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT   0
+#define TSTORM_CORE_CONN_AG_CTX_CF5EN_MASK    0x1       /* cf5en */
+#define TSTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT   1
+#define TSTORM_CORE_CONN_AG_CTX_CF6EN_MASK    0x1       /* cf6en */
+#define TSTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT   2
+#define TSTORM_CORE_CONN_AG_CTX_CF7EN_MASK    0x1       /* cf7en */
+#define TSTORM_CORE_CONN_AG_CTX_CF7EN_SHIFT   3
+#define TSTORM_CORE_CONN_AG_CTX_CF8EN_MASK    0x1       /* cf8en */
+#define TSTORM_CORE_CONN_AG_CTX_CF8EN_SHIFT   4
+#define TSTORM_CORE_CONN_AG_CTX_CF9EN_MASK    0x1       /* cf9en */
+#define TSTORM_CORE_CONN_AG_CTX_CF9EN_SHIFT   5
+#define TSTORM_CORE_CONN_AG_CTX_CF10EN_MASK   0x1       /* cf10en */
+#define TSTORM_CORE_CONN_AG_CTX_CF10EN_SHIFT  6
+#define TSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7
+	u8 flags5;
+#define TSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0
+#define TSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1
+#define TSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2
+#define TSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3
+#define TSTORM_CORE_CONN_AG_CTX_RULE5EN_MASK  0x1       /* rule5en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4
+#define TSTORM_CORE_CONN_AG_CTX_RULE6EN_MASK  0x1       /* rule6en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5
+#define TSTORM_CORE_CONN_AG_CTX_RULE7EN_MASK  0x1       /* rule7en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6
+#define TSTORM_CORE_CONN_AG_CTX_RULE8EN_MASK  0x1       /* rule8en */
+#define TSTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7
+	__le32	reg0 /* reg0 */;
+	__le32	reg1 /* reg1 */;
+	__le32	reg2 /* reg2 */;
+	__le32	reg3 /* reg3 */;
+	__le32	reg4 /* reg4 */;
+	__le32	reg5 /* reg5 */;
+	__le32	reg6 /* reg6 */;
+	__le32	reg7 /* reg7 */;
+	__le32	reg8 /* reg8 */;
+	u8	byte2 /* byte2 */;
+	u8	byte3 /* byte3 */;
+	__le16	word0 /* word0 */;
+	u8	byte4 /* byte4 */;
+	u8	byte5 /* byte5 */;
+	__le16	word1 /* word1 */;
+	__le16	word2 /* conn_dpi */;
+	__le16	word3 /* word3 */;
+	__le32	reg9 /* reg9 */;
+	__le32	reg10 /* reg10 */;
+};
+
+struct ustorm_core_conn_ag_ctx {
+	u8	reserved /* cdu_validation */;
+	u8	byte1 /* state */;
+	u8	flags0;
+#define USTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
+#define USTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
+#define USTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
+#define USTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
+#define USTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* timer0cf */
+#define USTORM_CORE_CONN_AG_CTX_CF0_SHIFT     2
+#define USTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* timer1cf */
+#define USTORM_CORE_CONN_AG_CTX_CF1_SHIFT     4
+#define USTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* timer2cf */
+#define USTORM_CORE_CONN_AG_CTX_CF2_SHIFT     6
+	u8 flags1;
+#define USTORM_CORE_CONN_AG_CTX_CF3_MASK      0x3       /* timer_stop_all */
+#define USTORM_CORE_CONN_AG_CTX_CF3_SHIFT     0
+#define USTORM_CORE_CONN_AG_CTX_CF4_MASK      0x3       /* cf4 */
+#define USTORM_CORE_CONN_AG_CTX_CF4_SHIFT     2
+#define USTORM_CORE_CONN_AG_CTX_CF5_MASK      0x3       /* cf5 */
+#define USTORM_CORE_CONN_AG_CTX_CF5_SHIFT     4
+#define USTORM_CORE_CONN_AG_CTX_CF6_MASK      0x3       /* cf6 */
+#define USTORM_CORE_CONN_AG_CTX_CF6_SHIFT     6
+	u8 flags2;
+#define USTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
+#define USTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   0
+#define USTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
+#define USTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   1
+#define USTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
+#define USTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   2
+#define USTORM_CORE_CONN_AG_CTX_CF3EN_MASK    0x1       /* cf3en */
+#define USTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT   3
+#define USTORM_CORE_CONN_AG_CTX_CF4EN_MASK    0x1       /* cf4en */
+#define USTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT   4
+#define USTORM_CORE_CONN_AG_CTX_CF5EN_MASK    0x1       /* cf5en */
+#define USTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT   5
+#define USTORM_CORE_CONN_AG_CTX_CF6EN_MASK    0x1       /* cf6en */
+#define USTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT   6
+#define USTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
+#define USTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7
+	u8 flags3;
+#define USTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
+#define USTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0
+#define USTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
+#define USTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1
+#define USTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
+#define USTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2
+#define USTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
+#define USTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3
+#define USTORM_CORE_CONN_AG_CTX_RULE5EN_MASK  0x1       /* rule5en */
+#define USTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4
+#define USTORM_CORE_CONN_AG_CTX_RULE6EN_MASK  0x1       /* rule6en */
+#define USTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5
+#define USTORM_CORE_CONN_AG_CTX_RULE7EN_MASK  0x1       /* rule7en */
+#define USTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6
+#define USTORM_CORE_CONN_AG_CTX_RULE8EN_MASK  0x1       /* rule8en */
+#define USTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7
+	u8	byte2 /* byte2 */;
+	u8	byte3 /* byte3 */;
+	__le16	word0 /* conn_dpi */;
+	__le16	word1 /* word1 */;
+	__le32	rx_producers /* reg0 */;
+	__le32	reg1 /* reg1 */;
+	__le32	reg2 /* reg2 */;
+	__le32	reg3 /* reg3 */;
+	__le16	word2 /* word2 */;
+	__le16	word3 /* word3 */;
+};
+
 /* The core storm context for the Mstorm */
 struct mstorm_core_conn_st_ctx {
 	__le32 reserved[24];
@@ -349,8 +525,9 @@ struct core_conn_context {
 	struct regpair			pstorm_st_padding[2];
 	struct xstorm_core_conn_st_ctx	xstorm_st_context;
 	struct xstorm_core_conn_ag_ctx	xstorm_ag_context;
+	struct tstorm_core_conn_ag_ctx	tstorm_ag_context;
+	struct ustorm_core_conn_ag_ctx	ustorm_ag_context;
 	struct mstorm_core_conn_st_ctx	mstorm_st_context;
-	struct regpair			mstorm_st_padding[2];
 	struct ustorm_core_conn_st_ctx	ustorm_st_context;
 	struct regpair			ustorm_st_padding[2] /* padding */;
 };
@@ -397,10 +574,12 @@ union event_ring_element {
 };
 
 enum personality_type {
+	BAD_PERSONALITY_TYP,
 	PERSONALITY_RESERVED,
 	PERSONALITY_RESERVED2,
 	PERSONALITY_RDMA_AND_ETH /* Roce or Iwarp */,
 	PERSONALITY_RESERVED3,
+	PERSONALITY_CORE,
 	PERSONALITY_ETH /* Ethernet */,
 	PERSONALITY_RESERVED4,
 	MAX_PERSONALITY_TYPE
@@ -570,7 +749,7 @@ enum block_addr {
 	GRCBASE_NWM		= 0x800000,
 	GRCBASE_NWS		= 0x700000,
 	GRCBASE_MS		= 0x6a0000,
-	GRCBASE_PHY_PCIE	= 0x618000,
+	GRCBASE_PHY_PCIE	= 0x620000,
 	GRCBASE_MISC_AEU	= 0x8000,
 	GRCBASE_BAR0_MAP	= 0x1c00000,
 	MAX_BLOCK_ADDR
@@ -795,13 +974,13 @@ enum init_modes {
 	MODE_RESERVED3,
 	MODE_RESERVED4,
 	MODE_RESERVED5,
+	MODE_RESERVED6,
 	MODE_SF,
 	MODE_MF_SD,
 	MODE_MF_SI,
 	MODE_PORTS_PER_ENG_1,
 	MODE_PORTS_PER_ENG_2,
 	MODE_PORTS_PER_ENG_4,
-	MODE_40G,
 	MODE_100G,
 	MODE_EAGLE_ENG1_WORKAROUND,
 	MAX_INIT_MODES
@@ -816,43 +995,6 @@ enum init_phases {
 	MAX_INIT_PHASES
 };
 
-struct mstorm_core_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define MSTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
-#define MSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
-#define MSTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
-#define MSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
-#define MSTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* cf0 */
-#define MSTORM_CORE_CONN_AG_CTX_CF0_SHIFT     2
-#define MSTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* cf1 */
-#define MSTORM_CORE_CONN_AG_CTX_CF1_SHIFT     4
-#define MSTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* cf2 */
-#define MSTORM_CORE_CONN_AG_CTX_CF2_SHIFT     6
-	u8 flags1;
-#define MSTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
-#define MSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   0
-#define MSTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
-#define MSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   1
-#define MSTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
-#define MSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   2
-#define MSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
-#define MSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 3
-#define MSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
-#define MSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 4
-#define MSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
-#define MSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 5
-#define MSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
-#define MSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 6
-#define MSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
-#define MSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 7
-	__le16	word0 /* word0 */;
-	__le16	word1 /* word1 */;
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-};
-
 /* per encapsulation type enabling flags */
 struct prs_reg_encapsulation_type_en {
 	u8 flags;
@@ -927,250 +1069,44 @@ struct qm_rf_opportunistic_mask {
 };
 
 /* QM hardware structure of QM map memory */
-struct qm_rf_pq_map {
-	u32 reg;
-#define QM_RF_PQ_MAP_PQ_VALID_MASK          0x1         /* PQ active */
-#define QM_RF_PQ_MAP_PQ_VALID_SHIFT         0
-#define QM_RF_PQ_MAP_RL_ID_MASK             0xFF        /* RL ID */
-#define QM_RF_PQ_MAP_RL_ID_SHIFT            1
-#define QM_RF_PQ_MAP_VP_PQ_ID_MASK          0x1FF
-#define QM_RF_PQ_MAP_VP_PQ_ID_SHIFT         9
-#define QM_RF_PQ_MAP_VOQ_MASK               0x1F        /* VOQ */
-#define QM_RF_PQ_MAP_VOQ_SHIFT              18
-#define QM_RF_PQ_MAP_WRR_WEIGHT_GROUP_MASK  0x3         /* WRR weight */
-#define QM_RF_PQ_MAP_WRR_WEIGHT_GROUP_SHIFT 23
-#define QM_RF_PQ_MAP_RL_VALID_MASK          0x1         /* RL active */
-#define QM_RF_PQ_MAP_RL_VALID_SHIFT         25
-#define QM_RF_PQ_MAP_RESERVED_MASK          0x3F
-#define QM_RF_PQ_MAP_RESERVED_SHIFT         26
-};
-
-/* SDM operation gen command (generate aggregative interrupt) */
-struct sdm_op_gen {
-	__le32 command;
-#define SDM_OP_GEN_COMP_PARAM_MASK  0xFFFF      /* completion parameters 0-15 */
-#define SDM_OP_GEN_COMP_PARAM_SHIFT 0
-#define SDM_OP_GEN_COMP_TYPE_MASK   0xF         /* completion type 16-19 */
-#define SDM_OP_GEN_COMP_TYPE_SHIFT  16
-#define SDM_OP_GEN_RESERVED_MASK    0xFFF       /* reserved 20-31 */
-#define SDM_OP_GEN_RESERVED_SHIFT   20
-};
-
-struct tstorm_core_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define TSTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
-#define TSTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
-#define TSTORM_CORE_CONN_AG_CTX_BIT2_MASK     0x1       /* bit2 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT2_SHIFT    2
-#define TSTORM_CORE_CONN_AG_CTX_BIT3_MASK     0x1       /* bit3 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT3_SHIFT    3
-#define TSTORM_CORE_CONN_AG_CTX_BIT4_MASK     0x1       /* bit4 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT4_SHIFT    4
-#define TSTORM_CORE_CONN_AG_CTX_BIT5_MASK     0x1       /* bit5 */
-#define TSTORM_CORE_CONN_AG_CTX_BIT5_SHIFT    5
-#define TSTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* timer0cf */
-#define TSTORM_CORE_CONN_AG_CTX_CF0_SHIFT     6
-	u8 flags1;
-#define TSTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* timer1cf */
-#define TSTORM_CORE_CONN_AG_CTX_CF1_SHIFT     0
-#define TSTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* timer2cf */
-#define TSTORM_CORE_CONN_AG_CTX_CF2_SHIFT     2
-#define TSTORM_CORE_CONN_AG_CTX_CF3_MASK      0x3       /* timer_stop_all */
-#define TSTORM_CORE_CONN_AG_CTX_CF3_SHIFT     4
-#define TSTORM_CORE_CONN_AG_CTX_CF4_MASK      0x3       /* cf4 */
-#define TSTORM_CORE_CONN_AG_CTX_CF4_SHIFT     6
-	u8 flags2;
-#define TSTORM_CORE_CONN_AG_CTX_CF5_MASK      0x3       /* cf5 */
-#define TSTORM_CORE_CONN_AG_CTX_CF5_SHIFT     0
-#define TSTORM_CORE_CONN_AG_CTX_CF6_MASK      0x3       /* cf6 */
-#define TSTORM_CORE_CONN_AG_CTX_CF6_SHIFT     2
-#define TSTORM_CORE_CONN_AG_CTX_CF7_MASK      0x3       /* cf7 */
-#define TSTORM_CORE_CONN_AG_CTX_CF7_SHIFT     4
-#define TSTORM_CORE_CONN_AG_CTX_CF8_MASK      0x3       /* cf8 */
-#define TSTORM_CORE_CONN_AG_CTX_CF8_SHIFT     6
-	u8 flags3;
-#define TSTORM_CORE_CONN_AG_CTX_CF9_MASK      0x3       /* cf9 */
-#define TSTORM_CORE_CONN_AG_CTX_CF9_SHIFT     0
-#define TSTORM_CORE_CONN_AG_CTX_CF10_MASK     0x3       /* cf10 */
-#define TSTORM_CORE_CONN_AG_CTX_CF10_SHIFT    2
-#define TSTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
-#define TSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   4
-#define TSTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
-#define TSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   5
-#define TSTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
-#define TSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   6
-#define TSTORM_CORE_CONN_AG_CTX_CF3EN_MASK    0x1       /* cf3en */
-#define TSTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT   7
-	u8 flags4;
-#define TSTORM_CORE_CONN_AG_CTX_CF4EN_MASK    0x1       /* cf4en */
-#define TSTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT   0
-#define TSTORM_CORE_CONN_AG_CTX_CF5EN_MASK    0x1       /* cf5en */
-#define TSTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT   1
-#define TSTORM_CORE_CONN_AG_CTX_CF6EN_MASK    0x1       /* cf6en */
-#define TSTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT   2
-#define TSTORM_CORE_CONN_AG_CTX_CF7EN_MASK    0x1       /* cf7en */
-#define TSTORM_CORE_CONN_AG_CTX_CF7EN_SHIFT   3
-#define TSTORM_CORE_CONN_AG_CTX_CF8EN_MASK    0x1       /* cf8en */
-#define TSTORM_CORE_CONN_AG_CTX_CF8EN_SHIFT   4
-#define TSTORM_CORE_CONN_AG_CTX_CF9EN_MASK    0x1       /* cf9en */
-#define TSTORM_CORE_CONN_AG_CTX_CF9EN_SHIFT   5
-#define TSTORM_CORE_CONN_AG_CTX_CF10EN_MASK   0x1       /* cf10en */
-#define TSTORM_CORE_CONN_AG_CTX_CF10EN_SHIFT  6
-#define TSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7
-	u8 flags5;
-#define TSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0
-#define TSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1
-#define TSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2
-#define TSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3
-#define TSTORM_CORE_CONN_AG_CTX_RULE5EN_MASK  0x1       /* rule5en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4
-#define TSTORM_CORE_CONN_AG_CTX_RULE6EN_MASK  0x1       /* rule6en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5
-#define TSTORM_CORE_CONN_AG_CTX_RULE7EN_MASK  0x1       /* rule7en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6
-#define TSTORM_CORE_CONN_AG_CTX_RULE8EN_MASK  0x1       /* rule8en */
-#define TSTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-	__le32	reg2 /* reg2 */;
-	__le32	reg3 /* reg3 */;
-	__le32	reg4 /* reg4 */;
-	__le32	reg5 /* reg5 */;
-	__le32	reg6 /* reg6 */;
-	__le32	reg7 /* reg7 */;
-	__le32	reg8 /* reg8 */;
-	u8	byte2 /* byte2 */;
-	u8	byte3 /* byte3 */;
-	__le16	word0 /* word0 */;
-	u8	byte4 /* byte4 */;
-	u8	byte5 /* byte5 */;
-	__le16	word1 /* word1 */;
-	__le16	word2 /* conn_dpi */;
-	__le16	word3 /* word3 */;
-	__le32	reg9 /* reg9 */;
-	__le32	reg10 /* reg10 */;
+struct qm_rf_pq_map {
+	u32 reg;
+#define QM_RF_PQ_MAP_PQ_VALID_MASK          0x1         /* PQ active */
+#define QM_RF_PQ_MAP_PQ_VALID_SHIFT         0
+#define QM_RF_PQ_MAP_RL_ID_MASK             0xFF        /* RL ID */
+#define QM_RF_PQ_MAP_RL_ID_SHIFT            1
+#define QM_RF_PQ_MAP_VP_PQ_ID_MASK          0x1FF
+#define QM_RF_PQ_MAP_VP_PQ_ID_SHIFT         9
+#define QM_RF_PQ_MAP_VOQ_MASK               0x1F        /* VOQ */
+#define QM_RF_PQ_MAP_VOQ_SHIFT              18
+#define QM_RF_PQ_MAP_WRR_WEIGHT_GROUP_MASK  0x3         /* WRR weight */
+#define QM_RF_PQ_MAP_WRR_WEIGHT_GROUP_SHIFT 23
+#define QM_RF_PQ_MAP_RL_VALID_MASK          0x1         /* RL active */
+#define QM_RF_PQ_MAP_RL_VALID_SHIFT         25
+#define QM_RF_PQ_MAP_RESERVED_MASK          0x3F
+#define QM_RF_PQ_MAP_RESERVED_SHIFT         26
 };
 
-struct ustorm_core_conn_ag_ctx {
-	u8	reserved /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define USTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
-#define USTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
-#define USTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
-#define USTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
-#define USTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* timer0cf */
-#define USTORM_CORE_CONN_AG_CTX_CF0_SHIFT     2
-#define USTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* timer1cf */
-#define USTORM_CORE_CONN_AG_CTX_CF1_SHIFT     4
-#define USTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* timer2cf */
-#define USTORM_CORE_CONN_AG_CTX_CF2_SHIFT     6
-	u8 flags1;
-#define USTORM_CORE_CONN_AG_CTX_CF3_MASK      0x3       /* timer_stop_all */
-#define USTORM_CORE_CONN_AG_CTX_CF3_SHIFT     0
-#define USTORM_CORE_CONN_AG_CTX_CF4_MASK      0x3       /* cf4 */
-#define USTORM_CORE_CONN_AG_CTX_CF4_SHIFT     2
-#define USTORM_CORE_CONN_AG_CTX_CF5_MASK      0x3       /* cf5 */
-#define USTORM_CORE_CONN_AG_CTX_CF5_SHIFT     4
-#define USTORM_CORE_CONN_AG_CTX_CF6_MASK      0x3       /* cf6 */
-#define USTORM_CORE_CONN_AG_CTX_CF6_SHIFT     6
-	u8 flags2;
-#define USTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
-#define USTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   0
-#define USTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
-#define USTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   1
-#define USTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
-#define USTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   2
-#define USTORM_CORE_CONN_AG_CTX_CF3EN_MASK    0x1       /* cf3en */
-#define USTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT   3
-#define USTORM_CORE_CONN_AG_CTX_CF4EN_MASK    0x1       /* cf4en */
-#define USTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT   4
-#define USTORM_CORE_CONN_AG_CTX_CF5EN_MASK    0x1       /* cf5en */
-#define USTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT   5
-#define USTORM_CORE_CONN_AG_CTX_CF6EN_MASK    0x1       /* cf6en */
-#define USTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT   6
-#define USTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
-#define USTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7
-	u8 flags3;
-#define USTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
-#define USTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0
-#define USTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
-#define USTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1
-#define USTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
-#define USTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2
-#define USTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
-#define USTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3
-#define USTORM_CORE_CONN_AG_CTX_RULE5EN_MASK  0x1       /* rule5en */
-#define USTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4
-#define USTORM_CORE_CONN_AG_CTX_RULE6EN_MASK  0x1       /* rule6en */
-#define USTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5
-#define USTORM_CORE_CONN_AG_CTX_RULE7EN_MASK  0x1       /* rule7en */
-#define USTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6
-#define USTORM_CORE_CONN_AG_CTX_RULE8EN_MASK  0x1       /* rule8en */
-#define USTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7
-	u8	byte2 /* byte2 */;
-	u8	byte3 /* byte3 */;
-	__le16	word0 /* conn_dpi */;
-	__le16	word1 /* word1 */;
-	__le32	rx_producers /* reg0 */;
-	__le32	reg1 /* reg1 */;
-	__le32	reg2 /* reg2 */;
-	__le32	reg3 /* reg3 */;
-	__le16	word2 /* word2 */;
-	__le16	word3 /* word3 */;
+/* Completion params for aggregated interrupt completion */
+struct sdm_agg_int_comp_params {
+	__le16 params;
+#define SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_MASK      0x3F
+#define SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT     0
+#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_MASK  0x1
+#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT 6
+#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_MASK     0x1FF
+#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_SHIFT    7
 };
 
-struct ystorm_core_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define YSTORM_CORE_CONN_AG_CTX_BIT0_MASK     0x1       /* exist_in_qm0 */
-#define YSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT    0
-#define YSTORM_CORE_CONN_AG_CTX_BIT1_MASK     0x1       /* exist_in_qm1 */
-#define YSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT    1
-#define YSTORM_CORE_CONN_AG_CTX_CF0_MASK      0x3       /* cf0 */
-#define YSTORM_CORE_CONN_AG_CTX_CF0_SHIFT     2
-#define YSTORM_CORE_CONN_AG_CTX_CF1_MASK      0x3       /* cf1 */
-#define YSTORM_CORE_CONN_AG_CTX_CF1_SHIFT     4
-#define YSTORM_CORE_CONN_AG_CTX_CF2_MASK      0x3       /* cf2 */
-#define YSTORM_CORE_CONN_AG_CTX_CF2_SHIFT     6
-	u8 flags1;
-#define YSTORM_CORE_CONN_AG_CTX_CF0EN_MASK    0x1       /* cf0en */
-#define YSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT   0
-#define YSTORM_CORE_CONN_AG_CTX_CF1EN_MASK    0x1       /* cf1en */
-#define YSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT   1
-#define YSTORM_CORE_CONN_AG_CTX_CF2EN_MASK    0x1       /* cf2en */
-#define YSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT   2
-#define YSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK  0x1       /* rule0en */
-#define YSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 3
-#define YSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK  0x1       /* rule1en */
-#define YSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 4
-#define YSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK  0x1       /* rule2en */
-#define YSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 5
-#define YSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK  0x1       /* rule3en */
-#define YSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 6
-#define YSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK  0x1       /* rule4en */
-#define YSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 7
-	u8	byte2 /* byte2 */;
-	u8	byte3 /* byte3 */;
-	__le16	word0 /* word0 */;
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-	__le16	word1 /* word1 */;
-	__le16	word2 /* word2 */;
-	__le16	word3 /* word3 */;
-	__le16	word4 /* word4 */;
-	__le32	reg2 /* reg2 */;
-	__le32	reg3 /* reg3 */;
+/* SDM operation gen command (generate aggregative interrupt) */
+struct sdm_op_gen {
+	__le32 command;
+#define SDM_OP_GEN_COMP_PARAM_MASK  0xFFFF      /* completion parameters 0-15 */
+#define SDM_OP_GEN_COMP_PARAM_SHIFT 0
+#define SDM_OP_GEN_COMP_TYPE_MASK   0xF         /* completion type 16-19 */
+#define SDM_OP_GEN_COMP_TYPE_SHIFT  16
+#define SDM_OP_GEN_RESERVED_MASK    0xFFF       /* reserved 20-31 */
+#define SDM_OP_GEN_RESERVED_SHIFT   20
 };
 
 /*********************************** Init ************************************/
@@ -1274,13 +1210,6 @@ enum chip_ids {
 	MAX_CHIP_IDS
 };
 
-enum idle_chk_severity_types {
-	IDLE_CHK_SEVERITY_ERROR /* idle check failure should cause an error */,
-	IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC,
-	IDLE_CHK_SEVERITY_WARNING,
-	MAX_IDLE_CHK_SEVERITY_TYPES
-};
-
 struct init_array_raw_hdr {
 	__le32 data;
 #define INIT_ARRAY_RAW_HDR_TYPE_MASK    0xF
@@ -1340,14 +1269,6 @@ struct init_callback_op {
 	__le16	block_id /* Blocks ID */;
 };
 
-/* init comparison types */
-enum init_comparison_types {
-	INIT_COMPARISON_EQ /* init value is included in the init command */,
-	INIT_COMPARISON_OR /* init value is all zeros */,
-	INIT_COMPARISON_AND /* init value is an array of values */,
-	MAX_INIT_COMPARISON_TYPES
-};
-
 /* init operation: delay */
 struct init_delay_op {
 	__le32	op_data;
@@ -1444,12 +1365,10 @@ struct init_read_op {
 	__le32 op_data;
 #define INIT_READ_OP_OP_MASK         0xF
 #define INIT_READ_OP_OP_SHIFT        0
-#define INIT_READ_OP_POLL_COMP_MASK  0x7
-#define INIT_READ_OP_POLL_COMP_SHIFT 4
+#define INIT_READ_OP_POLL_TYPE_MASK  0xF
+#define INIT_READ_OP_POLL_TYPE_SHIFT 4
 #define INIT_READ_OP_RESERVED_MASK   0x1
-#define INIT_READ_OP_RESERVED_SHIFT  7
-#define INIT_READ_OP_POLL_MASK       0x1
-#define INIT_READ_OP_POLL_SHIFT      8
+#define INIT_READ_OP_RESERVED_SHIFT  8
 #define INIT_READ_OP_ADDRESS_MASK    0x7FFFFF
 #define INIT_READ_OP_ADDRESS_SHIFT   9
 	__le32 expected_val;
@@ -1477,6 +1396,14 @@ enum init_op_types {
 	MAX_INIT_OP_TYPES
 };
 
+enum init_poll_types {
+	INIT_POLL_NONE /* No polling */,
+	INIT_POLL_EQ /* poll, comparing with expected_val by equality */,
+	INIT_POLL_OR /* poll, comparing with expected_val by bitwise OR */,
+	INIT_POLL_AND /* poll, comparing with expected_val by bitwise AND */,
+	MAX_INIT_POLL_TYPES
+};
+
 /* init source types */
 enum init_source_types {
 	INIT_SRC_INLINE /* init value is included in the init command */,
@@ -1677,175 +1604,213 @@ bool qed_send_qm_stop_cmd(struct qed_hwfn	*p_hwfn,
 			  u16			num_pqs);
 
 /* Ystorm flow control mode. Use enum fw_flow_ctrl_mode */
-#define YSTORM_FLOW_CONTROL_MODE_OFFSET			(IRO[0].base)
-#define YSTORM_FLOW_CONTROL_MODE_SIZE			(IRO[0].size)
+#define YSTORM_FLOW_CONTROL_MODE_OFFSET  (IRO[0].base)
+#define YSTORM_FLOW_CONTROL_MODE_SIZE    (IRO[0].size)
 /* Tstorm port statistics */
-#define TSTORM_PORT_STAT_OFFSET(port_id)		(IRO[1].base + \
-							 ((port_id) * \
-							  IRO[1].m1))
-#define TSTORM_PORT_STAT_SIZE				(IRO[1].size)
+#define TSTORM_PORT_STAT_OFFSET(port_id) (IRO[1].base + ((port_id) * IRO[1].m1))
+#define TSTORM_PORT_STAT_SIZE            (IRO[1].size)
+/* Tstorm ll2 port statistics */
+#define TSTORM_LL2_PORT_STAT_OFFSET(port_id) \
+				(IRO[2].base + ((port_id) * IRO[2].m1))
+#define TSTORM_LL2_PORT_STAT_SIZE            (IRO[2].size)
 /* Ustorm VF-PF Channel ready flag */
-#define USTORM_VF_PF_CHANNEL_READY_OFFSET(vf_id)	(IRO[2].base +	\
-							 ((vf_id) *	\
-							  IRO[2].m1))
-#define USTORM_VF_PF_CHANNEL_READY_SIZE			(IRO[2].size)
+#define USTORM_VF_PF_CHANNEL_READY_OFFSET(vf_id) \
+				(IRO[3].base +	((vf_id) * IRO[3].m1))
+#define USTORM_VF_PF_CHANNEL_READY_SIZE          (IRO[3].size)
 /* Ustorm Final flr cleanup ack */
-#define USTORM_FLR_FINAL_ACK_OFFSET			(IRO[3].base)
-#define USTORM_FLR_FINAL_ACK_SIZE			(IRO[3].size)
+#define USTORM_FLR_FINAL_ACK_OFFSET(pf_id) (IRO[4].base + ((pf_id) * IRO[4].m1))
+#define USTORM_FLR_FINAL_ACK_SIZE          (IRO[4].size)
 /* Ustorm Event ring consumer */
-#define USTORM_EQE_CONS_OFFSET(pf_id)			(IRO[4].base +	\
-							 ((pf_id) *	\
-							  IRO[4].m1))
-#define USTORM_EQE_CONS_SIZE				(IRO[4].size)
-/* Ustorm Completion ring consumer */
-#define USTORM_CQ_CONS_OFFSET(global_queue_id)		(IRO[5].base +	\
-							 ((global_queue_id) * \
-							  IRO[5].m1))
-#define USTORM_CQ_CONS_SIZE				(IRO[5].size)
+#define USTORM_EQE_CONS_OFFSET(pf_id)    (IRO[5].base +	((pf_id) * IRO[5].m1))
+#define USTORM_EQE_CONS_SIZE             (IRO[5].size)
+/* Ustorm Common Queue ring consumer */
+#define USTORM_COMMON_QUEUE_CONS_OFFSET(global_queue_id) \
+			(IRO[6].base + ((global_queue_id) * IRO[6].m1))
+#define USTORM_COMMON_QUEUE_CONS_SIZE    (IRO[6].size)
 /* Xstorm Integration Test Data */
-#define XSTORM_INTEG_TEST_DATA_OFFSET			(IRO[6].base)
-#define XSTORM_INTEG_TEST_DATA_SIZE			(IRO[6].size)
+#define XSTORM_INTEG_TEST_DATA_OFFSET    (IRO[7].base)
+#define XSTORM_INTEG_TEST_DATA_SIZE      (IRO[7].size)
 /* Ystorm Integration Test Data */
-#define YSTORM_INTEG_TEST_DATA_OFFSET			(IRO[7].base)
-#define YSTORM_INTEG_TEST_DATA_SIZE			(IRO[7].size)
+#define YSTORM_INTEG_TEST_DATA_OFFSET    (IRO[8].base)
+#define YSTORM_INTEG_TEST_DATA_SIZE      (IRO[8].size)
 /* Pstorm Integration Test Data */
-#define PSTORM_INTEG_TEST_DATA_OFFSET			(IRO[8].base)
-#define PSTORM_INTEG_TEST_DATA_SIZE			(IRO[8].size)
+#define PSTORM_INTEG_TEST_DATA_OFFSET    (IRO[9].base)
+#define PSTORM_INTEG_TEST_DATA_SIZE      (IRO[9].size)
 /* Tstorm Integration Test Data */
-#define TSTORM_INTEG_TEST_DATA_OFFSET			(IRO[9].base)
-#define TSTORM_INTEG_TEST_DATA_SIZE			(IRO[9].size)
+#define TSTORM_INTEG_TEST_DATA_OFFSET    (IRO[10].base)
+#define TSTORM_INTEG_TEST_DATA_SIZE      (IRO[10].size)
 /* Mstorm Integration Test Data */
-#define MSTORM_INTEG_TEST_DATA_OFFSET			(IRO[10].base)
-#define MSTORM_INTEG_TEST_DATA_SIZE			(IRO[10].size)
+#define MSTORM_INTEG_TEST_DATA_OFFSET    (IRO[11].base)
+#define MSTORM_INTEG_TEST_DATA_SIZE      (IRO[11].size)
 /* Ustorm Integration Test Data */
-#define USTORM_INTEG_TEST_DATA_OFFSET			(IRO[11].base)
-#define USTORM_INTEG_TEST_DATA_SIZE			(IRO[11].size)
+#define USTORM_INTEG_TEST_DATA_OFFSET    (IRO[12].base)
+#define USTORM_INTEG_TEST_DATA_SIZE      (IRO[12].size)
 /* Tstorm producers */
-#define TSTORM_LL2_RX_PRODS_OFFSET(core_rx_queue_id)	(IRO[12].base +	\
-							 ((core_rx_queue_id) * \
-							  IRO[12].m1))
-#define TSTORM_LL2_RX_PRODS_SIZE			(IRO[12].size)
-/* Tstorm LiteL2 queue statistics */
-#define CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(core_rx_q_id) (IRO[13].base + \
-							     ((core_rx_q_id) * \
-							      IRO[13].m1))
-#define CORE_LL2_TSTORM_PER_QUEUE_STAT_SIZE		(IRO[13].size)
+#define TSTORM_LL2_RX_PRODS_OFFSET(core_rx_queue_id) \
+			(IRO[13].base + ((core_rx_queue_id) * IRO[13].m1))
+#define TSTORM_LL2_RX_PRODS_SIZE         (IRO[13].size)
+/* Tstorm LightL2 queue statistics */
+#define CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \
+			(IRO[14].base + ((core_rx_queue_id) * IRO[14].m1))
+#define CORE_LL2_TSTORM_PER_QUEUE_STAT_SIZE    (IRO[14].size)
 /* Ustorm LiteL2 queue statistics */
-#define CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(core_rx_q_id) (IRO[14].base + \
-							     ((core_rx_q_id) * \
-							      IRO[14].m1))
-#define CORE_LL2_USTORM_PER_QUEUE_STAT_SIZE		(IRO[14].size)
+#define CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \
+			(IRO[15].base +	((core_rx_queue_id) * IRO[15].m1))
+#define CORE_LL2_USTORM_PER_QUEUE_STAT_SIZE    (IRO[15].size)
 /* Pstorm LiteL2 queue statistics */
-#define CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(core_txst_id) (IRO[15].base + \
-							     ((core_txst_id) * \
-							      IRO[15].m1))
-#define CORE_LL2_PSTORM_PER_QUEUE_STAT_SIZE		(IRO[15].size)
+#define CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(core_tx_stats_id) \
+			(IRO[16].base +	((core_tx_stats_id) * IRO[16].m1))
+#define CORE_LL2_PSTORM_PER_QUEUE_STAT_SIZE    (IRO[16].size)
 /* Mstorm queue statistics */
-#define MSTORM_QUEUE_STAT_OFFSET(stat_counter_id) (IRO[16].base + \
-						   ((stat_counter_id) *	\
-						    IRO[16].m1))
-#define MSTORM_QUEUE_STAT_SIZE				(IRO[16].size)
+#define MSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \
+			(IRO[17].base + ((stat_counter_id) * IRO[17].m1))
+#define MSTORM_QUEUE_STAT_SIZE                 (IRO[17].size)
 /* Mstorm producers */
-#define MSTORM_PRODS_OFFSET(queue_id)			(IRO[17].base +	\
-							 ((queue_id) *	\
-							  IRO[17].m1))
-#define MSTORM_PRODS_SIZE				(IRO[17].size)
+#define MSTORM_PRODS_OFFSET(queue_id) (IRO[18].base + ((queue_id) * IRO[18].m1))
+#define MSTORM_PRODS_SIZE             (IRO[18].size)
 /* TPA agregation timeout in us resolution (on ASIC) */
-#define MSTORM_TPA_TIMEOUT_US_OFFSET			(IRO[18].base)
-#define MSTORM_TPA_TIMEOUT_US_SIZE			(IRO[18].size)
+#define MSTORM_TPA_TIMEOUT_US_OFFSET  (IRO[19].base)
+#define MSTORM_TPA_TIMEOUT_US_SIZE    (IRO[19].size)
 /* Ustorm queue statistics */
-#define USTORM_QUEUE_STAT_OFFSET(stat_counter_id)	(IRO[19].base +	\
-							((stat_counter_id) * \
-							 IRO[19].m1))
-#define USTORM_QUEUE_STAT_SIZE				(IRO[19].size)
+#define USTORM_QUEUE_STAT_OFFSET(stat_counter_id) \
+			(IRO[20].base + ((stat_counter_id) * IRO[20].m1))
+#define USTORM_QUEUE_STAT_SIZE        (IRO[20].size)
 /* Ustorm queue zone */
-#define USTORM_ETH_QUEUE_ZONE_OFFSET(queue_id)		(IRO[20].base +	\
-							 ((queue_id) *	\
-							  IRO[20].m1))
-#define USTORM_ETH_QUEUE_ZONE_SIZE			(IRO[20].size)
+#define USTORM_ETH_QUEUE_ZONE_OFFSET(queue_id) \
+			(IRO[21].base +	((queue_id) * IRO[21].m1))
+#define USTORM_ETH_QUEUE_ZONE_SIZE    (IRO[21].size)
 /* Pstorm queue statistics */
-#define PSTORM_QUEUE_STAT_OFFSET(stat_counter_id)	(IRO[21].base +	\
-							 ((stat_counter_id) * \
-							  IRO[21].m1))
-#define PSTORM_QUEUE_STAT_SIZE				(IRO[21].size)
+#define PSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \
+		(IRO[22].base + ((stat_counter_id) * IRO[22].m1))
+#define PSTORM_QUEUE_STAT_SIZE        (IRO[22].size)
 /* Tstorm last parser message */
-#define TSTORM_ETH_PRS_INPUT_OFFSET(pf_id)		(IRO[22].base +	\
-							 ((pf_id) *	\
-							  IRO[22].m1))
-#define TSTORM_ETH_PRS_INPUT_SIZE			(IRO[22].size)
+#define TSTORM_ETH_PRS_INPUT_OFFSET  (IRO[23].base)
+#define TSTORM_ETH_PRS_INPUT_SIZE    (IRO[23].size)
+/* Tstorm Eth limit Rx rate */
+#define ETH_RX_RATE_LIMIT_OFFSET(pf_id) (IRO[24].base +	((pf_id) * IRO[24].m1))
+#define ETH_RX_RATE_LIMIT_SIZE       (IRO[24].size)
 /* Ystorm queue zone */
-#define YSTORM_ETH_QUEUE_ZONE_OFFSET(queue_id)		(IRO[23].base +	\
-							 ((queue_id) *	\
-							  IRO[23].m1))
-#define YSTORM_ETH_QUEUE_ZONE_SIZE			(IRO[23].size)
+#define YSTORM_ETH_QUEUE_ZONE_OFFSET(queue_id) \
+			(IRO[25].base +	((queue_id) * IRO[25].m1))
+#define YSTORM_ETH_QUEUE_ZONE_SIZE   (IRO[25].size)
 /* Ystorm cqe producer */
-#define YSTORM_TOE_CQ_PROD_OFFSET(rss_id)		(IRO[24].base +	\
-							 ((rss_id) *	\
-							  IRO[24].m1))
-#define YSTORM_TOE_CQ_PROD_SIZE				(IRO[24].size)
+#define YSTORM_TOE_CQ_PROD_OFFSET(rss_id) \
+			(IRO[26].base + ((rss_id) * IRO[26].m1))
+#define YSTORM_TOE_CQ_PROD_SIZE      (IRO[26].size)
 /* Ustorm cqe producer */
-#define USTORM_TOE_CQ_PROD_OFFSET(rss_id)		(IRO[25].base +	\
-							 ((rss_id) *	\
-							  IRO[25].m1))
-#define USTORM_TOE_CQ_PROD_SIZE				(IRO[25].size)
+#define USTORM_TOE_CQ_PROD_OFFSET(rss_id) \
+			(IRO[27].base + ((rss_id) * IRO[27].m1))
+#define USTORM_TOE_CQ_PROD_SIZE      (IRO[27].size)
 /* Ustorm grq producer */
-#define USTORM_TOE_GRQ_PROD_OFFSET(pf_id)		(IRO[26].base +	\
-							 ((pf_id) *	\
-							  IRO[26].m1))
-#define USTORM_TOE_GRQ_PROD_SIZE			(IRO[26].size)
+#define USTORM_TOE_GRQ_PROD_OFFSET(pf_id) \
+			(IRO[28].base + ((pf_id) * IRO[28].m1))
+#define USTORM_TOE_GRQ_PROD_SIZE     (IRO[28].size)
 /* Tstorm cmdq-cons of given command queue-id */
-#define TSTORM_SCSI_CMDQ_CONS_OFFSET(cmdq_queue_id)	(IRO[27].base +	\
-							 ((cmdq_queue_id) * \
-							  IRO[27].m1))
-#define TSTORM_SCSI_CMDQ_CONS_SIZE			(IRO[27].size)
+#define TSTORM_SCSI_CMDQ_CONS_OFFSET(cmdq_queue_id) \
+			(IRO[29].base + ((cmdq_queue_id) * IRO[29].m1))
+#define TSTORM_SCSI_CMDQ_CONS_SIZE   (IRO[29].size)
 /* Mstorm rq-cons of given queue-id */
-#define MSTORM_SCSI_RQ_CONS_OFFSET(rq_queue_id)		(IRO[28].base +	\
-							 ((rq_queue_id) * \
-							  IRO[28].m1))
-#define MSTORM_SCSI_RQ_CONS_SIZE			(IRO[28].size)
+#define MSTORM_SCSI_RQ_CONS_OFFSET(rq_queue_id) \
+		(IRO[30].base + ((rq_queue_id) * IRO[30].m1))
+#define MSTORM_SCSI_RQ_CONS_SIZE     (IRO[30].size)
+/* Mstorm bdq-external-producer of given BDQ function ID, BDqueue-id */
+#define MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \
+	(IRO[31].base + ((func_id) * IRO[31].m1) + ((bdq_id) * IRO[31].m2))
+#define MSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[31].size)
+/* Tstorm (reflects M-Storm) bdq-external-producer of given fn ID, BDqueue-id */
+#define TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \
+	(IRO[32].base + ((func_id) * IRO[32].m1) + ((bdq_id) * IRO[32].m2))
+#define TSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[32].size)
+/* Tstorm iSCSI RX stats */
+#define TSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
+				(IRO[33].base + ((pf_id) * IRO[33].m1))
+#define TSTORM_ISCSI_RX_STATS_SIZE    (IRO[33].size)
+/* Mstorm iSCSI RX stats */
+#define MSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
+				(IRO[34].base + ((pf_id) * IRO[34].m1))
+#define MSTORM_ISCSI_RX_STATS_SIZE    (IRO[34].size)
+/* Ustorm iSCSI RX stats */
+#define USTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
+				(IRO[35].base +	((pf_id) * IRO[35].m1))
+#define USTORM_ISCSI_RX_STATS_SIZE    (IRO[35].size)
+/* Xstorm iSCSI TX stats */
+#define XSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
+				(IRO[36].base +	((pf_id) * IRO[36].m1))
+#define XSTORM_ISCSI_TX_STATS_SIZE    (IRO[36].size)
+/* Ystorm iSCSI TX stats */
+#define YSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
+				(IRO[37].base +	((pf_id) * IRO[37].m1))
+#define YSTORM_ISCSI_TX_STATS_SIZE    (IRO[37].size)
+/* Pstorm iSCSI TX stats */
+#define PSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
+				(IRO[38].base +	((pf_id) * IRO[38].m1))
+#define PSTORM_ISCSI_TX_STATS_SIZE    (IRO[38].size)
+/* Tstorm FCoE RX stats */
+#define TSTORM_FCOE_RX_STATS_OFFSET(pf_id) \
+				(IRO[39].base +	((pf_id) * IRO[39].m1))
+#define TSTORM_FCOE_RX_STATS_SIZE      (IRO[39].size)
+/* Mstorm FCoE RX stats */
+#define MSTORM_FCOE_RX_STATS_OFFSET(pf_id) \
+				(IRO[40].base +	((pf_id) * IRO[40].m1))
+#define MSTORM_FCOE_RX_STATS_SIZE      (IRO[40].size)
+/* Pstorm FCoE TX stats */
+#define PSTORM_FCOE_TX_STATS_OFFSET(pf_id) \
+				(IRO[41].base +	((pf_id) * IRO[41].m1))
+#define PSTORM_FCOE_TX_STATS_SIZE      (IRO[41].size)
 /* Pstorm RoCE statistics */
-#define PSTORM_ROCE_STAT_OFFSET(stat_counter_id)	(IRO[29].base +	\
-							 ((stat_counter_id) * \
-							  IRO[29].m1))
-#define PSTORM_ROCE_STAT_SIZE				(IRO[29].size)
+#define PSTORM_ROCE_STAT_OFFSET(stat_counter_id) \
+			(IRO[42].base + ((stat_counter_id) * IRO[42].m1))
+#define PSTORM_ROCE_STAT_SIZE          (IRO[42].size)
 /* Tstorm RoCE statistics */
-#define TSTORM_ROCE_STAT_OFFSET(stat_counter_id)	(IRO[30].base +	\
-							 ((stat_counter_id) * \
-							  IRO[30].m1))
-#define TSTORM_ROCE_STAT_SIZE				(IRO[30].size)
-
-static const struct iro iro_arr[31] = {
-	{ 0x10,	  0x0,	 0x0,	0x0,   0x8     },
-	{ 0x4448, 0x60,	 0x0,	0x0,   0x60    },
-	{ 0x498,  0x8,	 0x0,	0x0,   0x4     },
-	{ 0x494,  0x0,	 0x0,	0x0,   0x4     },
-	{ 0x10,	  0x8,	 0x0,	0x0,   0x2     },
-	{ 0x90,	  0x8,	 0x0,	0x0,   0x2     },
-	{ 0x4540, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x39e0, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x2598, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x4350, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x52d0, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x7a48, 0x0,	 0x0,	0x0,   0xf8    },
-	{ 0x100,  0x8,	 0x0,	0x0,   0x8     },
-	{ 0x5808, 0x10,	 0x0,	0x0,   0x10    },
-	{ 0xb100, 0x30,	 0x0,	0x0,   0x30    },
-	{ 0x95c0, 0x30,	 0x0,	0x0,   0x30    },
-	{ 0x54f8, 0x40,	 0x0,	0x0,   0x40    },
-	{ 0x200,  0x10,	 0x0,	0x0,   0x8     },
-	{ 0x9e70, 0x0,	 0x0,	0x0,   0x4     },
-	{ 0x7ca0, 0x40,	 0x0,	0x0,   0x30    },
-	{ 0xd00,  0x8,	 0x0,	0x0,   0x8     },
-	{ 0x2790, 0x80,	 0x0,	0x0,   0x38    },
-	{ 0xa520, 0xf0,	 0x0,	0x0,   0xf0    },
-	{ 0x80,	  0x8,	 0x0,	0x0,   0x8     },
-	{ 0xac0,  0x8,	 0x0,	0x0,   0x8     },
-	{ 0x2580, 0x8,	 0x0,	0x0,   0x8     },
-	{ 0x2500, 0x8,	 0x0,	0x0,   0x8     },
-	{ 0x440,  0x8,	 0x0,	0x0,   0x2     },
-	{ 0x1800, 0x8,	 0x0,	0x0,   0x2     },
-	{ 0x27c8, 0x80,	 0x0,	0x0,   0x10    },
-	{ 0x4710, 0x10,	 0x0,	0x0,   0x10    },
+#define TSTORM_ROCE_STAT_OFFSET(stat_counter_id) \
+			(IRO[43].base + ((stat_counter_id) * IRO[43].m1))
+#define TSTORM_ROCE_STAT_SIZE          (IRO[43].size)
+
+static const struct iro iro_arr[44] = { /* [n] presumably backs IRO[n] */
+	{ 0x10,	   0x0,	   0x0,	   0x0,	   0x8	    }, /* [0] */
+	{ 0x47c8,  0x60,   0x0,	   0x0,	   0x60	    }, /* [1] */
+	{ 0x5e30,  0x20,   0x0,	   0x0,	   0x20	    }, /* [2] */
+	{ 0x510,   0x8,	   0x0,	   0x0,	   0x4	    }, /* [3] */
+	{ 0x490,   0x8,	   0x0,	   0x0,	   0x4	    }, /* [4] */
+	{ 0x10,	   0x8,	   0x0,	   0x0,	   0x2	    }, /* [5] */
+	{ 0x90,	   0x8,	   0x0,	   0x0,	   0x2	    }, /* [6] */
+	{ 0x4940,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [7] */
+	{ 0x3de0,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [8] */
+	{ 0x2998,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [9] */
+	{ 0x4750,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [10] */
+	{ 0x56d0,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [11] */
+	{ 0x7e50,  0x0,	   0x0,	   0x0,	   0x78	    }, /* [12] */
+	{ 0x100,   0x8,	   0x0,	   0x0,	   0x8	    }, /* [13] */
+	{ 0x5c10,  0x10,   0x0,	   0x0,	   0x10	    }, /* [14] */
+	{ 0xb508,  0x30,   0x0,	   0x0,	   0x30	    }, /* [15] */
+	{ 0x95c0,  0x30,   0x0,	   0x0,	   0x30	    }, /* [16] */
+	{ 0x58a0,  0x40,   0x0,	   0x0,	   0x40	    }, /* [17] */
+	{ 0x200,   0x10,   0x0,	   0x0,	   0x8	    }, /* [18] */
+	{ 0xa230,  0x0,	   0x0,	   0x0,	   0x4	    }, /* [19] */
+	{ 0x8058,  0x40,   0x0,	   0x0,	   0x30	    }, /* [20] */
+	{ 0xd00,   0x8,	   0x0,	   0x0,	   0x8	    }, /* [21] */
+	{ 0x2b30,  0x80,   0x0,	   0x0,	   0x38	    }, /* [22] */
+	{ 0xa808,  0x0,	   0x0,	   0x0,	   0xf0	    }, /* [23] */
+	{ 0xa8f8,  0x8,	   0x0,	   0x0,	   0x8	    }, /* [24] */
+	{ 0x80,	   0x8,	   0x0,	   0x0,	   0x8	    }, /* [25] */
+	{ 0xac0,   0x8,	   0x0,	   0x0,	   0x8	    }, /* [26] */
+	{ 0x2580,  0x8,	   0x0,	   0x0,	   0x8	    }, /* [27] */
+	{ 0x2500,  0x8,	   0x0,	   0x0,	   0x8	    }, /* [28] */
+	{ 0x440,   0x8,	   0x0,	   0x0,	   0x2	    }, /* [29] */
+	{ 0x1800,  0x8,	   0x0,	   0x0,	   0x2	    }, /* [30] */
+	{ 0x1a00,  0x10,   0x8,	   0x0,	   0x2	    }, /* [31] */
+	{ 0x640,   0x10,   0x8,	   0x0,	   0x2	    }, /* [32] */
+	{ 0xd9b8,  0x38,   0x0,	   0x0,	   0x24	    }, /* [33] */
+	{ 0x11048, 0x10,   0x0,	   0x0,	   0x8	    }, /* [34] */
+	{ 0x11678, 0x38,   0x0,	   0x0,	   0x18	    }, /* [35] */
+	{ 0xaec0,  0x30,   0x0,	   0x0,	   0x10	    }, /* [36] */
+	{ 0x8700,  0x28,   0x0,	   0x0,	   0x18	    }, /* [37] */
+	{ 0xec00,  0x10,   0x0,	   0x0,	   0x10	    }, /* [38] */
+	{ 0xde38,  0x40,   0x0,	   0x0,	   0x30	    }, /* [39] */
+	{ 0x121a8, 0x38,   0x0,	   0x0,	   0x8	    }, /* [40] */
+	{ 0xf068,  0x20,   0x0,	   0x0,	   0x20	    }, /* [41] */
+	{ 0x2b68,  0x80,   0x0,	   0x0,	   0x10	    }, /* [42] */
+	{ 0x4ab8,  0x10,   0x0,	   0x0,	   0x10	    }, /* [43] */
 };
 
 /* Runtime array offsets */
@@ -1866,426 +1831,427 @@ static const struct iro iro_arr[31] = {
 #define DORQ_REG_VF_MAX_ICID_6_RT_OFFSET                                14
 #define DORQ_REG_VF_MAX_ICID_7_RT_OFFSET                                15
 #define DORQ_REG_PF_WAKE_ALL_RT_OFFSET                                  16
-#define IGU_REG_PF_CONFIGURATION_RT_OFFSET                              17
-#define IGU_REG_VF_CONFIGURATION_RT_OFFSET                              18
-#define IGU_REG_ATTN_MSG_ADDR_L_RT_OFFSET                               19
-#define IGU_REG_ATTN_MSG_ADDR_H_RT_OFFSET                               20
-#define IGU_REG_LEADING_EDGE_LATCH_RT_OFFSET                            21
-#define IGU_REG_TRAILING_EDGE_LATCH_RT_OFFSET                           22
-#define CAU_REG_CQE_AGG_UNIT_SIZE_RT_OFFSET                             23
-#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET                                 760
+#define DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET                               17
+#define IGU_REG_PF_CONFIGURATION_RT_OFFSET                              18
+#define IGU_REG_VF_CONFIGURATION_RT_OFFSET                              19
+#define IGU_REG_ATTN_MSG_ADDR_L_RT_OFFSET                               20
+#define IGU_REG_ATTN_MSG_ADDR_H_RT_OFFSET                               21
+#define IGU_REG_LEADING_EDGE_LATCH_RT_OFFSET                            22
+#define IGU_REG_TRAILING_EDGE_LATCH_RT_OFFSET                           23
+#define CAU_REG_CQE_AGG_UNIT_SIZE_RT_OFFSET                             24
+#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET                                 761
 #define CAU_REG_SB_VAR_MEMORY_RT_SIZE                                   736
-#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET                                 760
+#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET                                 761
 #define CAU_REG_SB_VAR_MEMORY_RT_SIZE                                   736
-#define CAU_REG_SB_ADDR_MEMORY_RT_OFFSET                                1496
+#define CAU_REG_SB_ADDR_MEMORY_RT_OFFSET                                1497
 #define CAU_REG_SB_ADDR_MEMORY_RT_SIZE                                  736
-#define CAU_REG_PI_MEMORY_RT_OFFSET                                     2232
+#define CAU_REG_PI_MEMORY_RT_OFFSET                                     2233
 #define CAU_REG_PI_MEMORY_RT_SIZE                                       4416
-#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET                    6648
-#define PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET                      6649
-#define PRS_REG_TASK_ID_MAX_INITIATOR_VF_RT_OFFSET                      6650
-#define PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET                         6651
-#define PRS_REG_TASK_ID_MAX_TARGET_VF_RT_OFFSET                         6652
-#define PRS_REG_SEARCH_TCP_RT_OFFSET                                    6653
-#define PRS_REG_SEARCH_FCOE_RT_OFFSET                                   6654
-#define PRS_REG_SEARCH_ROCE_RT_OFFSET                                   6655
-#define PRS_REG_ROCE_DEST_QP_MAX_VF_RT_OFFSET                           6656
-#define PRS_REG_ROCE_DEST_QP_MAX_PF_RT_OFFSET                           6657
-#define PRS_REG_SEARCH_OPENFLOW_RT_OFFSET                               6658
-#define PRS_REG_SEARCH_NON_IP_AS_OPENFLOW_RT_OFFSET                     6659
-#define PRS_REG_OPENFLOW_SUPPORT_ONLY_KNOWN_OVER_IP_RT_OFFSET           6660
-#define PRS_REG_OPENFLOW_SEARCH_KEY_MASK_RT_OFFSET                      6661
-#define PRS_REG_LIGHT_L2_ETHERTYPE_EN_RT_OFFSET                         6662
-#define SRC_REG_FIRSTFREE_RT_OFFSET                                     6663
+#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET                    6649
+#define PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET                      6650
+#define PRS_REG_TASK_ID_MAX_INITIATOR_VF_RT_OFFSET                      6651
+#define PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET                         6652
+#define PRS_REG_TASK_ID_MAX_TARGET_VF_RT_OFFSET                         6653
+#define PRS_REG_SEARCH_TCP_RT_OFFSET                                    6654
+#define PRS_REG_SEARCH_FCOE_RT_OFFSET                                   6655
+#define PRS_REG_SEARCH_ROCE_RT_OFFSET                                   6656
+#define PRS_REG_ROCE_DEST_QP_MAX_VF_RT_OFFSET                           6657
+#define PRS_REG_ROCE_DEST_QP_MAX_PF_RT_OFFSET                           6658
+#define PRS_REG_SEARCH_OPENFLOW_RT_OFFSET                               6659
+#define PRS_REG_SEARCH_NON_IP_AS_OPENFLOW_RT_OFFSET                     6660
+#define PRS_REG_OPENFLOW_SUPPORT_ONLY_KNOWN_OVER_IP_RT_OFFSET           6661
+#define PRS_REG_OPENFLOW_SEARCH_KEY_MASK_RT_OFFSET                      6662
+#define PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET                               6663
+#define PRS_REG_LIGHT_L2_ETHERTYPE_EN_RT_OFFSET                         6664
+#define SRC_REG_FIRSTFREE_RT_OFFSET                                     6665
 #define SRC_REG_FIRSTFREE_RT_SIZE                                       2
-#define SRC_REG_LASTFREE_RT_OFFSET                                      6665
+#define SRC_REG_LASTFREE_RT_OFFSET                                      6667
 #define SRC_REG_LASTFREE_RT_SIZE                                        2
-#define SRC_REG_COUNTFREE_RT_OFFSET                                     6667
-#define SRC_REG_NUMBER_HASH_BITS_RT_OFFSET                              6668
-#define PSWRQ2_REG_CDUT_P_SIZE_RT_OFFSET                                6669
-#define PSWRQ2_REG_CDUC_P_SIZE_RT_OFFSET                                6670
-#define PSWRQ2_REG_TM_P_SIZE_RT_OFFSET                                  6671
-#define PSWRQ2_REG_QM_P_SIZE_RT_OFFSET                                  6672
-#define PSWRQ2_REG_SRC_P_SIZE_RT_OFFSET                                 6673
-#define PSWRQ2_REG_TM_FIRST_ILT_RT_OFFSET                               6674
-#define PSWRQ2_REG_TM_LAST_ILT_RT_OFFSET                                6675
-#define PSWRQ2_REG_QM_FIRST_ILT_RT_OFFSET                               6676
-#define PSWRQ2_REG_QM_LAST_ILT_RT_OFFSET                                6677
-#define PSWRQ2_REG_SRC_FIRST_ILT_RT_OFFSET                              6678
-#define PSWRQ2_REG_SRC_LAST_ILT_RT_OFFSET                               6679
-#define PSWRQ2_REG_CDUC_FIRST_ILT_RT_OFFSET                             6680
-#define PSWRQ2_REG_CDUC_LAST_ILT_RT_OFFSET                              6681
-#define PSWRQ2_REG_CDUT_FIRST_ILT_RT_OFFSET                             6682
-#define PSWRQ2_REG_CDUT_LAST_ILT_RT_OFFSET                              6683
-#define PSWRQ2_REG_TSDM_FIRST_ILT_RT_OFFSET                             6684
-#define PSWRQ2_REG_TSDM_LAST_ILT_RT_OFFSET                              6685
-#define PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET                     6686
-#define PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET                   6687
-#define PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET                   6688
-#define PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET                               6689
-#define PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET                             6690
-#define PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET                             6691
-#define PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET                           6692
-#define PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET                         6693
-#define PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET                         6694
-#define PSWRQ2_REG_VF_BASE_RT_OFFSET                                    6695
-#define PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET                                6696
-#define PSWRQ2_REG_WR_MBS0_RT_OFFSET                                    6697
-#define PSWRQ2_REG_RD_MBS0_RT_OFFSET                                    6698
-#define PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET                              6699
-#define PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET                              6700
-#define PSWRQ2_REG_ILT_MEMORY_RT_OFFSET                                 6701
+#define SRC_REG_COUNTFREE_RT_OFFSET                                     6669
+#define SRC_REG_NUMBER_HASH_BITS_RT_OFFSET                              6670
+#define PSWRQ2_REG_CDUT_P_SIZE_RT_OFFSET                                6671
+#define PSWRQ2_REG_CDUC_P_SIZE_RT_OFFSET                                6672
+#define PSWRQ2_REG_TM_P_SIZE_RT_OFFSET                                  6673
+#define PSWRQ2_REG_QM_P_SIZE_RT_OFFSET                                  6674
+#define PSWRQ2_REG_SRC_P_SIZE_RT_OFFSET                                 6675
+#define PSWRQ2_REG_TM_FIRST_ILT_RT_OFFSET                               6676
+#define PSWRQ2_REG_TM_LAST_ILT_RT_OFFSET                                6677
+#define PSWRQ2_REG_QM_FIRST_ILT_RT_OFFSET                               6678
+#define PSWRQ2_REG_QM_LAST_ILT_RT_OFFSET                                6679
+#define PSWRQ2_REG_SRC_FIRST_ILT_RT_OFFSET                              6680
+#define PSWRQ2_REG_SRC_LAST_ILT_RT_OFFSET                               6681
+#define PSWRQ2_REG_CDUC_FIRST_ILT_RT_OFFSET                             6682
+#define PSWRQ2_REG_CDUC_LAST_ILT_RT_OFFSET                              6683
+#define PSWRQ2_REG_CDUT_FIRST_ILT_RT_OFFSET                             6684
+#define PSWRQ2_REG_CDUT_LAST_ILT_RT_OFFSET                              6685
+#define PSWRQ2_REG_TSDM_FIRST_ILT_RT_OFFSET                             6686
+#define PSWRQ2_REG_TSDM_LAST_ILT_RT_OFFSET                              6687
+#define PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET                     6688
+#define PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET                   6689
+#define PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET                   6690
+#define PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET                               6691
+#define PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET                             6692
+#define PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET                             6693
+#define PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET                           6694
+#define PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET                         6695
+#define PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET                         6696
+#define PSWRQ2_REG_VF_BASE_RT_OFFSET                                    6697
+#define PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET                                6698
+#define PSWRQ2_REG_WR_MBS0_RT_OFFSET                                    6699
+#define PSWRQ2_REG_RD_MBS0_RT_OFFSET                                    6700
+#define PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET                              6701
+#define PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET                              6702
+#define PSWRQ2_REG_ILT_MEMORY_RT_OFFSET                                 6703
 #define PSWRQ2_REG_ILT_MEMORY_RT_SIZE                                   22000
-#define PGLUE_REG_B_VF_BASE_RT_OFFSET                                   28701
-#define PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET                           28702
-#define PGLUE_REG_B_PF_BAR0_SIZE_RT_OFFSET                              28703
-#define PGLUE_REG_B_PF_BAR1_SIZE_RT_OFFSET                              28704
-#define PGLUE_REG_B_VF_BAR1_SIZE_RT_OFFSET                              28705
-#define TM_REG_VF_ENABLE_CONN_RT_OFFSET                                 28706
-#define TM_REG_PF_ENABLE_CONN_RT_OFFSET                                 28707
-#define TM_REG_PF_ENABLE_TASK_RT_OFFSET                                 28708
-#define TM_REG_GROUP_SIZE_RESOLUTION_CONN_RT_OFFSET                     28709
-#define TM_REG_GROUP_SIZE_RESOLUTION_TASK_RT_OFFSET                     28710
-#define TM_REG_CONFIG_CONN_MEM_RT_OFFSET                                28711
+#define PGLUE_REG_B_VF_BASE_RT_OFFSET                                   28703
+#define PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET                           28704
+#define PGLUE_REG_B_PF_BAR0_SIZE_RT_OFFSET                              28705
+#define PGLUE_REG_B_PF_BAR1_SIZE_RT_OFFSET                              28706
+#define PGLUE_REG_B_VF_BAR1_SIZE_RT_OFFSET                              28707
+#define TM_REG_VF_ENABLE_CONN_RT_OFFSET                                 28708
+#define TM_REG_PF_ENABLE_CONN_RT_OFFSET                                 28709
+#define TM_REG_PF_ENABLE_TASK_RT_OFFSET                                 28710
+#define TM_REG_GROUP_SIZE_RESOLUTION_CONN_RT_OFFSET                     28711
+#define TM_REG_GROUP_SIZE_RESOLUTION_TASK_RT_OFFSET                     28712
+#define TM_REG_CONFIG_CONN_MEM_RT_OFFSET                                28713
 #define TM_REG_CONFIG_CONN_MEM_RT_SIZE                                  416
-#define TM_REG_CONFIG_TASK_MEM_RT_OFFSET                                29127
+#define TM_REG_CONFIG_TASK_MEM_RT_OFFSET                                29129
 #define TM_REG_CONFIG_TASK_MEM_RT_SIZE                                  512
-#define QM_REG_MAXPQSIZE_0_RT_OFFSET                                    29639
-#define QM_REG_MAXPQSIZE_1_RT_OFFSET                                    29640
-#define QM_REG_MAXPQSIZE_2_RT_OFFSET                                    29641
-#define QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET                               29642
-#define QM_REG_MAXPQSIZETXSEL_1_RT_OFFSET                               29643
-#define QM_REG_MAXPQSIZETXSEL_2_RT_OFFSET                               29644
-#define QM_REG_MAXPQSIZETXSEL_3_RT_OFFSET                               29645
-#define QM_REG_MAXPQSIZETXSEL_4_RT_OFFSET                               29646
-#define QM_REG_MAXPQSIZETXSEL_5_RT_OFFSET                               29647
-#define QM_REG_MAXPQSIZETXSEL_6_RT_OFFSET                               29648
-#define QM_REG_MAXPQSIZETXSEL_7_RT_OFFSET                               29649
-#define QM_REG_MAXPQSIZETXSEL_8_RT_OFFSET                               29650
-#define QM_REG_MAXPQSIZETXSEL_9_RT_OFFSET                               29651
-#define QM_REG_MAXPQSIZETXSEL_10_RT_OFFSET                              29652
-#define QM_REG_MAXPQSIZETXSEL_11_RT_OFFSET                              29653
-#define QM_REG_MAXPQSIZETXSEL_12_RT_OFFSET                              29654
-#define QM_REG_MAXPQSIZETXSEL_13_RT_OFFSET                              29655
-#define QM_REG_MAXPQSIZETXSEL_14_RT_OFFSET                              29656
-#define QM_REG_MAXPQSIZETXSEL_15_RT_OFFSET                              29657
-#define QM_REG_MAXPQSIZETXSEL_16_RT_OFFSET                              29658
-#define QM_REG_MAXPQSIZETXSEL_17_RT_OFFSET                              29659
-#define QM_REG_MAXPQSIZETXSEL_18_RT_OFFSET                              29660
-#define QM_REG_MAXPQSIZETXSEL_19_RT_OFFSET                              29661
-#define QM_REG_MAXPQSIZETXSEL_20_RT_OFFSET                              29662
-#define QM_REG_MAXPQSIZETXSEL_21_RT_OFFSET                              29663
-#define QM_REG_MAXPQSIZETXSEL_22_RT_OFFSET                              29664
-#define QM_REG_MAXPQSIZETXSEL_23_RT_OFFSET                              29665
-#define QM_REG_MAXPQSIZETXSEL_24_RT_OFFSET                              29666
-#define QM_REG_MAXPQSIZETXSEL_25_RT_OFFSET                              29667
-#define QM_REG_MAXPQSIZETXSEL_26_RT_OFFSET                              29668
-#define QM_REG_MAXPQSIZETXSEL_27_RT_OFFSET                              29669
-#define QM_REG_MAXPQSIZETXSEL_28_RT_OFFSET                              29670
-#define QM_REG_MAXPQSIZETXSEL_29_RT_OFFSET                              29671
-#define QM_REG_MAXPQSIZETXSEL_30_RT_OFFSET                              29672
-#define QM_REG_MAXPQSIZETXSEL_31_RT_OFFSET                              29673
-#define QM_REG_MAXPQSIZETXSEL_32_RT_OFFSET                              29674
-#define QM_REG_MAXPQSIZETXSEL_33_RT_OFFSET                              29675
-#define QM_REG_MAXPQSIZETXSEL_34_RT_OFFSET                              29676
-#define QM_REG_MAXPQSIZETXSEL_35_RT_OFFSET                              29677
-#define QM_REG_MAXPQSIZETXSEL_36_RT_OFFSET                              29678
-#define QM_REG_MAXPQSIZETXSEL_37_RT_OFFSET                              29679
-#define QM_REG_MAXPQSIZETXSEL_38_RT_OFFSET                              29680
-#define QM_REG_MAXPQSIZETXSEL_39_RT_OFFSET                              29681
-#define QM_REG_MAXPQSIZETXSEL_40_RT_OFFSET                              29682
-#define QM_REG_MAXPQSIZETXSEL_41_RT_OFFSET                              29683
-#define QM_REG_MAXPQSIZETXSEL_42_RT_OFFSET                              29684
-#define QM_REG_MAXPQSIZETXSEL_43_RT_OFFSET                              29685
-#define QM_REG_MAXPQSIZETXSEL_44_RT_OFFSET                              29686
-#define QM_REG_MAXPQSIZETXSEL_45_RT_OFFSET                              29687
-#define QM_REG_MAXPQSIZETXSEL_46_RT_OFFSET                              29688
-#define QM_REG_MAXPQSIZETXSEL_47_RT_OFFSET                              29689
-#define QM_REG_MAXPQSIZETXSEL_48_RT_OFFSET                              29690
-#define QM_REG_MAXPQSIZETXSEL_49_RT_OFFSET                              29691
-#define QM_REG_MAXPQSIZETXSEL_50_RT_OFFSET                              29692
-#define QM_REG_MAXPQSIZETXSEL_51_RT_OFFSET                              29693
-#define QM_REG_MAXPQSIZETXSEL_52_RT_OFFSET                              29694
-#define QM_REG_MAXPQSIZETXSEL_53_RT_OFFSET                              29695
-#define QM_REG_MAXPQSIZETXSEL_54_RT_OFFSET                              29696
-#define QM_REG_MAXPQSIZETXSEL_55_RT_OFFSET                              29697
-#define QM_REG_MAXPQSIZETXSEL_56_RT_OFFSET                              29698
-#define QM_REG_MAXPQSIZETXSEL_57_RT_OFFSET                              29699
-#define QM_REG_MAXPQSIZETXSEL_58_RT_OFFSET                              29700
-#define QM_REG_MAXPQSIZETXSEL_59_RT_OFFSET                              29701
-#define QM_REG_MAXPQSIZETXSEL_60_RT_OFFSET                              29702
-#define QM_REG_MAXPQSIZETXSEL_61_RT_OFFSET                              29703
-#define QM_REG_MAXPQSIZETXSEL_62_RT_OFFSET                              29704
-#define QM_REG_MAXPQSIZETXSEL_63_RT_OFFSET                              29705
-#define QM_REG_BASEADDROTHERPQ_RT_OFFSET                                29706
+#define QM_REG_MAXPQSIZE_0_RT_OFFSET                                    29641
+#define QM_REG_MAXPQSIZE_1_RT_OFFSET                                    29642
+#define QM_REG_MAXPQSIZE_2_RT_OFFSET                                    29643
+#define QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET                               29644
+#define QM_REG_MAXPQSIZETXSEL_1_RT_OFFSET                               29645
+#define QM_REG_MAXPQSIZETXSEL_2_RT_OFFSET                               29646
+#define QM_REG_MAXPQSIZETXSEL_3_RT_OFFSET                               29647
+#define QM_REG_MAXPQSIZETXSEL_4_RT_OFFSET                               29648
+#define QM_REG_MAXPQSIZETXSEL_5_RT_OFFSET                               29649
+#define QM_REG_MAXPQSIZETXSEL_6_RT_OFFSET                               29650
+#define QM_REG_MAXPQSIZETXSEL_7_RT_OFFSET                               29651
+#define QM_REG_MAXPQSIZETXSEL_8_RT_OFFSET                               29652
+#define QM_REG_MAXPQSIZETXSEL_9_RT_OFFSET                               29653
+#define QM_REG_MAXPQSIZETXSEL_10_RT_OFFSET                              29654
+#define QM_REG_MAXPQSIZETXSEL_11_RT_OFFSET                              29655
+#define QM_REG_MAXPQSIZETXSEL_12_RT_OFFSET                              29656
+#define QM_REG_MAXPQSIZETXSEL_13_RT_OFFSET                              29657
+#define QM_REG_MAXPQSIZETXSEL_14_RT_OFFSET                              29658
+#define QM_REG_MAXPQSIZETXSEL_15_RT_OFFSET                              29659
+#define QM_REG_MAXPQSIZETXSEL_16_RT_OFFSET                              29660
+#define QM_REG_MAXPQSIZETXSEL_17_RT_OFFSET                              29661
+#define QM_REG_MAXPQSIZETXSEL_18_RT_OFFSET                              29662
+#define QM_REG_MAXPQSIZETXSEL_19_RT_OFFSET                              29663
+#define QM_REG_MAXPQSIZETXSEL_20_RT_OFFSET                              29664
+#define QM_REG_MAXPQSIZETXSEL_21_RT_OFFSET                              29665
+#define QM_REG_MAXPQSIZETXSEL_22_RT_OFFSET                              29666
+#define QM_REG_MAXPQSIZETXSEL_23_RT_OFFSET                              29667
+#define QM_REG_MAXPQSIZETXSEL_24_RT_OFFSET                              29668
+#define QM_REG_MAXPQSIZETXSEL_25_RT_OFFSET                              29669
+#define QM_REG_MAXPQSIZETXSEL_26_RT_OFFSET                              29670
+#define QM_REG_MAXPQSIZETXSEL_27_RT_OFFSET                              29671
+#define QM_REG_MAXPQSIZETXSEL_28_RT_OFFSET                              29672
+#define QM_REG_MAXPQSIZETXSEL_29_RT_OFFSET                              29673
+#define QM_REG_MAXPQSIZETXSEL_30_RT_OFFSET                              29674
+#define QM_REG_MAXPQSIZETXSEL_31_RT_OFFSET                              29675
+#define QM_REG_MAXPQSIZETXSEL_32_RT_OFFSET                              29676
+#define QM_REG_MAXPQSIZETXSEL_33_RT_OFFSET                              29677
+#define QM_REG_MAXPQSIZETXSEL_34_RT_OFFSET                              29678
+#define QM_REG_MAXPQSIZETXSEL_35_RT_OFFSET                              29679
+#define QM_REG_MAXPQSIZETXSEL_36_RT_OFFSET                              29680
+#define QM_REG_MAXPQSIZETXSEL_37_RT_OFFSET                              29681
+#define QM_REG_MAXPQSIZETXSEL_38_RT_OFFSET                              29682
+#define QM_REG_MAXPQSIZETXSEL_39_RT_OFFSET                              29683
+#define QM_REG_MAXPQSIZETXSEL_40_RT_OFFSET                              29684
+#define QM_REG_MAXPQSIZETXSEL_41_RT_OFFSET                              29685
+#define QM_REG_MAXPQSIZETXSEL_42_RT_OFFSET                              29686
+#define QM_REG_MAXPQSIZETXSEL_43_RT_OFFSET                              29687
+#define QM_REG_MAXPQSIZETXSEL_44_RT_OFFSET                              29688
+#define QM_REG_MAXPQSIZETXSEL_45_RT_OFFSET                              29689
+#define QM_REG_MAXPQSIZETXSEL_46_RT_OFFSET                              29690
+#define QM_REG_MAXPQSIZETXSEL_47_RT_OFFSET                              29691
+#define QM_REG_MAXPQSIZETXSEL_48_RT_OFFSET                              29692
+#define QM_REG_MAXPQSIZETXSEL_49_RT_OFFSET                              29693
+#define QM_REG_MAXPQSIZETXSEL_50_RT_OFFSET                              29694
+#define QM_REG_MAXPQSIZETXSEL_51_RT_OFFSET                              29695
+#define QM_REG_MAXPQSIZETXSEL_52_RT_OFFSET                              29696
+#define QM_REG_MAXPQSIZETXSEL_53_RT_OFFSET                              29697
+#define QM_REG_MAXPQSIZETXSEL_54_RT_OFFSET                              29698
+#define QM_REG_MAXPQSIZETXSEL_55_RT_OFFSET                              29699
+#define QM_REG_MAXPQSIZETXSEL_56_RT_OFFSET                              29700
+#define QM_REG_MAXPQSIZETXSEL_57_RT_OFFSET                              29701
+#define QM_REG_MAXPQSIZETXSEL_58_RT_OFFSET                              29702
+#define QM_REG_MAXPQSIZETXSEL_59_RT_OFFSET                              29703
+#define QM_REG_MAXPQSIZETXSEL_60_RT_OFFSET                              29704
+#define QM_REG_MAXPQSIZETXSEL_61_RT_OFFSET                              29705
+#define QM_REG_MAXPQSIZETXSEL_62_RT_OFFSET                              29706
+#define QM_REG_MAXPQSIZETXSEL_63_RT_OFFSET                              29707
+#define QM_REG_BASEADDROTHERPQ_RT_OFFSET                                29708
 #define QM_REG_BASEADDROTHERPQ_RT_SIZE                                  128
-#define QM_REG_VOQCRDLINE_RT_OFFSET                                     29834
+#define QM_REG_VOQCRDLINE_RT_OFFSET                                     29836
 #define QM_REG_VOQCRDLINE_RT_SIZE                                       20
-#define QM_REG_VOQINITCRDLINE_RT_OFFSET                                 29854
+#define QM_REG_VOQINITCRDLINE_RT_OFFSET                                 29856
 #define QM_REG_VOQINITCRDLINE_RT_SIZE                                   20
-#define QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET                             29874
-#define QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET                             29875
-#define QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET                              29876
-#define QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET                            29877
-#define QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET                           29878
-#define QM_REG_WRROTHERPQGRP_0_RT_OFFSET                                29879
-#define QM_REG_WRROTHERPQGRP_1_RT_OFFSET                                29880
-#define QM_REG_WRROTHERPQGRP_2_RT_OFFSET                                29881
-#define QM_REG_WRROTHERPQGRP_3_RT_OFFSET                                29882
-#define QM_REG_WRROTHERPQGRP_4_RT_OFFSET                                29883
-#define QM_REG_WRROTHERPQGRP_5_RT_OFFSET                                29884
-#define QM_REG_WRROTHERPQGRP_6_RT_OFFSET                                29885
-#define QM_REG_WRROTHERPQGRP_7_RT_OFFSET                                29886
-#define QM_REG_WRROTHERPQGRP_8_RT_OFFSET                                29887
-#define QM_REG_WRROTHERPQGRP_9_RT_OFFSET                                29888
-#define QM_REG_WRROTHERPQGRP_10_RT_OFFSET                               29889
-#define QM_REG_WRROTHERPQGRP_11_RT_OFFSET                               29890
-#define QM_REG_WRROTHERPQGRP_12_RT_OFFSET                               29891
-#define QM_REG_WRROTHERPQGRP_13_RT_OFFSET                               29892
-#define QM_REG_WRROTHERPQGRP_14_RT_OFFSET                               29893
-#define QM_REG_WRROTHERPQGRP_15_RT_OFFSET                               29894
-#define QM_REG_WRROTHERGRPWEIGHT_0_RT_OFFSET                            29895
-#define QM_REG_WRROTHERGRPWEIGHT_1_RT_OFFSET                            29896
-#define QM_REG_WRROTHERGRPWEIGHT_2_RT_OFFSET                            29897
-#define QM_REG_WRROTHERGRPWEIGHT_3_RT_OFFSET                            29898
-#define QM_REG_WRRTXGRPWEIGHT_0_RT_OFFSET                               29899
-#define QM_REG_WRRTXGRPWEIGHT_1_RT_OFFSET                               29900
-#define QM_REG_PQTX2PF_0_RT_OFFSET                                      29901
-#define QM_REG_PQTX2PF_1_RT_OFFSET                                      29902
-#define QM_REG_PQTX2PF_2_RT_OFFSET                                      29903
-#define QM_REG_PQTX2PF_3_RT_OFFSET                                      29904
-#define QM_REG_PQTX2PF_4_RT_OFFSET                                      29905
-#define QM_REG_PQTX2PF_5_RT_OFFSET                                      29906
-#define QM_REG_PQTX2PF_6_RT_OFFSET                                      29907
-#define QM_REG_PQTX2PF_7_RT_OFFSET                                      29908
-#define QM_REG_PQTX2PF_8_RT_OFFSET                                      29909
-#define QM_REG_PQTX2PF_9_RT_OFFSET                                      29910
-#define QM_REG_PQTX2PF_10_RT_OFFSET                                     29911
-#define QM_REG_PQTX2PF_11_RT_OFFSET                                     29912
-#define QM_REG_PQTX2PF_12_RT_OFFSET                                     29913
-#define QM_REG_PQTX2PF_13_RT_OFFSET                                     29914
-#define QM_REG_PQTX2PF_14_RT_OFFSET                                     29915
-#define QM_REG_PQTX2PF_15_RT_OFFSET                                     29916
-#define QM_REG_PQTX2PF_16_RT_OFFSET                                     29917
-#define QM_REG_PQTX2PF_17_RT_OFFSET                                     29918
-#define QM_REG_PQTX2PF_18_RT_OFFSET                                     29919
-#define QM_REG_PQTX2PF_19_RT_OFFSET                                     29920
-#define QM_REG_PQTX2PF_20_RT_OFFSET                                     29921
-#define QM_REG_PQTX2PF_21_RT_OFFSET                                     29922
-#define QM_REG_PQTX2PF_22_RT_OFFSET                                     29923
-#define QM_REG_PQTX2PF_23_RT_OFFSET                                     29924
-#define QM_REG_PQTX2PF_24_RT_OFFSET                                     29925
-#define QM_REG_PQTX2PF_25_RT_OFFSET                                     29926
-#define QM_REG_PQTX2PF_26_RT_OFFSET                                     29927
-#define QM_REG_PQTX2PF_27_RT_OFFSET                                     29928
-#define QM_REG_PQTX2PF_28_RT_OFFSET                                     29929
-#define QM_REG_PQTX2PF_29_RT_OFFSET                                     29930
-#define QM_REG_PQTX2PF_30_RT_OFFSET                                     29931
-#define QM_REG_PQTX2PF_31_RT_OFFSET                                     29932
-#define QM_REG_PQTX2PF_32_RT_OFFSET                                     29933
-#define QM_REG_PQTX2PF_33_RT_OFFSET                                     29934
-#define QM_REG_PQTX2PF_34_RT_OFFSET                                     29935
-#define QM_REG_PQTX2PF_35_RT_OFFSET                                     29936
-#define QM_REG_PQTX2PF_36_RT_OFFSET                                     29937
-#define QM_REG_PQTX2PF_37_RT_OFFSET                                     29938
-#define QM_REG_PQTX2PF_38_RT_OFFSET                                     29939
-#define QM_REG_PQTX2PF_39_RT_OFFSET                                     29940
-#define QM_REG_PQTX2PF_40_RT_OFFSET                                     29941
-#define QM_REG_PQTX2PF_41_RT_OFFSET                                     29942
-#define QM_REG_PQTX2PF_42_RT_OFFSET                                     29943
-#define QM_REG_PQTX2PF_43_RT_OFFSET                                     29944
-#define QM_REG_PQTX2PF_44_RT_OFFSET                                     29945
-#define QM_REG_PQTX2PF_45_RT_OFFSET                                     29946
-#define QM_REG_PQTX2PF_46_RT_OFFSET                                     29947
-#define QM_REG_PQTX2PF_47_RT_OFFSET                                     29948
-#define QM_REG_PQTX2PF_48_RT_OFFSET                                     29949
-#define QM_REG_PQTX2PF_49_RT_OFFSET                                     29950
-#define QM_REG_PQTX2PF_50_RT_OFFSET                                     29951
-#define QM_REG_PQTX2PF_51_RT_OFFSET                                     29952
-#define QM_REG_PQTX2PF_52_RT_OFFSET                                     29953
-#define QM_REG_PQTX2PF_53_RT_OFFSET                                     29954
-#define QM_REG_PQTX2PF_54_RT_OFFSET                                     29955
-#define QM_REG_PQTX2PF_55_RT_OFFSET                                     29956
-#define QM_REG_PQTX2PF_56_RT_OFFSET                                     29957
-#define QM_REG_PQTX2PF_57_RT_OFFSET                                     29958
-#define QM_REG_PQTX2PF_58_RT_OFFSET                                     29959
-#define QM_REG_PQTX2PF_59_RT_OFFSET                                     29960
-#define QM_REG_PQTX2PF_60_RT_OFFSET                                     29961
-#define QM_REG_PQTX2PF_61_RT_OFFSET                                     29962
-#define QM_REG_PQTX2PF_62_RT_OFFSET                                     29963
-#define QM_REG_PQTX2PF_63_RT_OFFSET                                     29964
-#define QM_REG_PQOTHER2PF_0_RT_OFFSET                                   29965
-#define QM_REG_PQOTHER2PF_1_RT_OFFSET                                   29966
-#define QM_REG_PQOTHER2PF_2_RT_OFFSET                                   29967
-#define QM_REG_PQOTHER2PF_3_RT_OFFSET                                   29968
-#define QM_REG_PQOTHER2PF_4_RT_OFFSET                                   29969
-#define QM_REG_PQOTHER2PF_5_RT_OFFSET                                   29970
-#define QM_REG_PQOTHER2PF_6_RT_OFFSET                                   29971
-#define QM_REG_PQOTHER2PF_7_RT_OFFSET                                   29972
-#define QM_REG_PQOTHER2PF_8_RT_OFFSET                                   29973
-#define QM_REG_PQOTHER2PF_9_RT_OFFSET                                   29974
-#define QM_REG_PQOTHER2PF_10_RT_OFFSET                                  29975
-#define QM_REG_PQOTHER2PF_11_RT_OFFSET                                  29976
-#define QM_REG_PQOTHER2PF_12_RT_OFFSET                                  29977
-#define QM_REG_PQOTHER2PF_13_RT_OFFSET                                  29978
-#define QM_REG_PQOTHER2PF_14_RT_OFFSET                                  29979
-#define QM_REG_PQOTHER2PF_15_RT_OFFSET                                  29980
-#define QM_REG_RLGLBLPERIOD_0_RT_OFFSET                                 29981
-#define QM_REG_RLGLBLPERIOD_1_RT_OFFSET                                 29982
-#define QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET                            29983
-#define QM_REG_RLGLBLPERIODTIMER_1_RT_OFFSET                            29984
-#define QM_REG_RLGLBLPERIODSEL_0_RT_OFFSET                              29985
-#define QM_REG_RLGLBLPERIODSEL_1_RT_OFFSET                              29986
-#define QM_REG_RLGLBLPERIODSEL_2_RT_OFFSET                              29987
-#define QM_REG_RLGLBLPERIODSEL_3_RT_OFFSET                              29988
-#define QM_REG_RLGLBLPERIODSEL_4_RT_OFFSET                              29989
-#define QM_REG_RLGLBLPERIODSEL_5_RT_OFFSET                              29990
-#define QM_REG_RLGLBLPERIODSEL_6_RT_OFFSET                              29991
-#define QM_REG_RLGLBLPERIODSEL_7_RT_OFFSET                              29992
-#define QM_REG_RLGLBLINCVAL_RT_OFFSET                                   29993
+#define QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET                             29876
+#define QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET                             29877
+#define QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET                              29878
+#define QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET                            29879
+#define QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET                           29880
+#define QM_REG_WRROTHERPQGRP_0_RT_OFFSET                                29881
+#define QM_REG_WRROTHERPQGRP_1_RT_OFFSET                                29882
+#define QM_REG_WRROTHERPQGRP_2_RT_OFFSET                                29883
+#define QM_REG_WRROTHERPQGRP_3_RT_OFFSET                                29884
+#define QM_REG_WRROTHERPQGRP_4_RT_OFFSET                                29885
+#define QM_REG_WRROTHERPQGRP_5_RT_OFFSET                                29886
+#define QM_REG_WRROTHERPQGRP_6_RT_OFFSET                                29887
+#define QM_REG_WRROTHERPQGRP_7_RT_OFFSET                                29888
+#define QM_REG_WRROTHERPQGRP_8_RT_OFFSET                                29889
+#define QM_REG_WRROTHERPQGRP_9_RT_OFFSET                                29890
+#define QM_REG_WRROTHERPQGRP_10_RT_OFFSET                               29891
+#define QM_REG_WRROTHERPQGRP_11_RT_OFFSET                               29892
+#define QM_REG_WRROTHERPQGRP_12_RT_OFFSET                               29893
+#define QM_REG_WRROTHERPQGRP_13_RT_OFFSET                               29894
+#define QM_REG_WRROTHERPQGRP_14_RT_OFFSET                               29895
+#define QM_REG_WRROTHERPQGRP_15_RT_OFFSET                               29896
+#define QM_REG_WRROTHERGRPWEIGHT_0_RT_OFFSET                            29897
+#define QM_REG_WRROTHERGRPWEIGHT_1_RT_OFFSET                            29898
+#define QM_REG_WRROTHERGRPWEIGHT_2_RT_OFFSET                            29899
+#define QM_REG_WRROTHERGRPWEIGHT_3_RT_OFFSET                            29900
+#define QM_REG_WRRTXGRPWEIGHT_0_RT_OFFSET                               29901
+#define QM_REG_WRRTXGRPWEIGHT_1_RT_OFFSET                               29902
+#define QM_REG_PQTX2PF_0_RT_OFFSET                                      29903
+#define QM_REG_PQTX2PF_1_RT_OFFSET                                      29904
+#define QM_REG_PQTX2PF_2_RT_OFFSET                                      29905
+#define QM_REG_PQTX2PF_3_RT_OFFSET                                      29906
+#define QM_REG_PQTX2PF_4_RT_OFFSET                                      29907
+#define QM_REG_PQTX2PF_5_RT_OFFSET                                      29908
+#define QM_REG_PQTX2PF_6_RT_OFFSET                                      29909
+#define QM_REG_PQTX2PF_7_RT_OFFSET                                      29910
+#define QM_REG_PQTX2PF_8_RT_OFFSET                                      29911
+#define QM_REG_PQTX2PF_9_RT_OFFSET                                      29912
+#define QM_REG_PQTX2PF_10_RT_OFFSET                                     29913
+#define QM_REG_PQTX2PF_11_RT_OFFSET                                     29914
+#define QM_REG_PQTX2PF_12_RT_OFFSET                                     29915
+#define QM_REG_PQTX2PF_13_RT_OFFSET                                     29916
+#define QM_REG_PQTX2PF_14_RT_OFFSET                                     29917
+#define QM_REG_PQTX2PF_15_RT_OFFSET                                     29918
+#define QM_REG_PQTX2PF_16_RT_OFFSET                                     29919
+#define QM_REG_PQTX2PF_17_RT_OFFSET                                     29920
+#define QM_REG_PQTX2PF_18_RT_OFFSET                                     29921
+#define QM_REG_PQTX2PF_19_RT_OFFSET                                     29922
+#define QM_REG_PQTX2PF_20_RT_OFFSET                                     29923
+#define QM_REG_PQTX2PF_21_RT_OFFSET                                     29924
+#define QM_REG_PQTX2PF_22_RT_OFFSET                                     29925
+#define QM_REG_PQTX2PF_23_RT_OFFSET                                     29926
+#define QM_REG_PQTX2PF_24_RT_OFFSET                                     29927
+#define QM_REG_PQTX2PF_25_RT_OFFSET                                     29928
+#define QM_REG_PQTX2PF_26_RT_OFFSET                                     29929
+#define QM_REG_PQTX2PF_27_RT_OFFSET                                     29930
+#define QM_REG_PQTX2PF_28_RT_OFFSET                                     29931
+#define QM_REG_PQTX2PF_29_RT_OFFSET                                     29932
+#define QM_REG_PQTX2PF_30_RT_OFFSET                                     29933
+#define QM_REG_PQTX2PF_31_RT_OFFSET                                     29934
+#define QM_REG_PQTX2PF_32_RT_OFFSET                                     29935
+#define QM_REG_PQTX2PF_33_RT_OFFSET                                     29936
+#define QM_REG_PQTX2PF_34_RT_OFFSET                                     29937
+#define QM_REG_PQTX2PF_35_RT_OFFSET                                     29938
+#define QM_REG_PQTX2PF_36_RT_OFFSET                                     29939
+#define QM_REG_PQTX2PF_37_RT_OFFSET                                     29940
+#define QM_REG_PQTX2PF_38_RT_OFFSET                                     29941
+#define QM_REG_PQTX2PF_39_RT_OFFSET                                     29942
+#define QM_REG_PQTX2PF_40_RT_OFFSET                                     29943
+#define QM_REG_PQTX2PF_41_RT_OFFSET                                     29944
+#define QM_REG_PQTX2PF_42_RT_OFFSET                                     29945
+#define QM_REG_PQTX2PF_43_RT_OFFSET                                     29946
+#define QM_REG_PQTX2PF_44_RT_OFFSET                                     29947
+#define QM_REG_PQTX2PF_45_RT_OFFSET                                     29948
+#define QM_REG_PQTX2PF_46_RT_OFFSET                                     29949
+#define QM_REG_PQTX2PF_47_RT_OFFSET                                     29950
+#define QM_REG_PQTX2PF_48_RT_OFFSET                                     29951
+#define QM_REG_PQTX2PF_49_RT_OFFSET                                     29952
+#define QM_REG_PQTX2PF_50_RT_OFFSET                                     29953
+#define QM_REG_PQTX2PF_51_RT_OFFSET                                     29954
+#define QM_REG_PQTX2PF_52_RT_OFFSET                                     29955
+#define QM_REG_PQTX2PF_53_RT_OFFSET                                     29956
+#define QM_REG_PQTX2PF_54_RT_OFFSET                                     29957
+#define QM_REG_PQTX2PF_55_RT_OFFSET                                     29958
+#define QM_REG_PQTX2PF_56_RT_OFFSET                                     29959
+#define QM_REG_PQTX2PF_57_RT_OFFSET                                     29960
+#define QM_REG_PQTX2PF_58_RT_OFFSET                                     29961
+#define QM_REG_PQTX2PF_59_RT_OFFSET                                     29962
+#define QM_REG_PQTX2PF_60_RT_OFFSET                                     29963
+#define QM_REG_PQTX2PF_61_RT_OFFSET                                     29964
+#define QM_REG_PQTX2PF_62_RT_OFFSET                                     29965
+#define QM_REG_PQTX2PF_63_RT_OFFSET                                     29966
+#define QM_REG_PQOTHER2PF_0_RT_OFFSET                                   29967
+#define QM_REG_PQOTHER2PF_1_RT_OFFSET                                   29968
+#define QM_REG_PQOTHER2PF_2_RT_OFFSET                                   29969
+#define QM_REG_PQOTHER2PF_3_RT_OFFSET                                   29970
+#define QM_REG_PQOTHER2PF_4_RT_OFFSET                                   29971
+#define QM_REG_PQOTHER2PF_5_RT_OFFSET                                   29972
+#define QM_REG_PQOTHER2PF_6_RT_OFFSET                                   29973
+#define QM_REG_PQOTHER2PF_7_RT_OFFSET                                   29974
+#define QM_REG_PQOTHER2PF_8_RT_OFFSET                                   29975
+#define QM_REG_PQOTHER2PF_9_RT_OFFSET                                   29976
+#define QM_REG_PQOTHER2PF_10_RT_OFFSET                                  29977
+#define QM_REG_PQOTHER2PF_11_RT_OFFSET                                  29978
+#define QM_REG_PQOTHER2PF_12_RT_OFFSET                                  29979
+#define QM_REG_PQOTHER2PF_13_RT_OFFSET                                  29980
+#define QM_REG_PQOTHER2PF_14_RT_OFFSET                                  29981
+#define QM_REG_PQOTHER2PF_15_RT_OFFSET                                  29982
+#define QM_REG_RLGLBLPERIOD_0_RT_OFFSET                                 29983
+#define QM_REG_RLGLBLPERIOD_1_RT_OFFSET                                 29984
+#define QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET                            29985
+#define QM_REG_RLGLBLPERIODTIMER_1_RT_OFFSET                            29986
+#define QM_REG_RLGLBLPERIODSEL_0_RT_OFFSET                              29987
+#define QM_REG_RLGLBLPERIODSEL_1_RT_OFFSET                              29988
+#define QM_REG_RLGLBLPERIODSEL_2_RT_OFFSET                              29989
+#define QM_REG_RLGLBLPERIODSEL_3_RT_OFFSET                              29990
+#define QM_REG_RLGLBLPERIODSEL_4_RT_OFFSET                              29991
+#define QM_REG_RLGLBLPERIODSEL_5_RT_OFFSET                              29992
+#define QM_REG_RLGLBLPERIODSEL_6_RT_OFFSET                              29993
+#define QM_REG_RLGLBLPERIODSEL_7_RT_OFFSET                              29994
+#define QM_REG_RLGLBLINCVAL_RT_OFFSET                                   29995
 #define QM_REG_RLGLBLINCVAL_RT_SIZE                                     256
-#define QM_REG_RLGLBLUPPERBOUND_RT_OFFSET                               30249
+#define QM_REG_RLGLBLUPPERBOUND_RT_OFFSET                               30251
 #define QM_REG_RLGLBLUPPERBOUND_RT_SIZE                                 256
-#define QM_REG_RLGLBLCRD_RT_OFFSET                                      30505
+#define QM_REG_RLGLBLCRD_RT_OFFSET                                      30507
 #define QM_REG_RLGLBLCRD_RT_SIZE                                        256
-#define QM_REG_RLGLBLENABLE_RT_OFFSET                                   30761
-#define QM_REG_RLPFPERIOD_RT_OFFSET                                     30762
-#define QM_REG_RLPFPERIODTIMER_RT_OFFSET                                30763
-#define QM_REG_RLPFINCVAL_RT_OFFSET                                     30764
+#define QM_REG_RLGLBLENABLE_RT_OFFSET                                   30763
+#define QM_REG_RLPFPERIOD_RT_OFFSET                                     30764
+#define QM_REG_RLPFPERIODTIMER_RT_OFFSET                                30765
+#define QM_REG_RLPFINCVAL_RT_OFFSET                                     30766
 #define QM_REG_RLPFINCVAL_RT_SIZE                                       16
-#define QM_REG_RLPFUPPERBOUND_RT_OFFSET                                 30780
+#define QM_REG_RLPFUPPERBOUND_RT_OFFSET                                 30782
 #define QM_REG_RLPFUPPERBOUND_RT_SIZE                                   16
-#define QM_REG_RLPFCRD_RT_OFFSET                                        30796
+#define QM_REG_RLPFCRD_RT_OFFSET                                        30798
 #define QM_REG_RLPFCRD_RT_SIZE                                          16
-#define QM_REG_RLPFENABLE_RT_OFFSET                                     30812
-#define QM_REG_RLPFVOQENABLE_RT_OFFSET                                  30813
-#define QM_REG_WFQPFWEIGHT_RT_OFFSET                                    30814
+#define QM_REG_RLPFENABLE_RT_OFFSET                                     30814
+#define QM_REG_RLPFVOQENABLE_RT_OFFSET                                  30815
+#define QM_REG_WFQPFWEIGHT_RT_OFFSET                                    30816
 #define QM_REG_WFQPFWEIGHT_RT_SIZE                                      16
-#define QM_REG_WFQPFUPPERBOUND_RT_OFFSET                                30830
+#define QM_REG_WFQPFUPPERBOUND_RT_OFFSET                                30832
 #define QM_REG_WFQPFUPPERBOUND_RT_SIZE                                  16
-#define QM_REG_WFQPFCRD_RT_OFFSET                                       30846
+#define QM_REG_WFQPFCRD_RT_OFFSET                                       30848
 #define QM_REG_WFQPFCRD_RT_SIZE                                         160
-#define QM_REG_WFQPFENABLE_RT_OFFSET                                    31006
-#define QM_REG_WFQVPENABLE_RT_OFFSET                                    31007
-#define QM_REG_BASEADDRTXPQ_RT_OFFSET                                   31008
+#define QM_REG_WFQPFENABLE_RT_OFFSET                                    31008
+#define QM_REG_WFQVPENABLE_RT_OFFSET                                    31009
+#define QM_REG_BASEADDRTXPQ_RT_OFFSET                                   31010
 #define QM_REG_BASEADDRTXPQ_RT_SIZE                                     512
-#define QM_REG_TXPQMAP_RT_OFFSET                                        31520
+#define QM_REG_TXPQMAP_RT_OFFSET                                        31522
 #define QM_REG_TXPQMAP_RT_SIZE                                          512
-#define QM_REG_WFQVPWEIGHT_RT_OFFSET                                    32032
+#define QM_REG_WFQVPWEIGHT_RT_OFFSET                                    32034
 #define QM_REG_WFQVPWEIGHT_RT_SIZE                                      512
-#define QM_REG_WFQVPUPPERBOUND_RT_OFFSET                                32544
-#define QM_REG_WFQVPUPPERBOUND_RT_SIZE                                  512
-#define QM_REG_WFQVPCRD_RT_OFFSET                                       33056
+#define QM_REG_WFQVPCRD_RT_OFFSET                                       32546
 #define QM_REG_WFQVPCRD_RT_SIZE                                         512
-#define QM_REG_WFQVPMAP_RT_OFFSET                                       33568
+#define QM_REG_WFQVPMAP_RT_OFFSET                                       33058
 #define QM_REG_WFQVPMAP_RT_SIZE                                         512
-#define QM_REG_WFQPFCRD_MSB_RT_OFFSET                                   34080
+#define QM_REG_WFQPFCRD_MSB_RT_OFFSET                                   33570
 #define QM_REG_WFQPFCRD_MSB_RT_SIZE                                     160
-#define NIG_REG_LLH_CLS_TYPE_DUALMODE_RT_OFFSET                         34240
-#define NIG_REG_OUTER_TAG_VALUE_LIST0_RT_OFFSET                         34241
-#define NIG_REG_OUTER_TAG_VALUE_LIST1_RT_OFFSET                         34242
-#define NIG_REG_OUTER_TAG_VALUE_LIST2_RT_OFFSET                         34243
-#define NIG_REG_OUTER_TAG_VALUE_LIST3_RT_OFFSET                         34244
-#define NIG_REG_OUTER_TAG_VALUE_MASK_RT_OFFSET                          34245
-#define NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET                      34246
-#define NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET                               34247
+#define NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET                               33730
+#define NIG_REG_OUTER_TAG_VALUE_LIST0_RT_OFFSET                         33731
+#define NIG_REG_OUTER_TAG_VALUE_LIST1_RT_OFFSET                         33732
+#define NIG_REG_OUTER_TAG_VALUE_LIST2_RT_OFFSET                         33733
+#define NIG_REG_OUTER_TAG_VALUE_LIST3_RT_OFFSET                         33734
+#define NIG_REG_OUTER_TAG_VALUE_MASK_RT_OFFSET                          33735
+#define NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET                      33736
+#define NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET                               33737
 #define NIG_REG_LLH_FUNC_TAG_EN_RT_SIZE                                 4
-#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_OFFSET                          34251
+#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_OFFSET                          33741
 #define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_SIZE                            4
-#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET                            34255
+#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET                            33745
 #define NIG_REG_LLH_FUNC_TAG_VALUE_RT_SIZE                              4
-#define NIG_REG_LLH_FUNC_NO_TAG_RT_OFFSET                               34259
-#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_OFFSET                         34260
+#define NIG_REG_LLH_FUNC_NO_TAG_RT_OFFSET                               33749
+#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_OFFSET                         33750
 #define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_SIZE                           32
-#define NIG_REG_LLH_FUNC_FILTER_EN_RT_OFFSET                            34292
+#define NIG_REG_LLH_FUNC_FILTER_EN_RT_OFFSET                            33782
 #define NIG_REG_LLH_FUNC_FILTER_EN_RT_SIZE                              16
-#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_OFFSET                          34308
+#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_OFFSET                          33798
 #define NIG_REG_LLH_FUNC_FILTER_MODE_RT_SIZE                            16
-#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET                 34324
+#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET                 33814
 #define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE                   16
-#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET                       34340
+#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET                       33830
 #define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_SIZE                         16
-#define NIG_REG_TX_EDPM_CTRL_RT_OFFSET                                  34356
-#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET                               34357
-#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET                               34358
-#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET                               34359
-#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET                           34360
-#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET                           34361
-#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET                           34362
-#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET                           34363
-#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET                        34364
-#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET                        34365
-#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET                        34366
-#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET                        34367
-#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET                            34368
-#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET                         34369
-#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET                          34370
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET                        34371
-#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET                           34372
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET                    34373
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET                        34374
-#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET                           34375
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET                    34376
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET                        34377
-#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET                           34378
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET                    34379
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET                        34380
-#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET                           34381
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET                    34382
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET                        34383
-#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET                           34384
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET                    34385
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET                        34386
-#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET                           34387
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET                    34388
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET                        34389
-#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET                           34390
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET                    34391
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET                        34392
-#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET                           34393
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET                    34394
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET                        34395
-#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET                           34396
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET                    34397
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET                        34398
-#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET                           34399
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET                    34400
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET                       34401
-#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET                          34402
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET                   34403
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET                       34404
-#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET                          34405
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET                   34406
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET                       34407
-#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET                          34408
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET                   34409
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET                       34410
-#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET                          34411
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET                   34412
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET                       34413
-#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET                          34414
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET                   34415
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET                       34416
-#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET                          34417
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET                   34418
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET                       34419
-#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET                          34420
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET                   34421
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET                       34422
-#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET                          34423
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET                   34424
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET                       34425
-#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET                          34426
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET                   34427
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET                       34428
-#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET                          34429
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET                   34430
-#define XCM_REG_CON_PHY_Q3_RT_OFFSET                                    34431
-
-#define RUNTIME_ARRAY_SIZE 34432
+#define NIG_REG_TX_EDPM_CTRL_RT_OFFSET                                  33846
+#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET                               33847
+#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET                               33848
+#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET                               33849
+#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET                           33850
+#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET                           33851
+#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET                           33852
+#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET                           33853
+#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET                        33854
+#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET                        33855
+#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET                        33856
+#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET                        33857
+#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET                            33858
+#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET                         33859
+#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET                               33860
+#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET                          33861
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET                        33862
+#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET                           33863
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET                    33864
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET                        33865
+#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET                           33866
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET                    33867
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET                        33868
+#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET                           33869
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET                    33870
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET                        33871
+#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET                           33872
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET                    33873
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET                        33874
+#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET                           33875
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET                    33876
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET                        33877
+#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET                           33878
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET                    33879
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET                        33880
+#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET                           33881
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET                    33882
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET                        33883
+#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET                           33884
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET                    33885
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET                        33886
+#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET                           33887
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET                    33888
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET                        33889
+#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET                           33890
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET                    33891
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET                       33892
+#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET                          33893
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET                   33894
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET                       33895
+#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET                          33896
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET                   33897
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET                       33898
+#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET                          33899
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET                   33900
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET                       33901
+#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET                          33902
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET                   33903
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET                       33904
+#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET                          33905
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET                   33906
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET                       33907
+#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET                          33908
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET                   33909
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET                       33910
+#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET                          33911
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET                   33912
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET                       33913
+#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET                          33914
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET                   33915
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET                       33916
+#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET                          33917
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET                   33918
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET                       33919
+#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET                          33920
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET                   33921
+#define XCM_REG_CON_PHY_Q3_RT_OFFSET                                    33922
+
+#define RUNTIME_ARRAY_SIZE 33923
 
-/* The eth storm context for the Ystorm */
-struct ystorm_eth_conn_st_ctx {
+/* The eth storm context for the Tstorm */
+struct tstorm_eth_conn_st_ctx {
 	__le32 reserved[4];
 };
 
@@ -2535,41 +2501,253 @@ struct xstorm_eth_conn_ag_ctx {
 	__le32	reg7 /* reg7 */;
 	__le32	reg8 /* reg8 */;
 	__le32	reg9 /* reg9 */;
-	u8	byte7 /* byte7 */;
-	u8	byte8 /* byte8 */;
-	u8	byte9 /* byte9 */;
-	u8	byte10 /* byte10 */;
-	u8	byte11 /* byte11 */;
-	u8	byte12 /* byte12 */;
-	u8	byte13 /* byte13 */;
-	u8	byte14 /* byte14 */;
-	u8	byte15 /* byte15 */;
-	u8	byte16 /* byte16 */;
-	__le16	word11 /* word11 */;
+	u8	byte7 /* byte7 */;
+	u8	byte8 /* byte8 */;
+	u8	byte9 /* byte9 */;
+	u8	byte10 /* byte10 */;
+	u8	byte11 /* byte11 */;
+	u8	byte12 /* byte12 */;
+	u8	byte13 /* byte13 */;
+	u8	byte14 /* byte14 */;
+	u8	byte15 /* byte15 */;
+	u8	byte16 /* byte16 */;
+	__le16	word11 /* word11 */;
+	__le32	reg10 /* reg10 */;
+	__le32	reg11 /* reg11 */;
+	__le32	reg12 /* reg12 */;
+	__le32	reg13 /* reg13 */;
+	__le32	reg14 /* reg14 */;
+	__le32	reg15 /* reg15 */;
+	__le32	reg16 /* reg16 */;
+	__le32	reg17 /* reg17 */;
+	__le32	reg18 /* reg18 */;
+	__le32	reg19 /* reg19 */;
+	__le16	word12 /* word12 */;
+	__le16	word13 /* word13 */;
+	__le16	word14 /* word14 */;
+	__le16	word15 /* word15 */;
+};
+
+/* The eth storm context for the Ystorm */
+struct ystorm_eth_conn_st_ctx {
+	__le32 reserved[8];
+};
+
+struct ystorm_eth_conn_ag_ctx {
+	u8	byte0 /* cdu_validation */;
+	u8	byte1 /* state */;
+	u8	flags0;
+#define YSTORM_ETH_CONN_AG_CTX_BIT0_MASK                  0x1 /* exist_in_qm0 */
+#define YSTORM_ETH_CONN_AG_CTX_BIT0_SHIFT                 0
+#define YSTORM_ETH_CONN_AG_CTX_BIT1_MASK                  0x1 /* exist_in_qm1 */
+#define YSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT                 1
+#define YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_MASK     0x3   /* cf0 */
+#define YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_SHIFT    2
+#define YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_MASK      0x3   /* cf1 */
+#define YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_SHIFT     4
+#define YSTORM_ETH_CONN_AG_CTX_CF2_MASK                   0x3   /* cf2 */
+#define YSTORM_ETH_CONN_AG_CTX_CF2_SHIFT                  6
+	u8 flags1;
+#define YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_MASK  0x1   /* cf0en */
+#define YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_SHIFT 0
+#define YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_EN_MASK   0x1   /* cf1en */
+#define YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_EN_SHIFT  1
+#define YSTORM_ETH_CONN_AG_CTX_CF2EN_MASK                 0x1   /* cf2en */
+#define YSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT                2
+#define YSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK               0x1   /* rule0en */
+#define YSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT              3
+#define YSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK               0x1   /* rule1en */
+#define YSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT              4
+#define YSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK               0x1   /* rule2en */
+#define YSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT              5
+#define YSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK               0x1   /* rule3en */
+#define YSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT              6
+#define YSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK               0x1   /* rule4en */
+#define YSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT              7
+	u8	byte2 /* byte2 */;
+	u8	byte3 /* byte3 */;
+	__le16	word0 /* word0 */;
+	__le32	terminate_spqe /* reg0 */;
+	__le32	reg1 /* reg1 */;
+	__le16	tx_bd_cons_upd /* word1 */;
+	__le16	word2 /* word2 */;
+	__le16	word3 /* word3 */;
+	__le16	word4 /* word4 */;
+	__le32	reg2 /* reg2 */;
+	__le32	reg3 /* reg3 */;
+};
+
+struct tstorm_eth_conn_ag_ctx {
+	u8	byte0 /* cdu_validation */;
+	u8	byte1 /* state */;
+	u8	flags0;
+#define TSTORM_ETH_CONN_AG_CTX_BIT0_MASK      0x1       /* exist_in_qm0 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT0_SHIFT     0
+#define TSTORM_ETH_CONN_AG_CTX_BIT1_MASK      0x1       /* exist_in_qm1 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT     1
+#define TSTORM_ETH_CONN_AG_CTX_BIT2_MASK      0x1       /* bit2 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT2_SHIFT     2
+#define TSTORM_ETH_CONN_AG_CTX_BIT3_MASK      0x1       /* bit3 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT3_SHIFT     3
+#define TSTORM_ETH_CONN_AG_CTX_BIT4_MASK      0x1       /* bit4 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT4_SHIFT     4
+#define TSTORM_ETH_CONN_AG_CTX_BIT5_MASK      0x1       /* bit5 */
+#define TSTORM_ETH_CONN_AG_CTX_BIT5_SHIFT     5
+#define TSTORM_ETH_CONN_AG_CTX_CF0_MASK       0x3       /* timer0cf */
+#define TSTORM_ETH_CONN_AG_CTX_CF0_SHIFT      6
+	u8 flags1;
+#define TSTORM_ETH_CONN_AG_CTX_CF1_MASK       0x3       /* timer1cf */
+#define TSTORM_ETH_CONN_AG_CTX_CF1_SHIFT      0
+#define TSTORM_ETH_CONN_AG_CTX_CF2_MASK       0x3       /* timer2cf */
+#define TSTORM_ETH_CONN_AG_CTX_CF2_SHIFT      2
+#define TSTORM_ETH_CONN_AG_CTX_CF3_MASK       0x3       /* timer_stop_all */
+#define TSTORM_ETH_CONN_AG_CTX_CF3_SHIFT      4
+#define TSTORM_ETH_CONN_AG_CTX_CF4_MASK       0x3       /* cf4 */
+#define TSTORM_ETH_CONN_AG_CTX_CF4_SHIFT      6
+	u8 flags2;
+#define TSTORM_ETH_CONN_AG_CTX_CF5_MASK       0x3       /* cf5 */
+#define TSTORM_ETH_CONN_AG_CTX_CF5_SHIFT      0
+#define TSTORM_ETH_CONN_AG_CTX_CF6_MASK       0x3       /* cf6 */
+#define TSTORM_ETH_CONN_AG_CTX_CF6_SHIFT      2
+#define TSTORM_ETH_CONN_AG_CTX_CF7_MASK       0x3       /* cf7 */
+#define TSTORM_ETH_CONN_AG_CTX_CF7_SHIFT      4
+#define TSTORM_ETH_CONN_AG_CTX_CF8_MASK       0x3       /* cf8 */
+#define TSTORM_ETH_CONN_AG_CTX_CF8_SHIFT      6
+	u8 flags3;
+#define TSTORM_ETH_CONN_AG_CTX_CF9_MASK       0x3       /* cf9 */
+#define TSTORM_ETH_CONN_AG_CTX_CF9_SHIFT      0
+#define TSTORM_ETH_CONN_AG_CTX_CF10_MASK      0x3       /* cf10 */
+#define TSTORM_ETH_CONN_AG_CTX_CF10_SHIFT     2
+#define TSTORM_ETH_CONN_AG_CTX_CF0EN_MASK     0x1       /* cf0en */
+#define TSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT    4
+#define TSTORM_ETH_CONN_AG_CTX_CF1EN_MASK     0x1       /* cf1en */
+#define TSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT    5
+#define TSTORM_ETH_CONN_AG_CTX_CF2EN_MASK     0x1       /* cf2en */
+#define TSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT    6
+#define TSTORM_ETH_CONN_AG_CTX_CF3EN_MASK     0x1       /* cf3en */
+#define TSTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT    7
+	u8 flags4;
+#define TSTORM_ETH_CONN_AG_CTX_CF4EN_MASK     0x1       /* cf4en */
+#define TSTORM_ETH_CONN_AG_CTX_CF4EN_SHIFT    0
+#define TSTORM_ETH_CONN_AG_CTX_CF5EN_MASK     0x1       /* cf5en */
+#define TSTORM_ETH_CONN_AG_CTX_CF5EN_SHIFT    1
+#define TSTORM_ETH_CONN_AG_CTX_CF6EN_MASK     0x1       /* cf6en */
+#define TSTORM_ETH_CONN_AG_CTX_CF6EN_SHIFT    2
+#define TSTORM_ETH_CONN_AG_CTX_CF7EN_MASK     0x1       /* cf7en */
+#define TSTORM_ETH_CONN_AG_CTX_CF7EN_SHIFT    3
+#define TSTORM_ETH_CONN_AG_CTX_CF8EN_MASK     0x1       /* cf8en */
+#define TSTORM_ETH_CONN_AG_CTX_CF8EN_SHIFT    4
+#define TSTORM_ETH_CONN_AG_CTX_CF9EN_MASK     0x1       /* cf9en */
+#define TSTORM_ETH_CONN_AG_CTX_CF9EN_SHIFT    5
+#define TSTORM_ETH_CONN_AG_CTX_CF10EN_MASK    0x1       /* cf10en */
+#define TSTORM_ETH_CONN_AG_CTX_CF10EN_SHIFT   6
+#define TSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK   0x1       /* rule0en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT  7
+	u8 flags5;
+#define TSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK   0x1       /* rule1en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT  0
+#define TSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK   0x1       /* rule2en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT  1
+#define TSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK   0x1       /* rule3en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT  2
+#define TSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK   0x1       /* rule4en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT  3
+#define TSTORM_ETH_CONN_AG_CTX_RULE5EN_MASK   0x1       /* rule5en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT  4
+#define TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_MASK  0x1       /* rule6en */
+#define TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_SHIFT 5
+#define TSTORM_ETH_CONN_AG_CTX_RULE7EN_MASK   0x1       /* rule7en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT  6
+#define TSTORM_ETH_CONN_AG_CTX_RULE8EN_MASK   0x1       /* rule8en */
+#define TSTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT  7
+	__le32	reg0 /* reg0 */;
+	__le32	reg1 /* reg1 */;
+	__le32	reg2 /* reg2 */;
+	__le32	reg3 /* reg3 */;
+	__le32	reg4 /* reg4 */;
+	__le32	reg5 /* reg5 */;
+	__le32	reg6 /* reg6 */;
+	__le32	reg7 /* reg7 */;
+	__le32	reg8 /* reg8 */;
+	u8	byte2 /* byte2 */;
+	u8	byte3 /* byte3 */;
+	__le16	rx_bd_cons /* word0 */;
+	u8	byte4 /* byte4 */;
+	u8	byte5 /* byte5 */;
+	__le16	rx_bd_prod /* word1 */;
+	__le16	word2 /* conn_dpi */;
+	__le16	word3 /* word3 */;
+	__le32	reg9 /* reg9 */;
 	__le32	reg10 /* reg10 */;
-	__le32	reg11 /* reg11 */;
-	__le32	reg12 /* reg12 */;
-	__le32	reg13 /* reg13 */;
-	__le32	reg14 /* reg14 */;
-	__le32	reg15 /* reg15 */;
-	__le32	reg16 /* reg16 */;
-	__le32	reg17 /* reg17 */;
-	__le32	reg18 /* reg18 */;
-	__le32	reg19 /* reg19 */;
-	__le16	word12 /* word12 */;
-	__le16	word13 /* word13 */;
-	__le16	word14 /* word14 */;
-	__le16	word15 /* word15 */;
-};
-
-/* The eth storm context for the Tstorm */
-struct tstorm_eth_conn_st_ctx {
-	__le32 reserved[4];
 };
 
-/* The eth storm context for the Mstorm */
-struct mstorm_eth_conn_st_ctx {
-	__le32 reserved[8];
+struct ustorm_eth_conn_ag_ctx {
+	u8	byte0 /* cdu_validation */;
+	u8	byte1 /* state */;
+	u8	flags0;
+#define USTORM_ETH_CONN_AG_CTX_BIT0_MASK                  0x1 /* exist_in_qm0 */
+#define USTORM_ETH_CONN_AG_CTX_BIT0_SHIFT                 0
+#define USTORM_ETH_CONN_AG_CTX_BIT1_MASK                  0x1 /* exist_in_qm1 */
+#define USTORM_ETH_CONN_AG_CTX_BIT1_SHIFT                 1
+#define USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_MASK   0x3 /* timer0cf */
+#define USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_SHIFT  2
+#define USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_MASK   0x3 /* timer1cf */
+#define USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_SHIFT  4
+#define USTORM_ETH_CONN_AG_CTX_CF2_MASK                   0x3 /* timer2cf */
+#define USTORM_ETH_CONN_AG_CTX_CF2_SHIFT                  6
+	u8 flags1;
+#define USTORM_ETH_CONN_AG_CTX_CF3_MASK                 0x3 /* timer_stop_all */
+#define USTORM_ETH_CONN_AG_CTX_CF3_SHIFT                0
+#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_MASK           0x3 /* cf4 */
+#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_SHIFT          2
+#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_MASK           0x3 /* cf5 */
+#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_SHIFT          4
+#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_MASK   0x3 /* cf6 */
+#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_SHIFT  6
+	u8 flags2;
+#define USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_EN_MASK  0x1 /* cf0en */
+#define USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_EN_SHIFT 0
+#define USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_EN_MASK  0x1 /* cf1en */
+#define USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_EN_SHIFT 1
+#define USTORM_ETH_CONN_AG_CTX_CF2EN_MASK                   0x1 /* cf2en */
+#define USTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT                  2
+#define USTORM_ETH_CONN_AG_CTX_CF3EN_MASK                   0x1 /* cf3en */
+#define USTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT                  3
+#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_MASK            0x1 /* cf4en */
+#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_SHIFT           4
+#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_MASK            0x1 /* cf5en */
+#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_SHIFT           5
+#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_MASK    0x1 /* cf6en */
+#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_SHIFT   6
+#define USTORM_ETH_CONN_AG_CTX_RULE0EN_MASK                 0x1 /* rule0en */
+#define USTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT                7
+	u8 flags3;
+#define USTORM_ETH_CONN_AG_CTX_RULE1EN_MASK                 0x1 /* rule1en */
+#define USTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT                0
+#define USTORM_ETH_CONN_AG_CTX_RULE2EN_MASK                 0x1 /* rule2en */
+#define USTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT                1
+#define USTORM_ETH_CONN_AG_CTX_RULE3EN_MASK                 0x1 /* rule3en */
+#define USTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT                2
+#define USTORM_ETH_CONN_AG_CTX_RULE4EN_MASK                 0x1 /* rule4en */
+#define USTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT                3
+#define USTORM_ETH_CONN_AG_CTX_RULE5EN_MASK                 0x1 /* rule5en */
+#define USTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT                4
+#define USTORM_ETH_CONN_AG_CTX_RULE6EN_MASK                 0x1 /* rule6en */
+#define USTORM_ETH_CONN_AG_CTX_RULE6EN_SHIFT                5
+#define USTORM_ETH_CONN_AG_CTX_RULE7EN_MASK                 0x1 /* rule7en */
+#define USTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT                6
+#define USTORM_ETH_CONN_AG_CTX_RULE8EN_MASK                 0x1 /* rule8en */
+#define USTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT                7
+	u8	byte2 /* byte2 */;
+	u8	byte3 /* byte3 */;
+	__le16	word0 /* conn_dpi */;
+	__le16	tx_bd_cons /* word1 */;
+	__le32	reg0 /* reg0 */;
+	__le32	reg1 /* reg1 */;
+	__le32	reg2 /* reg2 */;
+	__le32	tx_int_coallecing_timeset /* reg3 */;
+	__le16	tx_drv_bd_cons /* word2 */;
+	__le16	rx_drv_cqe_cons /* word3 */;
 };
 
 /* The eth storm context for the Ustorm */
@@ -2577,24 +2755,30 @@ struct ustorm_eth_conn_st_ctx {
 	__le32 reserved[40];
 };
 
+/* The eth storm context for the Mstorm */
+struct mstorm_eth_conn_st_ctx {
+	__le32 reserved[8];
+};
+
 /* eth connection context */
 struct eth_conn_context {
-	struct ystorm_eth_conn_st_ctx	ystorm_st_context;
-	struct regpair			ystorm_st_padding[2] /* padding */;
+	struct tstorm_eth_conn_st_ctx	tstorm_st_context;
+	struct regpair			tstorm_st_padding[2];
 	struct pstorm_eth_conn_st_ctx	pstorm_st_context;
-	struct regpair			pstorm_st_padding[2] /* padding */;
 	struct xstorm_eth_conn_st_ctx	xstorm_st_context;
 	struct xstorm_eth_conn_ag_ctx	xstorm_ag_context;
-	struct tstorm_eth_conn_st_ctx	tstorm_st_context;
-	struct regpair			tstorm_st_padding[2] /* padding */;
-	struct mstorm_eth_conn_st_ctx	mstorm_st_context;
+	struct ystorm_eth_conn_st_ctx	ystorm_st_context;
+	struct ystorm_eth_conn_ag_ctx	ystorm_ag_context;
+	struct tstorm_eth_conn_ag_ctx	tstorm_ag_context;
+	struct ustorm_eth_conn_ag_ctx	ustorm_ag_context;
 	struct ustorm_eth_conn_st_ctx	ustorm_st_context;
+	struct mstorm_eth_conn_st_ctx	mstorm_st_context;
 };
 
 enum eth_filter_action {
 	ETH_FILTER_ACTION_REMOVE,
 	ETH_FILTER_ACTION_ADD,
-	ETH_FILTER_ACTION_REPLACE,
+	ETH_FILTER_ACTION_REMOVE_ALL,
 	MAX_ETH_FILTER_ACTION
 };
 
@@ -2653,6 +2837,32 @@ enum eth_ramrod_cmd_id {
 	MAX_ETH_RAMROD_CMD_ID
 };
 
+enum eth_tx_err {
+	ETH_TX_ERR_DROP /* Drop erroneous packet. */,
+	ETH_TX_ERR_ASSERT_MALICIOUS,
+	MAX_ETH_TX_ERR
+};
+
+struct eth_tx_err_vals {
+	__le16 values;
+#define ETH_TX_ERR_VALS_ILLEGAL_VLAN_MODE_MASK            0x1
+#define ETH_TX_ERR_VALS_ILLEGAL_VLAN_MODE_SHIFT           0
+#define ETH_TX_ERR_VALS_PACKET_TOO_SMALL_MASK             0x1
+#define ETH_TX_ERR_VALS_PACKET_TOO_SMALL_SHIFT            1
+#define ETH_TX_ERR_VALS_ANTI_SPOOFING_ERR_MASK            0x1
+#define ETH_TX_ERR_VALS_ANTI_SPOOFING_ERR_SHIFT           2
+#define ETH_TX_ERR_VALS_ILLEGAL_INBAND_TAGS_MASK          0x1
+#define ETH_TX_ERR_VALS_ILLEGAL_INBAND_TAGS_SHIFT         3
+#define ETH_TX_ERR_VALS_VLAN_INSERTION_W_INBAND_TAG_MASK  0x1
+#define ETH_TX_ERR_VALS_VLAN_INSERTION_W_INBAND_TAG_SHIFT 4
+#define ETH_TX_ERR_VALS_MTU_VIOLATION_MASK                0x1
+#define ETH_TX_ERR_VALS_MTU_VIOLATION_SHIFT               5
+#define ETH_TX_ERR_VALS_ILLEGAL_CONTROL_FRAME_MASK        0x1
+#define ETH_TX_ERR_VALS_ILLEGAL_CONTROL_FRAME_SHIFT       6
+#define ETH_TX_ERR_VALS_RESERVED_MASK                     0x1FF
+#define ETH_TX_ERR_VALS_RESERVED_SHIFT                    7
+};
+
 struct eth_vport_rss_config {
 	__le16 capabilities;
 #define ETH_VPORT_RSS_CONFIG_IPV4_CAPABILITY_MASK	0x1
@@ -2669,12 +2879,8 @@ struct eth_vport_rss_config {
 #define ETH_VPORT_RSS_CONFIG_IPV6_UDP_CAPABILITY_SHIFT   5
 #define ETH_VPORT_RSS_CONFIG_EN_5_TUPLE_CAPABILITY_MASK  0x1
 #define ETH_VPORT_RSS_CONFIG_EN_5_TUPLE_CAPABILITY_SHIFT 6
-#define ETH_VPORT_RSS_CONFIG_CALC_4TUP_TCP_FRAG_MASK     0x1
-#define ETH_VPORT_RSS_CONFIG_CALC_4TUP_TCP_FRAG_SHIFT    7
-#define ETH_VPORT_RSS_CONFIG_CALC_4TUP_UDP_FRAG_MASK     0x1
-#define ETH_VPORT_RSS_CONFIG_CALC_4TUP_UDP_FRAG_SHIFT    8
-#define ETH_VPORT_RSS_CONFIG_RESERVED0_MASK	      0x7F
-#define ETH_VPORT_RSS_CONFIG_RESERVED0_SHIFT	     9
+#define ETH_VPORT_RSS_CONFIG_RESERVED0_MASK	      0x1FF
+#define ETH_VPORT_RSS_CONFIG_RESERVED0_SHIFT	     7
 	u8      rss_id;
 	u8      rss_mode;
 	u8      update_rss_key;
@@ -2749,10 +2955,14 @@ struct rx_queue_start_ramrod_data {
 	u8	      pxp_tph_valid_pkt;
 	u8	      pxp_st_hint;
 	__le16	  pxp_st_index;
-	u8	      reserved[4];
-	struct regpair  cqe_pbl_addr;
-	struct regpair  bd_base;
-	struct regpair  sge_base;
+	u8		pmd_mode;
+	u8		notify_en;
+	u8		toggle_val;
+	u8		reserved[7];
+	__le16		reserved1;
+	struct regpair	cqe_pbl_addr;
+	struct regpair	bd_base;
+	struct regpair	reserved2;
 };
 
 struct rx_queue_stop_ramrod_data {
@@ -2764,23 +2974,24 @@ struct rx_queue_stop_ramrod_data {
 };
 
 struct rx_queue_update_ramrod_data {
-	__le16	  rx_queue_id;
-	u8	      complete_cqe_flg;
-	u8	      complete_event_flg;
-	u8	      init_sge_ring_flg;
-	u8	      vport_id;
-	u8	      pxp_tph_valid_sge;
-	u8	      pxp_st_hint;
-	__le16	  pxp_st_index;
-	u8	      reserved[6];
-	struct regpair  sge_base;
+	__le16	rx_queue_id;
+	u8	complete_cqe_flg;
+	u8	complete_event_flg;
+	u8	vport_id;
+	u8	reserved[4];
+	u8	reserved1;
+	u8	reserved2;
+	u8	reserved3;
+	__le16	reserved4;
+	__le16	reserved5;
+	struct regpair reserved6;
 };
 
 struct tx_queue_start_ramrod_data {
 	__le16  sb_id;
 	u8      sb_index;
 	u8      vport_id;
-	u8      tc;
+	u8      reserved0;
 	u8      stats_counter_id;
 	__le16  qm_pq_id;
 	u8      flags;
@@ -2790,18 +3001,25 @@ struct tx_queue_start_ramrod_data {
 #define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_PKT_DUP_SHIFT     1
 #define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_TX_DEST_MASK      0x1
 #define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_TX_DEST_SHIFT     2
-#define TX_QUEUE_START_RAMROD_DATA_RESERVED0_MASK	      0x1F
-#define TX_QUEUE_START_RAMROD_DATA_RESERVED0_SHIFT	     3
-	u8	      pin_context;
-	u8	      pxp_tph_valid_bd;
-	u8	      pxp_tph_valid_pkt;
-	__le16	  pxp_st_index;
-	u8	      pxp_st_hint;
-	u8	      reserved1[3];
-	__le16	  queue_zone_id;
-	__le16	  test_dup_count;
-	__le16	  pbl_size;
-	struct regpair  pbl_base_addr;
+#define TX_QUEUE_START_RAMROD_DATA_PMD_MODE_MASK               0x1
+#define TX_QUEUE_START_RAMROD_DATA_PMD_MODE_SHIFT              3
+#define TX_QUEUE_START_RAMROD_DATA_NOTIFY_EN_MASK              0x1
+#define TX_QUEUE_START_RAMROD_DATA_NOTIFY_EN_SHIFT             4
+#define TX_QUEUE_START_RAMROD_DATA_PIN_CONTEXT_MASK            0x1
+#define TX_QUEUE_START_RAMROD_DATA_PIN_CONTEXT_SHIFT           5
+#define TX_QUEUE_START_RAMROD_DATA_RESERVED1_MASK              0x3
+#define TX_QUEUE_START_RAMROD_DATA_RESERVED1_SHIFT             6
+	u8	pxp_st_hint;
+	u8	pxp_tph_valid_bd;
+	u8	pxp_tph_valid_pkt;
+	__le16	pxp_st_index;
+	__le16	comp_agg_size;
+	__le16	queue_zone_id;
+	__le16	test_dup_count;
+	__le16	pbl_size;
+	__le16	tx_queue_id;
+	struct regpair	pbl_base_addr;
+	struct regpair	bd_cons_address;
 };
 
 struct tx_queue_stop_ramrod_data {
@@ -2822,16 +3040,16 @@ struct vport_start_ramrod_data {
 	struct eth_vport_rx_mode	rx_mode;
 	struct eth_vport_tx_mode	tx_mode;
 	struct eth_vport_tpa_param      tpa_param;
-	__le16			  sge_buff_size;
-	u8			      max_sges_num;
-	u8			      tx_switching_en;
-	u8			      anti_spoofing_en;
-	u8			      default_vlan_en;
-	u8			      handle_ptp_pkts;
-	u8			      silent_vlan_removal_en;
-	__le16			  default_vlan;
-	u8			      untagged;
-	u8			      reserved[7];
+	__le16				default_vlan;
+	u8				tx_switching_en;
+	u8				anti_spoofing_en;
+	u8				default_vlan_en;
+	u8				handle_ptp_pkts;
+	u8				silent_vlan_removal_en;
+	u8				untagged;
+	struct eth_tx_err_vals		tx_err_behav;
+	u8				zero_placement_offset;
+	u8				reserved[7];
 };
 
 struct vport_stop_ramrod_data {
@@ -2840,36 +3058,35 @@ struct vport_stop_ramrod_data {
 };
 
 struct vport_update_ramrod_data_cmn {
-	u8      vport_id;
-	u8      update_rx_active_flg;
-	u8      rx_active_flg;
-	u8      update_tx_active_flg;
-	u8      tx_active_flg;
-	u8      update_rx_mode_flg;
-	u8      update_tx_mode_flg;
-	u8      update_approx_mcast_flg;
-	u8      update_rss_flg;
-	u8      update_inner_vlan_removal_en_flg;
-	u8      inner_vlan_removal_en;
-	u8      update_tpa_param_flg;
-	u8      update_tpa_en_flg;
-	u8      update_sge_param_flg;
-	__le16  sge_buff_size;
-	u8      max_sges_num;
-	u8      update_tx_switching_en_flg;
-	u8      tx_switching_en;
-	u8      update_anti_spoofing_en_flg;
-	u8      anti_spoofing_en;
-	u8      update_handle_ptp_pkts;
-	u8      handle_ptp_pkts;
-	u8      update_default_vlan_en_flg;
-	u8      default_vlan_en;
-	u8      update_default_vlan_flg;
-	__le16  default_vlan;
-	u8      update_accept_any_vlan_flg;
-	u8      accept_any_vlan;
-	u8      silent_vlan_removal_en;
-	u8      reserved;
+	u8	vport_id;
+	u8	update_rx_active_flg;
+	u8	rx_active_flg;
+	u8	update_tx_active_flg;
+	u8	tx_active_flg;
+	u8	update_rx_mode_flg;
+	u8	update_tx_mode_flg;
+	u8	update_approx_mcast_flg;
+	u8	update_rss_flg;
+	u8	update_inner_vlan_removal_en_flg;
+	u8	inner_vlan_removal_en;
+	u8	update_tpa_param_flg;
+	u8	update_tpa_en_flg;
+	u8	update_tx_switching_en_flg;
+	u8	tx_switching_en;
+	u8	update_anti_spoofing_en_flg;
+	u8	anti_spoofing_en;
+	u8	update_handle_ptp_pkts;
+	u8	handle_ptp_pkts;
+	u8	update_default_vlan_en_flg;
+	u8	default_vlan_en;
+	u8	update_default_vlan_flg;
+	__le16	default_vlan;
+	u8	update_accept_any_vlan_flg;
+	u8	accept_any_vlan;
+	u8	silent_vlan_removal_en;
+	u8	update_mtu_flg;
+	__le16	mtu;
+	u8	reserved[2];
 };
 
 struct vport_update_ramrod_mcast {
@@ -2885,436 +3102,6 @@ struct vport_update_ramrod_data {
 	struct eth_vport_rss_config	     rss_config;
 };
 
-struct mstorm_eth_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_MASK  0x1   /* exist_in_qm0 */
-#define MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0
-#define MSTORM_ETH_CONN_AG_CTX_BIT1_MASK          0x1   /* exist_in_qm1 */
-#define MSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT         1
-#define MSTORM_ETH_CONN_AG_CTX_CF0_MASK           0x3   /* cf0 */
-#define MSTORM_ETH_CONN_AG_CTX_CF0_SHIFT          2
-#define MSTORM_ETH_CONN_AG_CTX_CF1_MASK           0x3   /* cf1 */
-#define MSTORM_ETH_CONN_AG_CTX_CF1_SHIFT          4
-#define MSTORM_ETH_CONN_AG_CTX_CF2_MASK           0x3   /* cf2 */
-#define MSTORM_ETH_CONN_AG_CTX_CF2_SHIFT          6
-	u8 flags1;
-#define MSTORM_ETH_CONN_AG_CTX_CF0EN_MASK         0x1   /* cf0en */
-#define MSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT        0
-#define MSTORM_ETH_CONN_AG_CTX_CF1EN_MASK         0x1   /* cf1en */
-#define MSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT        1
-#define MSTORM_ETH_CONN_AG_CTX_CF2EN_MASK         0x1   /* cf2en */
-#define MSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT        2
-#define MSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK       0x1   /* rule0en */
-#define MSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT      3
-#define MSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK       0x1   /* rule1en */
-#define MSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT      4
-#define MSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK       0x1   /* rule2en */
-#define MSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT      5
-#define MSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK       0x1   /* rule3en */
-#define MSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT      6
-#define MSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK       0x1   /* rule4en */
-#define MSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT      7
-	__le16	word0 /* word0 */;
-	__le16	word1 /* word1 */;
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-};
-
-struct tstorm_eth_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define TSTORM_ETH_CONN_AG_CTX_BIT0_MASK      0x1       /* exist_in_qm0 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT0_SHIFT     0
-#define TSTORM_ETH_CONN_AG_CTX_BIT1_MASK      0x1       /* exist_in_qm1 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT     1
-#define TSTORM_ETH_CONN_AG_CTX_BIT2_MASK      0x1       /* bit2 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT2_SHIFT     2
-#define TSTORM_ETH_CONN_AG_CTX_BIT3_MASK      0x1       /* bit3 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT3_SHIFT     3
-#define TSTORM_ETH_CONN_AG_CTX_BIT4_MASK      0x1       /* bit4 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT4_SHIFT     4
-#define TSTORM_ETH_CONN_AG_CTX_BIT5_MASK      0x1       /* bit5 */
-#define TSTORM_ETH_CONN_AG_CTX_BIT5_SHIFT     5
-#define TSTORM_ETH_CONN_AG_CTX_CF0_MASK       0x3       /* timer0cf */
-#define TSTORM_ETH_CONN_AG_CTX_CF0_SHIFT      6
-	u8 flags1;
-#define TSTORM_ETH_CONN_AG_CTX_CF1_MASK       0x3       /* timer1cf */
-#define TSTORM_ETH_CONN_AG_CTX_CF1_SHIFT      0
-#define TSTORM_ETH_CONN_AG_CTX_CF2_MASK       0x3       /* timer2cf */
-#define TSTORM_ETH_CONN_AG_CTX_CF2_SHIFT      2
-#define TSTORM_ETH_CONN_AG_CTX_CF3_MASK       0x3       /* timer_stop_all */
-#define TSTORM_ETH_CONN_AG_CTX_CF3_SHIFT      4
-#define TSTORM_ETH_CONN_AG_CTX_CF4_MASK       0x3       /* cf4 */
-#define TSTORM_ETH_CONN_AG_CTX_CF4_SHIFT      6
-	u8 flags2;
-#define TSTORM_ETH_CONN_AG_CTX_CF5_MASK       0x3       /* cf5 */
-#define TSTORM_ETH_CONN_AG_CTX_CF5_SHIFT      0
-#define TSTORM_ETH_CONN_AG_CTX_CF6_MASK       0x3       /* cf6 */
-#define TSTORM_ETH_CONN_AG_CTX_CF6_SHIFT      2
-#define TSTORM_ETH_CONN_AG_CTX_CF7_MASK       0x3       /* cf7 */
-#define TSTORM_ETH_CONN_AG_CTX_CF7_SHIFT      4
-#define TSTORM_ETH_CONN_AG_CTX_CF8_MASK       0x3       /* cf8 */
-#define TSTORM_ETH_CONN_AG_CTX_CF8_SHIFT      6
-	u8 flags3;
-#define TSTORM_ETH_CONN_AG_CTX_CF9_MASK       0x3       /* cf9 */
-#define TSTORM_ETH_CONN_AG_CTX_CF9_SHIFT      0
-#define TSTORM_ETH_CONN_AG_CTX_CF10_MASK      0x3       /* cf10 */
-#define TSTORM_ETH_CONN_AG_CTX_CF10_SHIFT     2
-#define TSTORM_ETH_CONN_AG_CTX_CF0EN_MASK     0x1       /* cf0en */
-#define TSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT    4
-#define TSTORM_ETH_CONN_AG_CTX_CF1EN_MASK     0x1       /* cf1en */
-#define TSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT    5
-#define TSTORM_ETH_CONN_AG_CTX_CF2EN_MASK     0x1       /* cf2en */
-#define TSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT    6
-#define TSTORM_ETH_CONN_AG_CTX_CF3EN_MASK     0x1       /* cf3en */
-#define TSTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT    7
-	u8 flags4;
-#define TSTORM_ETH_CONN_AG_CTX_CF4EN_MASK     0x1       /* cf4en */
-#define TSTORM_ETH_CONN_AG_CTX_CF4EN_SHIFT    0
-#define TSTORM_ETH_CONN_AG_CTX_CF5EN_MASK     0x1       /* cf5en */
-#define TSTORM_ETH_CONN_AG_CTX_CF5EN_SHIFT    1
-#define TSTORM_ETH_CONN_AG_CTX_CF6EN_MASK     0x1       /* cf6en */
-#define TSTORM_ETH_CONN_AG_CTX_CF6EN_SHIFT    2
-#define TSTORM_ETH_CONN_AG_CTX_CF7EN_MASK     0x1       /* cf7en */
-#define TSTORM_ETH_CONN_AG_CTX_CF7EN_SHIFT    3
-#define TSTORM_ETH_CONN_AG_CTX_CF8EN_MASK     0x1       /* cf8en */
-#define TSTORM_ETH_CONN_AG_CTX_CF8EN_SHIFT    4
-#define TSTORM_ETH_CONN_AG_CTX_CF9EN_MASK     0x1       /* cf9en */
-#define TSTORM_ETH_CONN_AG_CTX_CF9EN_SHIFT    5
-#define TSTORM_ETH_CONN_AG_CTX_CF10EN_MASK    0x1       /* cf10en */
-#define TSTORM_ETH_CONN_AG_CTX_CF10EN_SHIFT   6
-#define TSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK   0x1       /* rule0en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT  7
-	u8 flags5;
-#define TSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK   0x1       /* rule1en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT  0
-#define TSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK   0x1       /* rule2en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT  1
-#define TSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK   0x1       /* rule3en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT  2
-#define TSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK   0x1       /* rule4en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT  3
-#define TSTORM_ETH_CONN_AG_CTX_RULE5EN_MASK   0x1       /* rule5en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT  4
-#define TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_MASK  0x1       /* rule6en */
-#define TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_SHIFT 5
-#define TSTORM_ETH_CONN_AG_CTX_RULE7EN_MASK   0x1       /* rule7en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT  6
-#define TSTORM_ETH_CONN_AG_CTX_RULE8EN_MASK   0x1       /* rule8en */
-#define TSTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT  7
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-	__le32	reg2 /* reg2 */;
-	__le32	reg3 /* reg3 */;
-	__le32	reg4 /* reg4 */;
-	__le32	reg5 /* reg5 */;
-	__le32	reg6 /* reg6 */;
-	__le32	reg7 /* reg7 */;
-	__le32	reg8 /* reg8 */;
-	u8	byte2 /* byte2 */;
-	u8	byte3 /* byte3 */;
-	__le16	rx_bd_cons /* word0 */;
-	u8	byte4 /* byte4 */;
-	u8	byte5 /* byte5 */;
-	__le16	rx_bd_prod /* word1 */;
-	__le16	word2 /* conn_dpi */;
-	__le16	word3 /* word3 */;
-	__le32	reg9 /* reg9 */;
-	__le32	reg10 /* reg10 */;
-};
-
-struct ustorm_eth_conn_ag_ctx {
-	u8	byte0 /* cdu_validation */;
-	u8	byte1 /* state */;
-	u8	flags0;
-#define USTORM_ETH_CONN_AG_CTX_BIT0_MASK                  0x1
-#define USTORM_ETH_CONN_AG_CTX_BIT0_SHIFT                 0
-#define USTORM_ETH_CONN_AG_CTX_BIT1_MASK                  0x1
-#define USTORM_ETH_CONN_AG_CTX_BIT1_SHIFT                 1
-#define USTORM_ETH_CONN_AG_CTX_CF0_MASK                   0x3   /* timer0cf */
-#define USTORM_ETH_CONN_AG_CTX_CF0_SHIFT                  2
-#define USTORM_ETH_CONN_AG_CTX_CF1_MASK                   0x3   /* timer1cf */
-#define USTORM_ETH_CONN_AG_CTX_CF1_SHIFT                  4
-#define USTORM_ETH_CONN_AG_CTX_CF2_MASK                   0x3   /* timer2cf */
-#define USTORM_ETH_CONN_AG_CTX_CF2_SHIFT                  6
-	u8 flags1;
-#define USTORM_ETH_CONN_AG_CTX_CF3_MASK                   0x3
-#define USTORM_ETH_CONN_AG_CTX_CF3_SHIFT                  0
-#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_MASK             0x3   /* cf4 */
-#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_SHIFT            2
-#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_MASK             0x3   /* cf5 */
-#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_SHIFT            4
-#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_MASK     0x3   /* cf6 */
-#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_SHIFT    6
-	u8 flags2;
-#define USTORM_ETH_CONN_AG_CTX_CF0EN_MASK                 0x1   /* cf0en */
-#define USTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT                0
-#define USTORM_ETH_CONN_AG_CTX_CF1EN_MASK                 0x1   /* cf1en */
-#define USTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT                1
-#define USTORM_ETH_CONN_AG_CTX_CF2EN_MASK                 0x1   /* cf2en */
-#define USTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT                2
-#define USTORM_ETH_CONN_AG_CTX_CF3EN_MASK                 0x1   /* cf3en */
-#define USTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT                3
-#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_MASK          0x1   /* cf4en */
-#define USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_SHIFT         4
-#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_MASK          0x1   /* cf5en */
-#define USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_SHIFT         5
-#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_MASK  0x1   /* cf6en */
-#define USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_SHIFT 6
-#define USTORM_ETH_CONN_AG_CTX_RULE0EN_MASK               0x1   /* rule0en */
-#define USTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT              7
-	u8 flags3;
-#define USTORM_ETH_CONN_AG_CTX_RULE1EN_MASK               0x1   /* rule1en */
-#define USTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT              0
-#define USTORM_ETH_CONN_AG_CTX_RULE2EN_MASK               0x1   /* rule2en */
-#define USTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT              1
-#define USTORM_ETH_CONN_AG_CTX_RULE3EN_MASK               0x1   /* rule3en */
-#define USTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT              2
-#define USTORM_ETH_CONN_AG_CTX_RULE4EN_MASK               0x1   /* rule4en */
-#define USTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT              3
-#define USTORM_ETH_CONN_AG_CTX_RULE5EN_MASK               0x1   /* rule5en */
-#define USTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT              4
-#define USTORM_ETH_CONN_AG_CTX_RULE6EN_MASK               0x1   /* rule6en */
-#define USTORM_ETH_CONN_AG_CTX_RULE6EN_SHIFT              5
-#define USTORM_ETH_CONN_AG_CTX_RULE7EN_MASK               0x1   /* rule7en */
-#define USTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT              6
-#define USTORM_ETH_CONN_AG_CTX_RULE8EN_MASK               0x1   /* rule8en */
-#define USTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT              7
-	u8	byte2 /* byte2 */;
-	u8	byte3 /* byte3 */;
-	__le16	word0 /* conn_dpi */;
-	__le16	tx_bd_cons /* word1 */;
-	__le32	reg0 /* reg0 */;
-	__le32	reg1 /* reg1 */;
-	__le32	reg2 /* reg2 */;
-	__le32	reg3 /* reg3 */;
-	__le16	tx_drv_bd_cons /* word2 */;
-	__le16	rx_drv_cqe_cons /* word3 */;
-};
-
-struct xstorm_eth_hw_conn_ag_ctx {
-	u8	reserved0 /* cdu_validation */;
-	u8	eth_state /* state */;
-	u8	flags0;
-#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_SHIFT           0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_SHIFT              1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_SHIFT              2
-#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_SHIFT           3
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_SHIFT              4
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_SHIFT              5
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_SHIFT              6
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_SHIFT              7
-	u8 flags1;
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_SHIFT              0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_SHIFT              1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_MASK               0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_SHIFT              2
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT11_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT11_SHIFT                  3
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT12_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT12_SHIFT                  4
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT13_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_BIT13_SHIFT                  5
-#define XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_MASK          0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_SHIFT         6
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_SHIFT           7
-	u8 flags2;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF0_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF0_SHIFT                    0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF1_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF1_SHIFT                    2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF2_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF2_SHIFT                    4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF3_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF3_SHIFT                    6
-	u8 flags3;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF4_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF4_SHIFT                    0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF5_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF5_SHIFT                    2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF6_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF6_SHIFT                    4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF7_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF7_SHIFT                    6
-	u8 flags4;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF8_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF8_SHIFT                    0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF9_MASK                     0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF9_SHIFT                    2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF10_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF10_SHIFT                   4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF11_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF11_SHIFT                   6
-	u8 flags5;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF12_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF12_SHIFT                   0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF13_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF13_SHIFT                   2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF14_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF14_SHIFT                   4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF15_MASK                    0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF15_SHIFT                   6
-	u8 flags6;
-#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_MASK        0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_SHIFT       0
-#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_MASK        0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_SHIFT       2
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_MASK                   0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_SHIFT                  4
-#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_MASK            0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_SHIFT           6
-	u8 flags7;
-#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_MASK                0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_SHIFT               0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_MASK              0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_SHIFT             2
-#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_MASK               0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_SHIFT              4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_SHIFT                  6
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_SHIFT                  7
-	u8 flags8;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_SHIFT                  0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_SHIFT                  1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_SHIFT                  2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_SHIFT                  3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_SHIFT                  4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_SHIFT                  5
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_SHIFT                  6
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_MASK                   0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_SHIFT                  7
-	u8 flags9;
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_SHIFT                 0
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_SHIFT                 1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_SHIFT                 2
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_SHIFT                 3
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_SHIFT                 4
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_MASK                  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_SHIFT                 5
-#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_MASK     0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_SHIFT    6
-#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_MASK     0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_SHIFT    7
-	u8 flags10;
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_SHIFT               0
-#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_MASK         0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_SHIFT        1
-#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_MASK             0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT            2
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_MASK              0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_SHIFT             3
-#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_SHIFT           4
-#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_MASK  0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_SHIFT 5
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_MASK              0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_SHIFT             6
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_MASK              0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_SHIFT             7
-	u8 flags11;
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_MASK              0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_SHIFT             0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_MASK              0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_SHIFT             1
-#define XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_MASK          0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_SHIFT         2
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_MASK                 0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_SHIFT                3
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_MASK                 0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_SHIFT                4
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_MASK                 0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_SHIFT                5
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_SHIFT           6
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_MASK                 0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_SHIFT                7
-	u8 flags12;
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_SHIFT               0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_SHIFT               1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_SHIFT           2
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_SHIFT           3
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_SHIFT               4
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_SHIFT               5
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_SHIFT               6
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_SHIFT               7
-	u8 flags13;
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_SHIFT               0
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_MASK                0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_SHIFT               1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_SHIFT           2
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_SHIFT           3
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_SHIFT           4
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_SHIFT           5
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_SHIFT           6
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_MASK            0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_SHIFT           7
-	u8 flags14;
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_MASK        0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_SHIFT       0
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_MASK      0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_SHIFT     1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_MASK    0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_SHIFT   2
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_MASK    0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_SHIFT   3
-#define XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_MASK          0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_SHIFT         4
-#define XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK        0x1
-#define XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT       5
-#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_MASK              0x3
-#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_SHIFT             6
-	u8	edpm_event_id /* byte2 */;
-	__le16	physical_q0 /* physical_q0 */;
-	__le16	word1 /* physical_q1 */;
-	__le16	edpm_num_bds /* physical_q2 */;
-	__le16	tx_bd_cons /* word3 */;
-	__le16	tx_bd_prod /* word4 */;
-	__le16	go_to_bd_cons /* word5 */;
-	__le16	conn_dpi /* conn_dpi */;
-};
-
 #define VF_MAX_STATIC 192       /* In case of K2 */
 
 #define MCP_GLOB_PATH_MAX       2
@@ -3818,6 +3605,10 @@ struct public_port {
 	struct dcbx_local_params		local_admin_dcbx_mib;
 	struct dcbx_mib				remote_dcbx_mib;
 	struct dcbx_mib				operational_dcbx_mib;
+
+	u32					fc_npiv_nvram_tbl_addr;
+	u32					fc_npiv_nvram_tbl_size;
+	u32					transceiver_data;
 };
 
 /**************************************/
@@ -3830,7 +3621,11 @@ struct public_func {
 	u32	iscsi_boot_signature;
 	u32	iscsi_boot_block_offset;
 
-	u32	reserved[8];
+	u32	mtu_size;
+	u32	c2s_pcp_map_lower;
+	u32	c2s_pcp_map_upper;
+	u32	c2s_pcp_map_default;
+	u32	reserved[4];
 
 	u32	config;
 
@@ -3894,10 +3689,10 @@ struct public_func {
 #define DRV_ID_MCP_HSI_VER_SHIFT        16
 #define DRV_ID_MCP_HSI_VER_CURRENT	BIT(DRV_ID_MCP_HSI_VER_SHIFT)
 
-#define DRV_ID_DRV_TYPE_MASK            0xff000000
+#define DRV_ID_DRV_TYPE_MASK            0x7f000000
 #define DRV_ID_DRV_TYPE_SHIFT           24
 #define DRV_ID_DRV_TYPE_UNKNOWN         (0 << DRV_ID_DRV_TYPE_SHIFT)
-#define DRV_ID_DRV_TYPE_LINUX		BIT(DRV_ID_DRV_TYPE_SHIFT)
+#define DRV_ID_DRV_TYPE_LINUX           (1 << DRV_ID_DRV_TYPE_SHIFT)
 #define DRV_ID_DRV_TYPE_WINDOWS         (2 << DRV_ID_DRV_TYPE_SHIFT)
 #define DRV_ID_DRV_TYPE_DIAG            (3 << DRV_ID_DRV_TYPE_SHIFT)
 #define DRV_ID_DRV_TYPE_PREBOOT         (4 << DRV_ID_DRV_TYPE_SHIFT)
@@ -3905,6 +3700,10 @@ struct public_func {
 #define DRV_ID_DRV_TYPE_VMWARE          (6 << DRV_ID_DRV_TYPE_SHIFT)
 #define DRV_ID_DRV_TYPE_FREEBSD         (7 << DRV_ID_DRV_TYPE_SHIFT)
 #define DRV_ID_DRV_TYPE_AIX             (8 << DRV_ID_DRV_TYPE_SHIFT)
+
+#define DRV_ID_DRV_INIT_HW_MASK         0x80000000
+#define DRV_ID_DRV_INIT_HW_SHIFT        31
+#define DRV_ID_DRV_INIT_HW_FLAG         BIT(DRV_ID_DRV_INIT_HW_SHIFT)
 };
 
 /**************************************/
@@ -3964,6 +3763,7 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_MASK                       0xffff0000
 #define DRV_MSG_CODE_LOAD_REQ                   0x10000000
 #define DRV_MSG_CODE_LOAD_DONE                  0x11000000
+#define DRV_MSG_CODE_INIT_HW                    0x12000000
 #define DRV_MSG_CODE_UNLOAD_REQ                 0x20000000
 #define DRV_MSG_CODE_UNLOAD_DONE                0x21000000
 #define DRV_MSG_CODE_INIT_PHY                   0x22000000
@@ -4100,6 +3900,7 @@ struct public_drv_mb {
 #define FW_MSG_CODE_SET_SECURE_MODE_ERROR       0x00130000
 #define FW_MSG_CODE_SET_SECURE_MODE_OK          0x00140000
 #define FW_MSG_MODE_PHY_PRIVILEGE_ERROR         0x00150000
+#define FW_MSG_CODE_OK                          0x00160000
 
 #define FW_MSG_SEQ_NUMBER_MASK                  0x0000ffff
 
@@ -4212,7 +4013,7 @@ struct nvm_cfg1_glob {
 #define NVM_CFG1_GLOB_MF_MODE_MASK                              0x00000FF0
 #define NVM_CFG1_GLOB_MF_MODE_OFFSET                            4
 #define NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED                        0x0
-#define NVM_CFG1_GLOB_MF_MODE_FORCED_SF                         0x1
+#define NVM_CFG1_GLOB_MF_MODE_DEFAULT                           0x1
 #define NVM_CFG1_GLOB_MF_MODE_SPIO4                             0x2
 #define NVM_CFG1_GLOB_MF_MODE_NPAR1_0                           0x3
 #define NVM_CFG1_GLOB_MF_MODE_NPAR1_5                           0x4
@@ -4643,8 +4444,12 @@ struct nvm_cfg1_glob {
 #define NVM_CFG1_GLOB_I2C_MUX_SEL_GPIO__GPIO29                  0x1E
 #define NVM_CFG1_GLOB_I2C_MUX_SEL_GPIO__GPIO30                  0x1F
 #define NVM_CFG1_GLOB_I2C_MUX_SEL_GPIO__GPIO31                  0x20
-
-	u32 reserved[46];					/* 0x88 */
+	u32	device_capabilities;                            /* 0x88 */
+#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET              0x1
+	u32	power_dissipated;                               /* 0x8C */
+	u32 power_consumed;                                     /* 0x90 */
+	u32	efi_version;                                    /* 0x94 */
+	u32	reserved[42];                                   /* 0x98 */
 };
 
 struct nvm_cfg1_path {
@@ -4652,26 +4457,8 @@ struct nvm_cfg1_path {
 };
 
 struct nvm_cfg1_port {
-	u32 power_dissipated;					/* 0x0 */
-#define NVM_CFG1_PORT_POWER_DIS_D0_MASK                         0x000000FF
-#define NVM_CFG1_PORT_POWER_DIS_D0_OFFSET                       0
-#define NVM_CFG1_PORT_POWER_DIS_D1_MASK                         0x0000FF00
-#define NVM_CFG1_PORT_POWER_DIS_D1_OFFSET                       8
-#define NVM_CFG1_PORT_POWER_DIS_D2_MASK                         0x00FF0000
-#define NVM_CFG1_PORT_POWER_DIS_D2_OFFSET                       16
-#define NVM_CFG1_PORT_POWER_DIS_D3_MASK                         0xFF000000
-#define NVM_CFG1_PORT_POWER_DIS_D3_OFFSET                       24
-
-	u32 power_consumed;					/* 0x4 */
-#define NVM_CFG1_PORT_POWER_CONS_D0_MASK                        0x000000FF
-#define NVM_CFG1_PORT_POWER_CONS_D0_OFFSET                      0
-#define NVM_CFG1_PORT_POWER_CONS_D1_MASK                        0x0000FF00
-#define NVM_CFG1_PORT_POWER_CONS_D1_OFFSET                      8
-#define NVM_CFG1_PORT_POWER_CONS_D2_MASK                        0x00FF0000
-#define NVM_CFG1_PORT_POWER_CONS_D2_OFFSET                      16
-#define NVM_CFG1_PORT_POWER_CONS_D3_MASK                        0xFF000000
-#define NVM_CFG1_PORT_POWER_CONS_D3_OFFSET                      24
-
+	u32	reserved__m_relocated_to_option_123;           /* 0x0 */
+	u32	reserved__m_relocated_to_option_124;           /* 0x4 */
 	u32 generic_cont0;					/* 0x8 */
 #define NVM_CFG1_PORT_LED_MODE_MASK                             0x000000FF
 #define NVM_CFG1_PORT_LED_MODE_OFFSET                           0
@@ -4699,7 +4486,9 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_DCBX_MODE_IEEE                            0x1
 #define NVM_CFG1_PORT_DCBX_MODE_CEE                             0x2
 #define NVM_CFG1_PORT_DCBX_MODE_DYNAMIC                         0x3
-
+#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_MASK            0x00F00000
+#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_OFFSET          20
+#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_ETHERNET        0x1
 	u32	pcie_cfg;					/* 0xC */
 #define NVM_CFG1_PORT_RESERVED15_MASK                           0x00000007
 #define NVM_CFG1_PORT_RESERVED15_OFFSET                         0
@@ -4784,10 +4573,11 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_SERDES_NET_INTERFACE_SFI                  0x9
 #define NVM_CFG1_PORT_SERDES_NET_INTERFACE_1000X                0xB
 #define NVM_CFG1_PORT_SERDES_NET_INTERFACE_SGMII                0xC
-#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_XLAUI                0xD
-#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_CAUI                 0xE
-#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_XLPPI                0xF
-#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_CPPI                 0x10
+#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_XLAUI                0x11
+#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_XLPPI                0x12
+#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_CAUI                 0x21
+#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_CPPI                 0x22
+#define NVM_CFG1_PORT_SERDES_NET_INTERFACE_25GAUI               0x31
 #define NVM_CFG1_PORT_AN_MODE_MASK                              0xFF000000
 #define NVM_CFG1_PORT_AN_MODE_OFFSET                            24
 #define NVM_CFG1_PORT_AN_MODE_NONE                              0x0
@@ -4801,9 +4591,6 @@ struct nvm_cfg1_port {
 	u32 mgmt_traffic;					/* 0x20 */
 #define NVM_CFG1_PORT_RESERVED61_MASK                           0x0000000F
 #define NVM_CFG1_PORT_RESERVED61_OFFSET                         0
-#define NVM_CFG1_PORT_RESERVED61_DISABLED                       0x0
-#define NVM_CFG1_PORT_RESERVED61_NCSI_OVER_RMII                 0x1
-#define NVM_CFG1_PORT_RESERVED61_NCSI_OVER_SMBUS                0x2
 
 	u32 ext_phy;						/* 0x24 */
 #define NVM_CFG1_PORT_EXTERNAL_PHY_TYPE_MASK                    0x000000FF
@@ -4814,16 +4601,12 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_EXTERNAL_PHY_ADDRESS_OFFSET               8
 
 	u32 mba_cfg1;						/* 0x28 */
-#define NVM_CFG1_PORT_MBA_MASK                                  0x00000001
-#define NVM_CFG1_PORT_MBA_OFFSET                                0
-#define NVM_CFG1_PORT_MBA_DISABLED                              0x0
-#define NVM_CFG1_PORT_MBA_ENABLED                               0x1
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_MASK                        0x00000006
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_OFFSET                      1
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_AUTO                        0x0
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_BBS                         0x1
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_INT18H                      0x2
-#define NVM_CFG1_PORT_MBA_BOOT_TYPE_INT19H                      0x3
+#define NVM_CFG1_PORT_PREBOOT_OPROM_MASK                        0x00000001
+#define NVM_CFG1_PORT_PREBOOT_OPROM_OFFSET                      0
+#define NVM_CFG1_PORT_PREBOOT_OPROM_DISABLED                    0x0
+#define NVM_CFG1_PORT_PREBOOT_OPROM_ENABLED                     0x1
+#define NVM_CFG1_PORT_RESERVED__M_MBA_BOOT_TYPE_MASK            0x00000006
+#define NVM_CFG1_PORT_RESERVED__M_MBA_BOOT_TYPE_OFFSET          1
 #define NVM_CFG1_PORT_MBA_DELAY_TIME_MASK                       0x00000078
 #define NVM_CFG1_PORT_MBA_DELAY_TIME_OFFSET                     3
 #define NVM_CFG1_PORT_MBA_SETUP_HOT_KEY_MASK                    0x00000080
@@ -4836,61 +4619,30 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_MBA_HIDE_SETUP_PROMPT_ENABLED             0x1
 #define NVM_CFG1_PORT_RESERVED5_MASK                            0x0001FE00
 #define NVM_CFG1_PORT_RESERVED5_OFFSET                          9
-#define NVM_CFG1_PORT_RESERVED5_DISABLED                        0x0
-#define NVM_CFG1_PORT_RESERVED5_2K                              0x1
-#define NVM_CFG1_PORT_RESERVED5_4K                              0x2
-#define NVM_CFG1_PORT_RESERVED5_8K                              0x3
-#define NVM_CFG1_PORT_RESERVED5_16K                             0x4
-#define NVM_CFG1_PORT_RESERVED5_32K                             0x5
-#define NVM_CFG1_PORT_RESERVED5_64K                             0x6
-#define NVM_CFG1_PORT_RESERVED5_128K                            0x7
-#define NVM_CFG1_PORT_RESERVED5_256K                            0x8
-#define NVM_CFG1_PORT_RESERVED5_512K                            0x9
-#define NVM_CFG1_PORT_RESERVED5_1M                              0xA
-#define NVM_CFG1_PORT_RESERVED5_2M                              0xB
-#define NVM_CFG1_PORT_RESERVED5_4M                              0xC
-#define NVM_CFG1_PORT_RESERVED5_8M                              0xD
-#define NVM_CFG1_PORT_RESERVED5_16M                             0xE
-#define NVM_CFG1_PORT_RESERVED5_32M                             0xF
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_MASK                       0x001E0000
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_OFFSET                     17
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_AUTONEG                    0x0
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_1G                         0x1
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_10G                        0x2
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_25G                        0x4
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_40G                        0x5
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_50G                        0x6
-#define NVM_CFG1_PORT_MBA_LINK_SPEED_100G                       0x7
-#define NVM_CFG1_PORT_MBA_BOOT_RETRY_COUNT_MASK                 0x00E00000
-#define NVM_CFG1_PORT_MBA_BOOT_RETRY_COUNT_OFFSET               21
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_MASK                   0x001E0000
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_OFFSET                 17
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_AUTONEG                0x0
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_1G                     0x1
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_10G                    0x2
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_25G                    0x4
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_40G                    0x5
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_50G                    0x6
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_100G                   0x7
+#define NVM_CFG1_PORT_PREBOOT_LINK_SPEED_SMARTLINQ              0x8
+#define NVM_CFG1_PORT_RESERVED__M_MBA_BOOT_RETRY_COUNT_MASK     0x00E00000
+#define NVM_CFG1_PORT_RESERVED__M_MBA_BOOT_RETRY_COUNT_OFFSET   21
 
 	u32	mba_cfg2;					/* 0x2C */
-#define NVM_CFG1_PORT_MBA_VLAN_VALUE_MASK                       0x0000FFFF
-#define NVM_CFG1_PORT_MBA_VLAN_VALUE_OFFSET                     0
-#define NVM_CFG1_PORT_MBA_VLAN_MASK                             0x00010000
-#define NVM_CFG1_PORT_MBA_VLAN_OFFSET                           16
+#define NVM_CFG1_PORT_RESERVED65_MASK                           0x0000FFFF
+#define NVM_CFG1_PORT_RESERVED65_OFFSET                         0
+#define NVM_CFG1_PORT_RESERVED66_MASK                           0x00010000
+#define NVM_CFG1_PORT_RESERVED66_OFFSET                         16
 
 	u32	vf_cfg;						/* 0x30 */
 #define NVM_CFG1_PORT_RESERVED8_MASK                            0x0000FFFF
 #define NVM_CFG1_PORT_RESERVED8_OFFSET                          0
 #define NVM_CFG1_PORT_RESERVED6_MASK                            0x000F0000
 #define NVM_CFG1_PORT_RESERVED6_OFFSET                          16
-#define NVM_CFG1_PORT_RESERVED6_DISABLED                        0x0
-#define NVM_CFG1_PORT_RESERVED6_4K                              0x1
-#define NVM_CFG1_PORT_RESERVED6_8K                              0x2
-#define NVM_CFG1_PORT_RESERVED6_16K                             0x3
-#define NVM_CFG1_PORT_RESERVED6_32K                             0x4
-#define NVM_CFG1_PORT_RESERVED6_64K                             0x5
-#define NVM_CFG1_PORT_RESERVED6_128K                            0x6
-#define NVM_CFG1_PORT_RESERVED6_256K                            0x7
-#define NVM_CFG1_PORT_RESERVED6_512K                            0x8
-#define NVM_CFG1_PORT_RESERVED6_1M                              0x9
-#define NVM_CFG1_PORT_RESERVED6_2M                              0xA
-#define NVM_CFG1_PORT_RESERVED6_4M                              0xB
-#define NVM_CFG1_PORT_RESERVED6_8M                              0xC
-#define NVM_CFG1_PORT_RESERVED6_16M                             0xD
-#define NVM_CFG1_PORT_RESERVED6_32M                             0xE
-#define NVM_CFG1_PORT_RESERVED6_64M                             0xF
 
 	struct nvm_cfg_mac_address	lldp_mac_address;	/* 0x34 */
 
@@ -4973,18 +4725,16 @@ struct nvm_cfg1_func {
 	u32				device_id;		/* 0x10 */
 #define NVM_CFG1_FUNC_MF_VENDOR_DEVICE_ID_MASK                  0x0000FFFF
 #define NVM_CFG1_FUNC_MF_VENDOR_DEVICE_ID_OFFSET                0
-#define NVM_CFG1_FUNC_VENDOR_DEVICE_ID_MASK                     0xFFFF0000
-#define NVM_CFG1_FUNC_VENDOR_DEVICE_ID_OFFSET                   16
+#define NVM_CFG1_FUNC_RESERVED77_MASK                           0xFFFF0000
+#define NVM_CFG1_FUNC_RESERVED77_OFFSET                         16
 
 	u32				cmn_cfg;		/* 0x14 */
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_MASK                    0x00000007
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_OFFSET                  0
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_PXE                     0x0
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_RPL                     0x1
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_BOOTP                   0x2
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_ISCSI_BOOT              0x3
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_FCOE_BOOT               0x4
-#define NVM_CFG1_FUNC_MBA_BOOT_PROTOCOL_NONE                    0x7
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_MASK                0x00000007
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_OFFSET              0
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_PXE                 0x0
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_ISCSI_BOOT          0x3
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_FCOE_BOOT           0x4
+#define NVM_CFG1_FUNC_PREBOOT_BOOT_PROTOCOL_NONE                0x7
 #define NVM_CFG1_FUNC_VF_PCI_DEVICE_ID_MASK                     0x0007FFF8
 #define NVM_CFG1_FUNC_VF_PCI_DEVICE_ID_OFFSET                   3
 #define NVM_CFG1_FUNC_PERSONALITY_MASK                          0x00780000
@@ -5029,8 +4779,8 @@ struct nvm_cfg1_func {
 	struct nvm_cfg_mac_address	fcoe_node_wwn_mac_addr;	/* 0x1C */
 
 	struct nvm_cfg_mac_address	fcoe_port_wwn_mac_addr;	/* 0x24 */
-
-	u32				reserved[9];		/* 0x2C */
+	u32				preboot_generic_cfg;    /* 0x2C */
+	u32				reserved[8];            /* 0x30 */
 };
 
 struct nvm_cfg1 {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
index 0b21a553cc7d..f55ebdc3c832 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
@@ -513,17 +513,14 @@ static int qed_pf_rl_rt_init(struct qed_hwfn *p_hwfn,
  * Return -1 on error.
  */
 static int qed_vp_wfq_rt_init(struct qed_hwfn *p_hwfn,
-			      u8 start_vport,
 			      u8 num_vports,
 			      struct init_qm_vport_params *vport_params)
 {
-	u8 tc, i, vport_id;
 	u32 inc_val;
+	u8 tc, i;
 
 	/* go over all PF VPORTs */
-	for (i = 0, vport_id = start_vport; i < num_vports; i++, vport_id++) {
-		u32 temp = QM_REG_WFQVPUPPERBOUND_RT_OFFSET;
-		u16 *pq_ids = &vport_params[i].first_tx_pq_id[0];
+	for (i = 0; i < num_vports; i++) {
 
 		if (!vport_params[i].vport_wfq)
 			continue;
@@ -539,20 +536,16 @@ static int qed_vp_wfq_rt_init(struct qed_hwfn *p_hwfn,
 		 * different TCs
 		 */
 		for (tc = 0; tc < NUM_OF_TCS; tc++) {
-			u16 vport_pq_id = pq_ids[tc];
+			u16 vport_pq_id = vport_params[i].first_tx_pq_id[tc];
 
 			if (vport_pq_id != QM_INVALID_PQ_ID) {
-				STORE_RT_REG(p_hwfn,
-					     QM_REG_WFQVPWEIGHT_RT_OFFSET +
-					     vport_pq_id, inc_val);
-				STORE_RT_REG(p_hwfn, temp + vport_pq_id,
-					     QM_WFQ_UPPER_BOUND |
-					     QM_WFQ_CRD_REG_SIGN_BIT);
 				STORE_RT_REG(p_hwfn,
 					     QM_REG_WFQVPCRD_RT_OFFSET +
 					     vport_pq_id,
-					     QM_WFQ_INIT_CRD(inc_val) |
 					     QM_WFQ_CRD_REG_SIGN_BIT);
+				STORE_RT_REG(p_hwfn,
+					     QM_REG_WFQVPWEIGHT_RT_OFFSET +
+					     vport_pq_id, inc_val);
 			}
 		}
 	}
@@ -709,8 +702,7 @@ int qed_qm_pf_rt_init(struct qed_hwfn *p_hwfn,
 	if (qed_pf_rl_rt_init(p_hwfn, p_params->pf_id, p_params->pf_rl))
 		return -1;
 
-	if (qed_vp_wfq_rt_init(p_hwfn, p_params->start_vport,
-			       p_params->num_vports, vport_params))
+	if (qed_vp_wfq_rt_init(p_hwfn, p_params->num_vports, vport_params))
 		return -1;
 
 	if (qed_vport_rl_rt_init(p_hwfn, p_params->start_vport,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
index 796f1390e598..3269b3610e03 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
@@ -55,63 +55,103 @@ void qed_init_clear_rt_data(struct qed_hwfn *p_hwfn)
 	int i;
 
 	for (i = 0; i < RUNTIME_ARRAY_SIZE; i++)
-		p_hwfn->rt_data[i].b_valid = false;
+		p_hwfn->rt_data.b_valid[i] = false;
 }
 
 void qed_init_store_rt_reg(struct qed_hwfn *p_hwfn,
 			   u32 rt_offset,
 			   u32 val)
 {
-	p_hwfn->rt_data[rt_offset].init_val = val;
-	p_hwfn->rt_data[rt_offset].b_valid = true;
+	p_hwfn->rt_data.init_val[rt_offset] = val;
+	p_hwfn->rt_data.b_valid[rt_offset] = true;
 }
 
 void qed_init_store_rt_agg(struct qed_hwfn *p_hwfn,
-			   u32 rt_offset,
-			   u32 *val,
+			   u32 rt_offset, u32 *p_val,
 			   size_t size)
 {
 	size_t i;
 
 	for (i = 0; i < size / sizeof(u32); i++) {
-		p_hwfn->rt_data[rt_offset + i].init_val = val[i];
-		p_hwfn->rt_data[rt_offset + i].b_valid = true;
+		p_hwfn->rt_data.init_val[rt_offset + i] = p_val[i];
+		p_hwfn->rt_data.b_valid[rt_offset + i] = true;
 	}
 }
 
-static void qed_init_rt(struct qed_hwfn *p_hwfn,
-			struct qed_ptt *p_ptt,
-			u32 addr,
-			u32 rt_offset,
-			u32 size)
+static int qed_init_rt(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt,
+		       u32 addr,
+		       u16 rt_offset,
+		       u16 size,
+		       bool b_must_dmae)
 {
-	struct qed_rt_data *rt_data = p_hwfn->rt_data + rt_offset;
-	u32 i;
+	u32 *p_init_val = &p_hwfn->rt_data.init_val[rt_offset];
+	bool *p_valid = &p_hwfn->rt_data.b_valid[rt_offset];
+	u16 i, segment;
+	int rc = 0;
 
+	/* Since not all RT entries are initialized, go over the RT and
+	 * for each segment of initialized values use DMA.
+	 */
 	for (i = 0; i < size; i++) {
-		if (!rt_data[i].b_valid)
+		if (!p_valid[i])
 			continue;
-		qed_wr(p_hwfn, p_ptt, addr + (i << 2), rt_data[i].init_val);
+
+		/* In case there isn't any wide-bus configuration here,
+		 * simply write the data instead of using dmae.
+		 */
+		if (!b_must_dmae) {
+			qed_wr(p_hwfn, p_ptt, addr + (i << 2),
+			       p_init_val[i]);
+			continue;
+		}
+
+		/* Start of a new segment */
+		for (segment = 1; i + segment < size; segment++)
+			if (!p_valid[i + segment])
+				break;
+
+		rc = qed_dmae_host2grc(p_hwfn, p_ptt,
+				       (uintptr_t)(p_init_val + i),
+				       addr + (i << 2), segment, 0);
+		if (rc != 0)
+			return rc;
+
+		/* Jump over the entire segment, including invalid entry */
+		i += segment;
 	}
+
+	return rc;
 }
 
 int qed_init_alloc(struct qed_hwfn *p_hwfn)
 {
-	struct qed_rt_data *rt_data;
+	struct qed_rt_data *rt_data = &p_hwfn->rt_data;
 
-	rt_data = kzalloc(sizeof(*rt_data) * RUNTIME_ARRAY_SIZE, GFP_ATOMIC);
-	if (!rt_data)
+	rt_data->b_valid = kcalloc(RUNTIME_ARRAY_SIZE, sizeof(bool),
+				   GFP_KERNEL);
+	if (!rt_data->b_valid)
 		return -ENOMEM;
 
-	p_hwfn->rt_data = rt_data;
+	rt_data->init_val = kcalloc(RUNTIME_ARRAY_SIZE, sizeof(u32),
+				    GFP_KERNEL);
+	if (!rt_data->init_val) {
+		/* Don't leave a dangling pointer for qed_init_free() */
+		kfree(rt_data->b_valid);
+		rt_data->b_valid = NULL;
+		return -ENOMEM;
+	}
 
 	return 0;
 }
 
 void qed_init_free(struct qed_hwfn *p_hwfn)
 {
-	kfree(p_hwfn->rt_data);
-	p_hwfn->rt_data = NULL;
+	/* NULL both pointers so a repeated call is harmless */
+	kfree(p_hwfn->rt_data.init_val);
+	p_hwfn->rt_data.init_val = NULL;
+	kfree(p_hwfn->rt_data.b_valid);
+	p_hwfn->rt_data.b_valid = NULL;
 }
 
 static int qed_init_array_dmae(struct qed_hwfn *p_hwfn,
@@ -289,7 +329,8 @@ static int qed_init_cmd_wr(struct qed_hwfn *p_hwfn,
 	case INIT_SRC_RUNTIME:
 		qed_init_rt(p_hwfn, p_ptt, addr,
 			    le16_to_cpu(arg->runtime.offset),
-			    le16_to_cpu(arg->runtime.size));
+			    le16_to_cpu(arg->runtime.size),
+			    b_must_dmae);
 		break;
 	}
 
@@ -316,49 +357,50 @@ static void qed_init_cmd_rd(struct qed_hwfn *p_hwfn,
 			    struct qed_ptt *p_ptt,
 			    struct init_read_op *cmd)
 {
-	u32 data = le32_to_cpu(cmd->op_data);
-	u32 addr = GET_FIELD(data, INIT_READ_OP_ADDRESS) << 2;
+	bool (*comp_check)(u32 val, u32 expected_val);
+	u32 delay = QED_INIT_POLL_PERIOD_US, val;
+	u32 data, addr, poll;
+	int i;
+
+	data = le32_to_cpu(cmd->op_data);
+	addr = GET_FIELD(data, INIT_READ_OP_ADDRESS) << 2;
+	poll = GET_FIELD(data, INIT_READ_OP_POLL_TYPE);
 
-	bool	(*comp_check)(u32	val,
-			      u32	expected_val);
-	u32	delay = QED_INIT_POLL_PERIOD_US, val;
 
 	val = qed_rd(p_hwfn, p_ptt, addr);
 
-	data = le32_to_cpu(cmd->op_data);
-	if (GET_FIELD(data, INIT_READ_OP_POLL)) {
-		int i;
+	if (poll == INIT_POLL_NONE)
+		return;
 
-		switch (GET_FIELD(data, INIT_READ_OP_POLL_COMP)) {
-		case INIT_COMPARISON_EQ:
-			comp_check = comp_eq;
-			break;
-		case INIT_COMPARISON_OR:
-			comp_check = comp_or;
-			break;
-		case INIT_COMPARISON_AND:
-			comp_check = comp_and;
-			break;
-		default:
-			comp_check = NULL;
-			DP_ERR(p_hwfn, "Invalid poll comparison type %08x\n",
-			       data);
-			return;
-		}
+	switch (poll) {
+	case INIT_POLL_EQ:
+		comp_check = comp_eq;
+		break;
+	case INIT_POLL_OR:
+		comp_check = comp_or;
+		break;
+	case INIT_POLL_AND:
+		comp_check = comp_and;
+		break;
+	default:
+		DP_ERR(p_hwfn, "Invalid poll comparison type %08x\n",
+		       data);
+		return;
+	}
 
-		for (i = 0;
-		     i < QED_INIT_MAX_POLL_COUNT &&
-		     !comp_check(val, le32_to_cpu(cmd->expected_val));
-		     i++) {
-			udelay(delay);
-			val = qed_rd(p_hwfn, p_ptt, addr);
-		}
+	data = le32_to_cpu(cmd->expected_val);
+	for (i = 0;
+	     i < QED_INIT_MAX_POLL_COUNT && !comp_check(val, data);
+	     i++) {
+		udelay(delay);
+		val = qed_rd(p_hwfn, p_ptt, addr);
+	}
 
-		if (i == QED_INIT_MAX_POLL_COUNT)
-			DP_ERR(p_hwfn,
-			       "Timeout when polling reg: 0x%08x [ Waiting-for: %08x Got: %08x (comparsion %08x)]\n",
-			       addr, le32_to_cpu(cmd->expected_val),
-			       val, data);
+	if (i == QED_INIT_MAX_POLL_COUNT) {
+		DP_ERR(p_hwfn,
+		       "Timeout when polling reg: 0x%08x [ Waiting-for: %08x Got: %08x (comparsion %08x)]\n",
+		       addr, le32_to_cpu(cmd->expected_val),
+		       val, le32_to_cpu(cmd->op_data));
 	}
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index f72036a2ef5b..978d07a61bbf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -714,7 +714,6 @@ qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
 	p_ramrod->sb_id			= cpu_to_le16(p_params->sb);
 	p_ramrod->sb_index		= p_params->sb_idx;
 	p_ramrod->stats_counter_id	= stats_id;
-	p_ramrod->tc			= p_pq_params->eth.tc;
 
 	p_ramrod->pbl_size		= cpu_to_le16(pbl_size);
 	p_ramrod->pbl_base_addr.hi	= DMA_HI_LE(pbl_addr);
@@ -821,9 +820,8 @@ qed_filter_action(enum qed_filter_opcode opcode)
 	case QED_FILTER_REMOVE:
 		action = ETH_FILTER_ACTION_REMOVE;
 		break;
-	case QED_FILTER_REPLACE:
 	case QED_FILTER_FLUSH:
-		action = ETH_FILTER_ACTION_REPLACE;
+		action = ETH_FILTER_ACTION_REMOVE_ALL;
 		break;
 	default:
 		action = MAX_ETH_FILTER_ACTION;
@@ -892,8 +890,7 @@ qed_filter_ucast_common(struct qed_hwfn *p_hwfn,
 	p_ramrod->filter_cmd_hdr.tx = p_filter_cmd->is_tx_filter ? 1 : 0;
 
 	switch (p_filter_cmd->opcode) {
-	case QED_FILTER_FLUSH:
-		p_ramrod->filter_cmd_hdr.cmd_cnt = 0; break;
+	case QED_FILTER_REPLACE:
 	case QED_FILTER_MOVE:
 		p_ramrod->filter_cmd_hdr.cmd_cnt = 2; break;
 	default:
@@ -962,6 +959,12 @@ qed_filter_ucast_common(struct qed_hwfn *p_hwfn,
 
 		p_second_filter->action		= ETH_FILTER_ACTION_ADD;
 		p_second_filter->vport_id	= vport_to_add_to;
+	} else if (p_filter_cmd->opcode == QED_FILTER_REPLACE) {
+		p_first_filter->vport_id = vport_to_add_to;
+		memcpy(p_second_filter, p_first_filter,
+		       sizeof(*p_second_filter));
+		p_first_filter->action	= ETH_FILTER_ACTION_REMOVE_ALL;
+		p_second_filter->action = ETH_FILTER_ACTION_ADD;
 	} else {
 		action = qed_filter_action(p_filter_cmd->opcode);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 9d76ce249277..593f8871adb6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -190,7 +190,7 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->pci_mem_start = cdev->pci_params.mem_start;
 	dev_info->pci_mem_end = cdev->pci_params.mem_end;
 	dev_info->pci_irq = cdev->pci_params.irq;
-	dev_info->is_mf = IS_MF(&cdev->hwfns[0]);
+	dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]);
 	ether_addr_copy(dev_info->hw_mac, cdev->hwfns[0].hw_info.hw_mac_addr);
 
 	dev_info->fw_major = FW_MAJOR_VERSION;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index ba1b1f1ef789..1457e30faccf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -720,26 +720,25 @@ int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn,
 		return -EINVAL;
 	}
 
-	if (p_hwfn->cdev->mf_mode != SF) {
-		info->bandwidth_min = (shmem_info.config &
-				       FUNC_MF_CFG_MIN_BW_MASK) >>
-				      FUNC_MF_CFG_MIN_BW_SHIFT;
-		if (info->bandwidth_min < 1 || info->bandwidth_min > 100) {
-			DP_INFO(p_hwfn,
-				"bandwidth minimum out of bounds [%02x]. Set to 1\n",
-				info->bandwidth_min);
-			info->bandwidth_min = 1;
-		}
 
-		info->bandwidth_max = (shmem_info.config &
-				       FUNC_MF_CFG_MAX_BW_MASK) >>
-				      FUNC_MF_CFG_MAX_BW_SHIFT;
-		if (info->bandwidth_max < 1 || info->bandwidth_max > 100) {
-			DP_INFO(p_hwfn,
-				"bandwidth maximum out of bounds [%02x]. Set to 100\n",
-				info->bandwidth_max);
-			info->bandwidth_max = 100;
-		}
+	info->bandwidth_min = (shmem_info.config &
+			       FUNC_MF_CFG_MIN_BW_MASK) >>
+			      FUNC_MF_CFG_MIN_BW_SHIFT;
+	if (info->bandwidth_min < 1 || info->bandwidth_min > 100) {
+		DP_INFO(p_hwfn,
+			"bandwidth minimum out of bounds [%02x]. Set to 1\n",
+			info->bandwidth_min);
+		info->bandwidth_min = 1;
+	}
+
+	info->bandwidth_max = (shmem_info.config &
+			       FUNC_MF_CFG_MAX_BW_MASK) >>
+			      FUNC_MF_CFG_MAX_BW_SHIFT;
+	if (info->bandwidth_max < 1 || info->bandwidth_max > 100) {
+		DP_INFO(p_hwfn,
+			"bandwidth maximum out of bounds [%02x]. Set to 100\n",
+			info->bandwidth_max);
+		info->bandwidth_max = 100;
 	}
 
 	if (shmem_info.mac_upper || shmem_info.mac_lower) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 287fadfab52d..8a83609c443c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -343,7 +343,7 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
-		    enum mf_mode mode);
+		    enum qed_mf_mode mode);
 
 /**
  * @brief qed_sp_pf_stop - PF Function Stop Ramrod
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 6f7879136633..33090f63548c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -90,7 +90,7 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 }
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
-		    enum mf_mode mode)
+		    enum qed_mf_mode mode)
 {
 	struct qed_sp_init_request_params params;
 	struct pf_start_ramrod_data *p_ramrod = NULL;
@@ -125,6 +125,18 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->dont_log_ramrods	= 0;
 	p_ramrod->log_type_mask		= cpu_to_le16(0xf);
 	p_ramrod->mf_mode = mode;
+	switch (mode) {
+	case QED_MF_DEFAULT:
+	case QED_MF_NPAR:
+		p_ramrod->mf_mode = MF_NPAR;
+		break;
+	case QED_MF_OVLAN:
+		p_ramrod->mf_mode = MF_OVLAN;
+		break;
+	default:
+		DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n");
+		p_ramrod->mf_mode = MF_NPAR;
+	}
 	p_ramrod->outer_tag = p_hwfn->hw_info.ovlan;
 
 	/* Place EQ address in RAMROD */
@@ -142,9 +154,8 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
 	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
-		   "Setting event_ring_sb [id %04x index %02x], mf [%s] outer_tag [%d]\n",
+		   "Setting event_ring_sb [id %04x index %02x], outer_tag [%d]\n",
 		   sb, sb_index,
-		   (p_ramrod->mf_mode == SF) ? "SF" : "Multi-Pf",
 		   p_ramrod->outer_tag);
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 7c6caf7f6612..f75d9e0676ce 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -173,9 +173,9 @@ enum QEDE_STATE {
  * skb are built only after the frame was DMA-ed.
  */
 struct sw_rx_data {
-	u8 *data;
-
-	DEFINE_DMA_UNMAP_ADDR(mapping);
+	struct page *data;
+	dma_addr_t mapping;
+	unsigned int page_offset;
 };
 
 struct qede_rx_queue {
@@ -188,6 +188,7 @@ struct qede_rx_queue {
 	void __iomem		*hw_rxq_prod_addr;
 
 	int			rx_buf_size;
+	unsigned int		rx_buf_seg_size;
 
 	u16			num_rx_buffers;
 	u16			rxq_id;
@@ -281,6 +282,7 @@ void qede_fill_by_demand_stats(struct qede_dev *edev);
 #define NUM_TX_BDS_MIN		128
 #define NUM_TX_BDS_DEF		NUM_TX_BDS_MAX
 
+#define QEDE_RX_HDR_SIZE	256
 #define	for_each_rss(i) for (i = 0; i < edev->num_rss; i++)
 
 #endif /* _QEDE_H_ */
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index e442b85c9a5e..c49dc10ce151 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -217,9 +217,9 @@ static int qede_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	struct qed_link_params params;
 	u32 speed;
 
-	if (edev->dev_info.common.is_mf) {
+	if (!edev->dev_info.common.is_mf_default) {
 		DP_INFO(edev,
-			"Link parameters can not be changed in MF mode\n");
+			"Link parameters can not be changed in non-default mode\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -428,7 +428,7 @@ static int qede_set_pauseparam(struct net_device *dev,
 	struct qed_link_params params;
 	struct qed_link_output current_link;
 
-	if (!edev->dev_info.common.is_mf) {
+	if (!edev->dev_info.common.is_mf_default) {
 		DP_INFO(edev,
 			"Pause parameters can not be updated in non-default mode\n");
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 6237f10b5119..f50e0bd7fb2c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -330,15 +330,15 @@ static void qede_set_params_for_ipv6_ext(struct sk_buff *skb,
 					 struct eth_tx_3rd_bd *third_bd)
 {
 	u8 l4_proto;
-	u16 bd2_bits = 0, bd2_bits2 = 0;
+	u16 bd2_bits1 = 0, bd2_bits2 = 0;
 
-	bd2_bits2 |= (1 << ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT);
+	bd2_bits1 |= (1 << ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT);
 
-	bd2_bits |= ((((u8 *)skb_transport_header(skb) - skb->data) >> 1) &
+	bd2_bits2 |= ((((u8 *)skb_transport_header(skb) - skb->data) >> 1) &
 		     ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK)
 		    << ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT;
 
-	bd2_bits2 |= (ETH_L4_PSEUDO_CSUM_CORRECT_LENGTH <<
+	bd2_bits1 |= (ETH_L4_PSEUDO_CSUM_CORRECT_LENGTH <<
 		      ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT);
 
 	if (vlan_get_protocol(skb) == htons(ETH_P_IPV6))
@@ -347,16 +347,15 @@ static void qede_set_params_for_ipv6_ext(struct sk_buff *skb,
 		l4_proto = ip_hdr(skb)->protocol;
 
 	if (l4_proto == IPPROTO_UDP)
-		bd2_bits2 |= 1 << ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT;
+		bd2_bits1 |= 1 << ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT;
 
-	if (third_bd) {
+	if (third_bd)
 		third_bd->data.bitfields |=
-			((tcp_hdrlen(skb) / 4) &
-			 ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK) <<
-			ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT;
-	}
+			cpu_to_le16(((tcp_hdrlen(skb) / 4) &
+				ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK) <<
+				ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT);
 
-	second_bd->data.bitfields = cpu_to_le16(bd2_bits);
+	second_bd->data.bitfields1 = cpu_to_le16(bd2_bits1);
 	second_bd->data.bitfields2 = cpu_to_le16(bd2_bits2);
 }
 
@@ -464,12 +463,16 @@ netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 
 	/* Fill the parsing flags & params according to the requested offload */
 	if (xmit_type & XMIT_L4_CSUM) {
+		u16 temp = 1 << ETH_TX_DATA_1ST_BD_TUNN_CFG_OVERRIDE_SHIFT;
+
 		/* We don't re-calculate IP checksum as it is already done by
 		 * the upper stack
 		 */
 		first_bd->data.bd_flags.bitfields |=
 			1 << ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT;
 
+		first_bd->data.bitfields |= cpu_to_le16(temp);
+
 		/* If the packet is IPv6 with extension header, indicate that
 		 * to FW and pass few params, since the device cracker doesn't
 		 * support parsing IPv6 with extension header/s.
@@ -491,7 +494,7 @@ netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 
 		/* @@@TBD - if will not be removed need to check */
 		third_bd->data.bitfields |=
-			(1 << ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT);
+			cpu_to_le16((1 << ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT));
 
 		/* Make life easier for FW guys who can't deal with header and
 		 * data on same BD. If we need to split, use the second bd...
@@ -719,26 +722,52 @@ static bool qede_has_tx_work(struct qede_fastpath *fp)
 	return false;
 }
 
-/* This function copies the Rx buffer from the CONS position to the PROD
- * position, since we failed to allocate a new Rx buffer.
+/* This function reuses the buffer (from an offset), moving it from the
+ * consumer index to the producer index in the BD ring
  */
-static void qede_reuse_rx_data(struct qede_rx_queue *rxq)
+static inline void qede_reuse_page(struct qede_dev *edev,
+				   struct qede_rx_queue *rxq,
+				   struct sw_rx_data *curr_cons)
 {
-	struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring);
 	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
-	struct sw_rx_data *sw_rx_data_cons =
-		&rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX];
-	struct sw_rx_data *sw_rx_data_prod =
-		&rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
+	struct sw_rx_data *curr_prod;
+	dma_addr_t new_mapping;
 
-	dma_unmap_addr_set(sw_rx_data_prod, mapping,
-			   dma_unmap_addr(sw_rx_data_cons, mapping));
+	curr_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
+	*curr_prod = *curr_cons;
 
-	sw_rx_data_prod->data = sw_rx_data_cons->data;
-	memcpy(rx_bd_prod, rx_bd_cons, sizeof(struct eth_rx_bd));
+	new_mapping = curr_prod->mapping + curr_prod->page_offset;
+
+	rx_bd_prod->addr.hi = cpu_to_le32(upper_32_bits(new_mapping));
+	rx_bd_prod->addr.lo = cpu_to_le32(lower_32_bits(new_mapping));
 
-	rxq->sw_rx_cons++;
 	rxq->sw_rx_prod++;
+	curr_cons->data = NULL;
+}
+
+static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
+					 struct qede_rx_queue *rxq,
+					 struct sw_rx_data *curr_cons)
+{
+	/* Move to the next segment in the page */
+	curr_cons->page_offset += rxq->rx_buf_seg_size;
+
+	if (curr_cons->page_offset == PAGE_SIZE) {
+		if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+			return -ENOMEM;
+
+		dma_unmap_page(&edev->pdev->dev, curr_cons->mapping,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+	} else {
+		/* Increment refcount of the page as we don't want
+		 * network stack to take the ownership of the page
+		 * which can be recycled multiple times by the driver.
+		 */
+		atomic_inc(&curr_cons->data->_count);
+		qede_reuse_page(edev, rxq, curr_cons);
+	}
+
+	return 0;
 }
 
 static inline void qede_update_rx_prod(struct qede_dev *edev,
@@ -857,9 +886,10 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 		struct sw_rx_data *sw_rx_data;
 		union eth_rx_cqe *cqe;
 		struct sk_buff *skb;
+		struct page *data;
+		__le16 flags;
 		u16 len, pad;
 		u32 rx_hash;
-		u8 *data;
 
 		/* Get the CQE from the completion ring */
 		cqe = (union eth_rx_cqe *)
@@ -879,56 +909,110 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 		data = sw_rx_data->data;
 
 		fp_cqe = &cqe->fast_path_regular;
-		len =  le16_to_cpu(fp_cqe->pkt_len);
+		len =  le16_to_cpu(fp_cqe->len_on_first_bd);
 		pad = fp_cqe->placement_offset;
+		flags = cqe->fast_path_regular.pars_flags.flags;
 
-		/* For every Rx BD consumed, we allocate a new BD so the BD ring
-		 * is always with a fixed size. If allocation fails, we take the
-		 * consumed BD and return it to the ring in the PROD position.
-		 * The packet that was received on that BD will be dropped (and
-		 * not passed to the upper stack).
-		 */
-		if (likely(qede_alloc_rx_buffer(edev, rxq) == 0)) {
-			dma_unmap_single(&edev->pdev->dev,
-					 dma_unmap_addr(sw_rx_data, mapping),
-					 rxq->rx_buf_size, DMA_FROM_DEVICE);
-
-			/* If this is an error packet then drop it */
-			parse_flag =
-			le16_to_cpu(cqe->fast_path_regular.pars_flags.flags);
-			csum_flag = qede_check_csum(parse_flag);
-			if (csum_flag == QEDE_CSUM_ERROR) {
-				DP_NOTICE(edev,
-					  "CQE in CONS = %u has error, flags = %x, dropping incoming packet\n",
-					  sw_comp_cons, parse_flag);
-				rxq->rx_hw_errors++;
-				kfree(data);
-				goto next_rx;
-			}
-
-			skb = build_skb(data, 0);
-
-			if (unlikely(!skb)) {
-				DP_NOTICE(edev,
-					  "Build_skb failed, dropping incoming packet\n");
-				kfree(data);
-				rxq->rx_alloc_errors++;
-				goto next_rx;
-			}
+		/* If this is an error packet then drop it */
+		parse_flag = le16_to_cpu(flags);
 
-			skb_reserve(skb, pad);
+		csum_flag = qede_check_csum(parse_flag);
+		if (unlikely(csum_flag == QEDE_CSUM_ERROR)) {
+			DP_NOTICE(edev,
+				  "CQE in CONS = %u has error, flags = %x, dropping incoming packet\n",
+				  sw_comp_cons, parse_flag);
+			rxq->rx_hw_errors++;
+			qede_reuse_page(edev, rxq, sw_rx_data);
+			goto next_rx;
+		}
 
-		} else {
+		skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
+		if (unlikely(!skb)) {
 			DP_NOTICE(edev,
-				  "New buffer allocation failed, dropping incoming packet and reusing its buffer\n");
-			qede_reuse_rx_data(rxq);
+				  "Build_skb failed, dropping incoming packet\n");
+			qede_reuse_page(edev, rxq, sw_rx_data);
 			rxq->rx_alloc_errors++;
-			goto next_cqe;
+			goto next_rx;
 		}
 
-		sw_rx_data->data = NULL;
+		/* Copy data into SKB */
+		if (len + pad <= QEDE_RX_HDR_SIZE) {
+			memcpy(skb_put(skb, len),
+			       page_address(data) + pad +
+				sw_rx_data->page_offset, len);
+			qede_reuse_page(edev, rxq, sw_rx_data);
+		} else {
+			struct skb_frag_struct *frag;
+			unsigned int pull_len;
+			unsigned char *va;
 
-		skb_put(skb, len);
+			frag = &skb_shinfo(skb)->frags[0];
+
+			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, data,
+					pad + sw_rx_data->page_offset,
+					len, rxq->rx_buf_seg_size);
+
+			va = skb_frag_address(frag);
+			pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
+
+			/* Align the pull_len to optimize memcpy */
+			memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
+
+			skb_frag_size_sub(frag, pull_len);
+			frag->page_offset += pull_len;
+			skb->data_len -= pull_len;
+			skb->tail += pull_len;
+
+			if (unlikely(qede_realloc_rx_buffer(edev, rxq,
+							    sw_rx_data))) {
+				DP_ERR(edev, "Failed to allocate rx buffer\n");
+				rxq->rx_alloc_errors++;
+				goto next_cqe;
+			}
+		}
+
+		if (fp_cqe->bd_num != 1) {
+			u16 pkt_len = le16_to_cpu(fp_cqe->pkt_len);
+			u8 num_frags;
+
+			pkt_len -= len;
+
+			for (num_frags = fp_cqe->bd_num - 1; num_frags > 0;
+			     num_frags--) {
+				u16 cur_size = pkt_len > rxq->rx_buf_size ?
+						rxq->rx_buf_size : pkt_len;
+
+				WARN_ONCE(!cur_size,
+					  "Still got %d BDs for mapping jumbo, but length became 0\n",
+					  num_frags);
+
+				if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+					goto next_cqe;
+
+				rxq->sw_rx_cons++;
+				sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
+				sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
+				qed_chain_consume(&rxq->rx_bd_ring);
+				dma_unmap_page(&edev->pdev->dev,
+					       sw_rx_data->mapping,
+					       PAGE_SIZE, DMA_FROM_DEVICE);
+
+				skb_fill_page_desc(skb,
+						   skb_shinfo(skb)->nr_frags++,
+						   sw_rx_data->data, 0,
+						   cur_size);
+
+				skb->truesize += PAGE_SIZE;
+				skb->data_len += cur_size;
+				skb->len += cur_size;
+				pkt_len -= cur_size;
+			}
+
+			if (pkt_len)
+				DP_ERR(edev,
+				       "Mapped all BDs of jumbo, but still have %d bytes\n",
+				       pkt_len);
+		}
 
 		skb->protocol = eth_type_trans(skb, edev->ndev);
 
@@ -1566,17 +1650,17 @@ static void qede_free_rx_buffers(struct qede_dev *edev,
 
 	for (i = rxq->sw_rx_cons; i != rxq->sw_rx_prod; i++) {
 		struct sw_rx_data *rx_buf;
-		u8 *data;
+		struct page *data;
 
 		rx_buf = &rxq->sw_rx_ring[i & NUM_RX_BDS_MAX];
 		data = rx_buf->data;
 
-		dma_unmap_single(&edev->pdev->dev,
-				 dma_unmap_addr(rx_buf, mapping),
-				 rxq->rx_buf_size, DMA_FROM_DEVICE);
+		dma_unmap_page(&edev->pdev->dev,
+			       rx_buf->mapping,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
 
 		rx_buf->data = NULL;
-		kfree(data);
+		__free_page(data);
 	}
 }
 
@@ -1600,29 +1684,32 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 	struct sw_rx_data *sw_rx_data;
 	struct eth_rx_bd *rx_bd;
 	dma_addr_t mapping;
+	struct page *data;
 	u16 rx_buf_size;
-	u8 *data;
 
 	rx_buf_size = rxq->rx_buf_size;
 
-	data = kmalloc(rx_buf_size, GFP_ATOMIC);
+	data = alloc_pages(GFP_ATOMIC, 0);
 	if (unlikely(!data)) {
-		DP_NOTICE(edev, "Failed to allocate Rx data\n");
+		DP_NOTICE(edev, "Failed to allocate Rx data [page]\n");
 		return -ENOMEM;
 	}
 
-	mapping = dma_map_single(&edev->pdev->dev, data,
-				 rx_buf_size, DMA_FROM_DEVICE);
+	/* Map the entire page as it would be used
+	 * for multiple RX buffer segment size mapping.
+	 */
+	mapping = dma_map_page(&edev->pdev->dev, data, 0,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) {
-		kfree(data);
+		__free_page(data);
 		DP_NOTICE(edev, "Failed to map Rx buffer\n");
 		return -ENOMEM;
 	}
 
 	sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
+	sw_rx_data->page_offset = 0;
 	sw_rx_data->data = data;
-
-	dma_unmap_addr_set(sw_rx_data, mapping, mapping);
+	sw_rx_data->mapping = mapping;
 
 	/* Advance PROD and get BD pointer */
 	rx_bd = (struct eth_rx_bd *)qed_chain_produce(&rxq->rx_bd_ring);
@@ -1643,13 +1730,16 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev,
 
 	rxq->num_rx_buffers = edev->q_num_rx_buffers;
 
-	rxq->rx_buf_size = NET_IP_ALIGN +
-			   ETH_OVERHEAD +
-			   edev->ndev->mtu +
-			   QEDE_FW_RX_ALIGN_END;
+	rxq->rx_buf_size = NET_IP_ALIGN + ETH_OVERHEAD +
+			   edev->ndev->mtu;
+	if (rxq->rx_buf_size > PAGE_SIZE)
+		rxq->rx_buf_size = PAGE_SIZE;
+
+	/* Segment size to split a page in multiple equal parts */
+	rxq->rx_buf_seg_size = roundup_pow_of_two(rxq->rx_buf_size);
 
 	/* Allocate the parallel driver ring for Rx buffers */
-	size = sizeof(*rxq->sw_rx_ring) * NUM_RX_BDS_MAX;
+	size = sizeof(*rxq->sw_rx_ring) * RX_RING_SIZE;
 	rxq->sw_rx_ring = kzalloc(size, GFP_KERNEL);
 	if (!rxq->sw_rx_ring) {
 		DP_ERR(edev, "Rx buffers ring allocation failed\n");
@@ -1660,7 +1750,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev,
 	rc = edev->ops->common->chain_alloc(edev->cdev,
 					    QED_CHAIN_USE_TO_CONSUME_PRODUCE,
 					    QED_CHAIN_MODE_NEXT_PTR,
-					    NUM_RX_BDS_MAX,
+					    RX_RING_SIZE,
 					    sizeof(struct eth_rx_bd),
 					    &rxq->rx_bd_ring);
 
@@ -1671,7 +1761,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev,
 	rc = edev->ops->common->chain_alloc(edev->cdev,
 					    QED_CHAIN_USE_TO_CONSUME,
 					    QED_CHAIN_MODE_PBL,
-					    NUM_RX_BDS_MAX,
+					    RX_RING_SIZE,
 					    sizeof(union eth_rx_cqe),
 					    &rxq->rx_comp_ring);
 	if (rc)
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 1d1ba2c5ee7a..53ecb37ae563 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -11,9 +11,11 @@
 
 #define CORE_SPQE_PAGE_SIZE_BYTES                       4096
 
+#define X_FINAL_CLEANUP_AGG_INT 1
+
 #define FW_MAJOR_VERSION	8
-#define FW_MINOR_VERSION	4
-#define FW_REVISION_VERSION	2
+#define FW_MINOR_VERSION	7
+#define FW_REVISION_VERSION	3
 #define FW_ENGINEERING_VERSION	0
 
 /***********************/
@@ -152,6 +154,9 @@
 /* number of queues in a PF queue group */
 #define QM_PF_QUEUE_GROUP_SIZE	8
 
+/* the size of a single queue element in bytes */
+#define QM_PQ_ELEMENT_SIZE                      4
+
 /* base number of Tx PQs in the CM PQ representation.
  * should be used when storing PQ IDs in CM PQ registers and context
  */
@@ -285,6 +290,16 @@
 #define PXP_NUM_ILT_RECORDS_K2 11000
 #define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB, PXP_NUM_ILT_RECORDS_K2)
 
+#define SDM_COMP_TYPE_NONE              0
+#define SDM_COMP_TYPE_WAKE_THREAD       1
+#define SDM_COMP_TYPE_AGG_INT           2
+#define SDM_COMP_TYPE_CM                3
+#define SDM_COMP_TYPE_LOADER            4
+#define SDM_COMP_TYPE_PXP               5
+#define SDM_COMP_TYPE_INDICATE_ERROR    6
+#define SDM_COMP_TYPE_RELEASE_THREAD    7
+#define SDM_COMP_TYPE_RAM               8
+
 /******************/
 /* PBF CONSTANTS  */
 /******************/
@@ -335,7 +350,7 @@ struct event_ring_entry {
 
 /* Multi function mode */
 enum mf_mode {
-	SF,
+	ERROR_MODE /* Unsupported mode */,
 	MF_OVLAN,
 	MF_NPAR,
 	MAX_MF_MODE
@@ -606,4 +621,19 @@ struct status_block {
 #define STATUS_BLOCK_ZERO_PAD3_SHIFT  24
 };
 
+struct tunnel_parsing_flags {
+	u8 flags;
+#define TUNNEL_PARSING_FLAGS_TYPE_MASK              0x3
+#define TUNNEL_PARSING_FLAGS_TYPE_SHIFT             0
+#define TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_MASK  0x1
+#define TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_SHIFT 2
+#define TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_MASK     0x3
+#define TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_SHIFT    3
+#define TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_MASK   0x1
+#define TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_SHIFT  5
+#define TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_MASK     0x1
+#define TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_SHIFT    6
+#define TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_MASK      0x1
+#define TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_SHIFT     7
+};
 #endif /* __COMMON_HSI__ */
diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h
index 320b3373ac1d..092cb0c1afcb 100644
--- a/include/linux/qed/eth_common.h
+++ b/include/linux/qed/eth_common.h
@@ -17,10 +17,8 @@
 #define ETH_MAX_RAMROD_PER_CON                          8
 #define ETH_TX_BD_PAGE_SIZE_BYTES                       4096
 #define ETH_RX_BD_PAGE_SIZE_BYTES                       4096
-#define ETH_RX_SGE_PAGE_SIZE_BYTES                      4096
 #define ETH_RX_CQE_PAGE_SIZE_BYTES                      4096
 #define ETH_RX_NUM_NEXT_PAGE_BDS                        2
-#define ETH_RX_NUM_NEXT_PAGE_SGES                       2
 
 #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT                          1
 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET                       18
@@ -34,7 +32,8 @@
 
 #define ETH_NUM_STATISTIC_COUNTERS                      MAX_NUM_VPORTS
 
-#define ETH_REG_CQE_PBL_SIZE                3
+/* Maximum number of buffers, used for RX packet placement */
+#define ETH_RX_MAX_BUFF_PER_PKT             5
 
 /* num of MAC/VLAN filters */
 #define ETH_NUM_MAC_FILTERS                                     512
@@ -54,9 +53,9 @@
 
 /* TPA constants */
 #define ETH_TPA_MAX_AGGS_NUM              64
-#define ETH_TPA_CQE_START_SGL_SIZE        3
-#define ETH_TPA_CQE_CONT_SGL_SIZE         6
-#define ETH_TPA_CQE_END_SGL_SIZE          4
+#define ETH_TPA_CQE_START_LEN_LIST_SIZE   ETH_RX_MAX_BUFF_PER_PKT
+#define ETH_TPA_CQE_CONT_LEN_LIST_SIZE    6
+#define ETH_TPA_CQE_END_LEN_LIST_SIZE     4
 
 /* Queue Zone sizes */
 #define TSTORM_QZONE_SIZE    0
@@ -74,18 +73,18 @@ struct coalescing_timeset {
 
 struct eth_tx_1st_bd_flags {
 	u8 bitfields;
+#define ETH_TX_1ST_BD_FLAGS_START_BD_MASK         0x1
+#define ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT        0
 #define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_MASK  0x1
-#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 0
+#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 1
 #define ETH_TX_1ST_BD_FLAGS_IP_CSUM_MASK          0x1
-#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT         1
+#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT         2
 #define ETH_TX_1ST_BD_FLAGS_L4_CSUM_MASK          0x1
-#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT         2
+#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT         3
 #define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_MASK   0x1
-#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT  3
+#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT  4
 #define ETH_TX_1ST_BD_FLAGS_LSO_MASK              0x1
-#define ETH_TX_1ST_BD_FLAGS_LSO_SHIFT             4
-#define ETH_TX_1ST_BD_FLAGS_START_BD_MASK         0x1
-#define ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT        5
+#define ETH_TX_1ST_BD_FLAGS_LSO_SHIFT             5
 #define ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_MASK     0x1
 #define ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_SHIFT    6
 #define ETH_TX_1ST_BD_FLAGS_TUNN_L4_CSUM_MASK     0x1
@@ -97,38 +96,44 @@ struct eth_tx_data_1st_bd {
 	__le16				vlan;
 	u8				nbds;
 	struct eth_tx_1st_bd_flags	bd_flags;
-	__le16				fw_use_only;
+	__le16				bitfields;
+#define ETH_TX_DATA_1ST_BD_TUNN_CFG_OVERRIDE_MASK  0x1
+#define ETH_TX_DATA_1ST_BD_TUNN_CFG_OVERRIDE_SHIFT 0
+#define ETH_TX_DATA_1ST_BD_RESERVED0_MASK          0x1
+#define ETH_TX_DATA_1ST_BD_RESERVED0_SHIFT         1
+#define ETH_TX_DATA_1ST_BD_FW_USE_ONLY_MASK        0x3FFF
+#define ETH_TX_DATA_1ST_BD_FW_USE_ONLY_SHIFT       2
 };
 
 /* The parsing information data for the second tx bd of a given packet. */
 struct eth_tx_data_2nd_bd {
 	__le16	tunn_ip_size;
-	__le16	bitfields;
-#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK     0x1FFF
-#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT    0
-#define ETH_TX_DATA_2ND_BD_RESERVED0_MASK                 0x7
-#define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT                13
-	__le16	bitfields2;
+	__le16	bitfields1;
 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_MASK  0xF
 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_SHIFT 0
 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_MASK       0x3
 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_SHIFT      4
 #define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_MASK            0x3
 #define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_SHIFT           6
+#define ETH_TX_DATA_2ND_BD_START_BD_MASK                  0x1
+#define ETH_TX_DATA_2ND_BD_START_BD_SHIFT                 8
 #define ETH_TX_DATA_2ND_BD_TUNN_TYPE_MASK                 0x3
-#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_SHIFT                8
+#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_SHIFT                9
 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_MASK           0x1
-#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_SHIFT          10
+#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_SHIFT          11
 #define ETH_TX_DATA_2ND_BD_IPV6_EXT_MASK                  0x1
-#define ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT                 11
+#define ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT                 12
 #define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_MASK             0x1
-#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_SHIFT            12
+#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_SHIFT            13
 #define ETH_TX_DATA_2ND_BD_L4_UDP_MASK                    0x1
-#define ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT                   13
+#define ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT                   14
 #define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_MASK       0x1
-#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT      14
-#define ETH_TX_DATA_2ND_BD_RESERVED1_MASK                 0x1
-#define ETH_TX_DATA_2ND_BD_RESERVED1_SHIFT                15
+#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT      15
+	__le16 bitfields2;
+#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK     0x1FFF
+#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT    0
+#define ETH_TX_DATA_2ND_BD_RESERVED0_MASK                 0x7
+#define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT                13
 };
 
 /* Regular ETH Rx FP CQE. */
@@ -145,11 +150,68 @@ struct eth_fast_path_rx_reg_cqe {
 	struct parsing_and_err_flags	pars_flags;
 	__le16				vlan_tag;
 	__le32				rss_hash;
-	__le16				len_on_bd;
+	__le16				len_on_first_bd;
 	u8				placement_offset;
-	u8				reserved;
-	__le16				pbl[ETH_REG_CQE_PBL_SIZE];
-	u8				reserved1[10];
+	struct tunnel_parsing_flags	tunnel_pars_flags;
+	u8				bd_num;
+	u8				reserved[7];
+	u32				fw_debug;
+	u8				reserved1[3];
+	u8				flags;
+#define ETH_FAST_PATH_RX_REG_CQE_VALID_MASK          0x1
+#define ETH_FAST_PATH_RX_REG_CQE_VALID_SHIFT         0
+#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_MASK   0x1
+#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_SHIFT  1
+#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_MASK      0x3F
+#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_SHIFT     2
+};
+
+/* TPA-continue ETH Rx FP CQE. */
+struct eth_fast_path_rx_tpa_cont_cqe {
+	u8	type;
+	u8	tpa_agg_index;
+	__le16	len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE];
+	u8	reserved[5];
+	u8	reserved1;
+	__le16	reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE];
+};
+
+/* TPA-end ETH Rx FP CQE. */
+struct eth_fast_path_rx_tpa_end_cqe {
+	u8	type;
+	u8	tpa_agg_index;
+	__le16	total_packet_len;
+	u8	num_of_bds;
+	u8	end_reason;
+	__le16	num_of_coalesced_segs;
+	__le32	ts_delta;
+	__le16	len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE];
+	u8	reserved1[3];
+	u8	reserved2;
+	__le16	reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE];
+};
+
+/* TPA-start ETH Rx FP CQE. */
+struct eth_fast_path_rx_tpa_start_cqe {
+	u8	type;
+	u8	bitfields;
+#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_MASK  0x7
+#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_SHIFT 0
+#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_MASK             0xF
+#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_SHIFT            3
+#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_MASK      0x1
+#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_SHIFT     7
+	__le16	seg_len;
+	struct parsing_and_err_flags pars_flags;
+	__le16	vlan_tag;
+	__le32	rss_hash;
+	__le16	len_on_first_bd;
+	u8	placement_offset;
+	struct tunnel_parsing_flags tunnel_pars_flags;
+	u8	tpa_agg_index;
+	u8	header_len;
+	__le16	ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE];
+	u32	fw_debug;
 };
 
 /* The L4 pseudo checksum mode for Ethernet */
@@ -168,13 +230,26 @@ struct eth_slow_path_rx_cqe {
 	u8	type;
 	u8	ramrod_cmd_id;
 	u8	error_flag;
-	u8	reserved[27];
+	u8	reserved[25];
 	__le16	echo;
+	u8	reserved1;
+	u8	flags;
+/* for PMD mode - valid indication */
+#define ETH_SLOW_PATH_RX_CQE_VALID_MASK         0x1
+#define ETH_SLOW_PATH_RX_CQE_VALID_SHIFT        0
+/* for PMD mode - valid toggle indication */
+#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_MASK  0x1
+#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_SHIFT 1
+#define ETH_SLOW_PATH_RX_CQE_RESERVED2_MASK     0x3F
+#define ETH_SLOW_PATH_RX_CQE_RESERVED2_SHIFT    2
 };
 
 /* union for all ETH Rx CQE types */
 union eth_rx_cqe {
 	struct eth_fast_path_rx_reg_cqe		fast_path_regular;
+	struct eth_fast_path_rx_tpa_start_cqe	fast_path_tpa_start;
+	struct eth_fast_path_rx_tpa_cont_cqe	fast_path_tpa_cont;
+	struct eth_fast_path_rx_tpa_end_cqe	fast_path_tpa_end;
 	struct eth_slow_path_rx_cqe		slow_path;
 };
 
@@ -183,15 +258,18 @@ enum eth_rx_cqe_type {
 	ETH_RX_CQE_TYPE_UNUSED,
 	ETH_RX_CQE_TYPE_REGULAR,
 	ETH_RX_CQE_TYPE_SLOW_PATH,
+	ETH_RX_CQE_TYPE_TPA_START,
+	ETH_RX_CQE_TYPE_TPA_CONT,
+	ETH_RX_CQE_TYPE_TPA_END,
 	MAX_ETH_RX_CQE_TYPE
 };
 
 /* ETH Rx producers data */
 struct eth_rx_prod_data {
 	__le16	bd_prod;
-	__le16	sge_prod;
 	__le16	cqe_prod;
 	__le16	reserved;
+	__le16	reserved1;
 };
 
 /* The first tx bd of a given packet */
@@ -211,12 +289,17 @@ struct eth_tx_2nd_bd {
 /* The parsing information data for the third tx bd of a given packet. */
 struct eth_tx_data_3rd_bd {
 	__le16	lso_mss;
-	u8	bitfields;
+	__le16	bitfields;
 #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK  0xF
 #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT 0
 #define ETH_TX_DATA_3RD_BD_HDR_NBD_MASK         0xF
 #define ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT        4
-	u8	resereved0[3];
+#define ETH_TX_DATA_3RD_BD_START_BD_MASK        0x1
+#define ETH_TX_DATA_3RD_BD_START_BD_SHIFT       8
+#define ETH_TX_DATA_3RD_BD_RESERVED0_MASK       0x7F
+#define ETH_TX_DATA_3RD_BD_RESERVED0_SHIFT      9
+	u8	tunn_l4_hdr_start_offset_w;
+	u8	tunn_hdr_size_w;
 };
 
 /* The third tx bd of a given packet */
@@ -226,12 +309,24 @@ struct eth_tx_3rd_bd {
 	struct eth_tx_data_3rd_bd	data;
 };
 
+/* Complementary information for the regular tx bd of a given packet. */
+struct eth_tx_data_bd {
+	__le16	reserved0;
+	__le16	bitfields;
+#define ETH_TX_DATA_BD_RESERVED1_MASK  0xFF
+#define ETH_TX_DATA_BD_RESERVED1_SHIFT 0
+#define ETH_TX_DATA_BD_START_BD_MASK   0x1
+#define ETH_TX_DATA_BD_START_BD_SHIFT  8
+#define ETH_TX_DATA_BD_RESERVED2_MASK  0x7F
+#define ETH_TX_DATA_BD_RESERVED2_SHIFT 9
+	__le16 reserved3;
+};
+
 /* The common non-special TX BD ring element */
 struct eth_tx_bd {
 	struct regpair	addr;
 	__le16		nbytes;
-	__le16		reserved0;
-	__le32		reserved1;
+	struct eth_tx_data_bd	data;
 };
 
 union eth_tx_bd_types {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index d4a32e878180..3d43c1d4ecef 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -80,7 +80,7 @@ struct qed_dev_info {
 	u8		num_hwfns;
 
 	u8		hw_mac[ETH_ALEN];
-	bool		is_mf;
+	bool		is_mf_default;
 
 	/* FW version */
 	u16		fw_major;
@@ -360,6 +360,12 @@ enum DP_MODULE {
 	/* to be added...up to 0x8000000 */
 };
 
+enum qed_mf_mode {
+	QED_MF_DEFAULT,
+	QED_MF_OVLAN,
+	QED_MF_NPAR,
+};
+
 struct qed_eth_stats {
 	u64	no_buff_discards;
 	u64	packet_too_big_discard;
-- 
cgit v1.2.3


From 7bbf3cae65b6e438bf52033b63fdce4a86e89e17 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 15 Feb 2016 21:25:57 +0000
Subject: ipv4: Remove inet_lro library

There are no longer any in-tree drivers that use it.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_lro.h | 142 ------------------
 net/ipv4/Kconfig         |   8 -
 net/ipv4/Makefile        |   1 -
 net/ipv4/inet_lro.c      | 374 -----------------------------------------------
 4 files changed, 525 deletions(-)
 delete mode 100644 include/linux/inet_lro.h
 delete mode 100644 net/ipv4/inet_lro.c

(limited to 'include')

diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
deleted file mode 100644
index 9a715cfa1fe3..000000000000
--- a/include/linux/inet_lro.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- *  linux/include/linux/inet_lro.h
- *
- *  Large Receive Offload (ipv4 / tcp)
- *
- *  (C) Copyright IBM Corp. 2007
- *
- *  Authors:
- *       Jan-Bernd Themann <themann@de.ibm.com>
- *       Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __INET_LRO_H_
-#define __INET_LRO_H_
-
-#include <net/ip.h>
-#include <net/tcp.h>
-
-/*
- * LRO statistics
- */
-
-struct net_lro_stats {
-	unsigned long aggregated;
-	unsigned long flushed;
-	unsigned long no_desc;
-};
-
-/*
- * LRO descriptor for a tcp session
- */
-struct net_lro_desc {
-	struct sk_buff *parent;
-	struct sk_buff *last_skb;
-	struct skb_frag_struct *next_frag;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	__wsum  data_csum;
-	__be32 tcp_rcv_tsecr;
-	__be32 tcp_rcv_tsval;
-	__be32 tcp_ack;
-	u32 tcp_next_seq;
-	u32 skb_tot_frags_len;
-	u16 ip_tot_len;
-	u16 tcp_saw_tstamp; 		/* timestamps enabled */
-	__be16 tcp_window;
-	int pkt_aggr_cnt;		/* counts aggregated packets */
-	int vlan_packet;
-	int mss;
-	int active;
-};
-
-/*
- * Large Receive Offload (LRO) Manager
- *
- * Fields must be set by driver
- */
-
-struct net_lro_mgr {
-	struct net_device *dev;
-	struct net_lro_stats stats;
-
-	/* LRO features */
-	unsigned long features;
-#define LRO_F_NAPI            1  /* Pass packets to stack via NAPI */
-#define LRO_F_EXTRACT_VLAN_ID 2  /* Set flag if VLAN IDs are extracted
-				    from received packets and eth protocol
-				    is still ETH_P_8021Q */
-
-	/*
-	 * Set for generated SKBs that are not added to
-	 * the frag list in fragmented mode
-	 */
-	u32 ip_summed;
-	u32 ip_summed_aggr; /* Set in aggregated SKBs: CHECKSUM_UNNECESSARY
-			     * or CHECKSUM_NONE */
-
-	int max_desc; /* Max number of LRO descriptors  */
-	int max_aggr; /* Max number of LRO packets to be aggregated */
-
-	int frag_align_pad; /* Padding required to properly align layer 3
-			     * headers in generated skb when using frags */
-
-	struct net_lro_desc *lro_arr; /* Array of LRO descriptors */
-
-	/*
-	 * Optimized driver functions
-	 *
-	 * get_skb_header: returns tcp and ip header for packet in SKB
-	 */
-	int (*get_skb_header)(struct sk_buff *skb, void **ip_hdr,
-			      void **tcpudp_hdr, u64 *hdr_flags, void *priv);
-
-	/* hdr_flags: */
-#define LRO_IPV4 1 /* ip_hdr is IPv4 header */
-#define LRO_TCP  2 /* tcpudp_hdr is TCP header */
-
-	/*
-	 * get_frag_header: returns mac, tcp and ip header for packet in SKB
-	 *
-	 * @hdr_flags: Indicate what kind of LRO has to be done
-	 *             (IPv4/IPv6/TCP/UDP)
-	 */
-	int (*get_frag_header)(struct skb_frag_struct *frag, void **mac_hdr,
-			       void **ip_hdr, void **tcpudp_hdr, u64 *hdr_flags,
-			       void *priv);
-};
-
-/*
- * Processes a SKB
- *
- * @lro_mgr: LRO manager to use
- * @skb: SKB to aggregate
- * @priv: Private data that may be used by driver functions
- *        (for example get_tcp_ip_hdr)
- */
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
-		     struct sk_buff *skb,
-		     void *priv);
-/*
- * Forward all aggregated SKBs held by lro_mgr to network stack
- */
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr);
-
-#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 395d82754626..238225b0c970 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -406,14 +406,6 @@ config INET_XFRM_MODE_BEET
 
 	  If unsure, say Y.
 
-config INET_LRO
-	tristate "Large Receive Offload (ipv4/tcp)"
-	default y
-	---help---
-	  Support for Large Receive Offload (ipv4/tcp).
-
-	  If unsure, say Y.
-
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 62c049b647e9..bfa133691cde 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
-obj-$(CONFIG_INET_LRO) += inet_lro.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
 obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
deleted file mode 100644
index f17ea49b28fb..000000000000
--- a/net/ipv4/inet_lro.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- *  linux/net/ipv4/inet_lro.c
- *
- *  Large Receive Offload (ipv4 / tcp)
- *
- *  (C) Copyright IBM Corp. 2007
- *
- *  Authors:
- *       Jan-Bernd Themann <themann@de.ibm.com>
- *       Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include <linux/module.h>
-#include <linux/if_vlan.h>
-#include <linux/inet_lro.h>
-#include <net/checksum.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
-MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
-
-#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
-#define IP_HDR_LEN(iph) (iph->ihl << 2)
-#define TCP_PAYLOAD_LENGTH(iph, tcph) \
-	(ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
-
-#define IPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_W_TIMESTAMP 8
-
-#define LRO_MAX_PG_HLEN 64
-
-#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
-
-/*
- * Basic tcp checks whether packet is suitable for LRO
- */
-
-static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
-			    int len, const struct net_lro_desc *lro_desc)
-{
-        /* check ip header: don't aggregate padded frames */
-	if (ntohs(iph->tot_len) != len)
-		return -1;
-
-	if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
-		return -1;
-
-	if (iph->ihl != IPH_LEN_WO_OPTIONS)
-		return -1;
-
-	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
-	    tcph->rst || tcph->syn || tcph->fin)
-		return -1;
-
-	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
-		return -1;
-
-	if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
-	    tcph->doff != TCPH_LEN_W_TIMESTAMP)
-		return -1;
-
-	/* check tcp options (only timestamp allowed) */
-	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
-		__be32 *topt = (__be32 *)(tcph + 1);
-
-		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
-				   | (TCPOPT_TIMESTAMP << 8)
-				   | TCPOLEN_TIMESTAMP))
-			return -1;
-
-		/* timestamp should be in right order */
-		topt++;
-		if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
-				      ntohl(*topt)))
-			return -1;
-
-		/* timestamp reply should not be zero */
-		topt++;
-		if (*topt == 0)
-			return -1;
-	}
-
-	return 0;
-}
-
-static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
-{
-	struct iphdr *iph = lro_desc->iph;
-	struct tcphdr *tcph = lro_desc->tcph;
-	__be32 *p;
-	__wsum tcp_hdr_csum;
-
-	tcph->ack_seq = lro_desc->tcp_ack;
-	tcph->window = lro_desc->tcp_window;
-
-	if (lro_desc->tcp_saw_tstamp) {
-		p = (__be32 *)(tcph + 1);
-		*(p+2) = lro_desc->tcp_rcv_tsecr;
-	}
-
-	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
-	iph->tot_len = htons(lro_desc->ip_tot_len);
-
-	tcph->check = 0;
-	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
-	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
-	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-					lro_desc->ip_tot_len -
-					IP_HDR_LEN(iph), IPPROTO_TCP,
-					lro_desc->data_csum);
-}
-
-static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
-{
-	__wsum tcp_csum;
-	__wsum tcp_hdr_csum;
-	__wsum tcp_ps_hdr_csum;
-
-	tcp_csum = ~csum_unfold(tcph->check);
-	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
-
-	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
-					     len + TCP_HDR_LEN(tcph),
-					     IPPROTO_TCP, 0);
-
-	return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
-			tcp_ps_hdr_csum);
-}
-
-static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
-			  struct iphdr *iph, struct tcphdr *tcph)
-{
-	int nr_frags;
-	__be32 *ptr;
-	u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
-	nr_frags = skb_shinfo(skb)->nr_frags;
-	lro_desc->parent = skb;
-	lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
-	lro_desc->iph = iph;
-	lro_desc->tcph = tcph;
-	lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
-	lro_desc->tcp_ack = tcph->ack_seq;
-	lro_desc->tcp_window = tcph->window;
-
-	lro_desc->pkt_aggr_cnt = 1;
-	lro_desc->ip_tot_len = ntohs(iph->tot_len);
-
-	if (tcph->doff == 8) {
-		ptr = (__be32 *)(tcph+1);
-		lro_desc->tcp_saw_tstamp = 1;
-		lro_desc->tcp_rcv_tsval = *(ptr+1);
-		lro_desc->tcp_rcv_tsecr = *(ptr+2);
-	}
-
-	lro_desc->mss = tcp_data_len;
-	lro_desc->active = 1;
-
-	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
-						tcp_data_len);
-}
-
-static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
-{
-	memset(lro_desc, 0, sizeof(struct net_lro_desc));
-}
-
-static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
-			   struct tcphdr *tcph, int tcp_data_len)
-{
-	struct sk_buff *parent = lro_desc->parent;
-	__be32 *topt;
-
-	lro_desc->pkt_aggr_cnt++;
-	lro_desc->ip_tot_len += tcp_data_len;
-	lro_desc->tcp_next_seq += tcp_data_len;
-	lro_desc->tcp_window = tcph->window;
-	lro_desc->tcp_ack = tcph->ack_seq;
-
-	/* don't update tcp_rcv_tsval, would not work with PAWS */
-	if (lro_desc->tcp_saw_tstamp) {
-		topt = (__be32 *) (tcph + 1);
-		lro_desc->tcp_rcv_tsecr = *(topt + 2);
-	}
-
-	lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
-					     lro_tcp_data_csum(iph, tcph,
-							       tcp_data_len),
-					     parent->len);
-
-	parent->len += tcp_data_len;
-	parent->data_len += tcp_data_len;
-	if (tcp_data_len > lro_desc->mss)
-		lro_desc->mss = tcp_data_len;
-}
-
-static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
-			   struct iphdr *iph, struct tcphdr *tcph)
-{
-	struct sk_buff *parent = lro_desc->parent;
-	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
-	lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
-	skb_pull(skb, (skb->len - tcp_data_len));
-	parent->truesize += skb->truesize;
-
-	if (lro_desc->last_skb)
-		lro_desc->last_skb->next = skb;
-	else
-		skb_shinfo(parent)->frag_list = skb;
-
-	lro_desc->last_skb = skb;
-}
-
-
-static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
-			      struct iphdr *iph,
-			      struct tcphdr *tcph)
-{
-	if ((lro_desc->iph->saddr != iph->saddr) ||
-	    (lro_desc->iph->daddr != iph->daddr) ||
-	    (lro_desc->tcph->source != tcph->source) ||
-	    (lro_desc->tcph->dest != tcph->dest))
-		return -1;
-	return 0;
-}
-
-static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
-					 struct net_lro_desc *lro_arr,
-					 struct iphdr *iph,
-					 struct tcphdr *tcph)
-{
-	struct net_lro_desc *lro_desc = NULL;
-	struct net_lro_desc *tmp;
-	int max_desc = lro_mgr->max_desc;
-	int i;
-
-	for (i = 0; i < max_desc; i++) {
-		tmp = &lro_arr[i];
-		if (tmp->active)
-			if (!lro_check_tcp_conn(tmp, iph, tcph)) {
-				lro_desc = tmp;
-				goto out;
-			}
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		if (!lro_arr[i].active) {
-			lro_desc = &lro_arr[i];
-			goto out;
-		}
-	}
-
-	LRO_INC_STATS(lro_mgr, no_desc);
-out:
-	return lro_desc;
-}
-
-static void lro_flush(struct net_lro_mgr *lro_mgr,
-		      struct net_lro_desc *lro_desc)
-{
-	if (lro_desc->pkt_aggr_cnt > 1)
-		lro_update_tcp_ip_header(lro_desc);
-
-	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
-
-	if (lro_mgr->features & LRO_F_NAPI)
-		netif_receive_skb(lro_desc->parent);
-	else
-		netif_rx(lro_desc->parent);
-
-	LRO_INC_STATS(lro_mgr, flushed);
-	lro_clear_desc(lro_desc);
-}
-
-static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
-			  void *priv)
-{
-	struct net_lro_desc *lro_desc;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	u64 flags;
-	int vlan_hdr_len = 0;
-
-	if (!lro_mgr->get_skb_header ||
-	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
-				    &flags, priv))
-		goto out;
-
-	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
-		goto out;
-
-	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
-	if (!lro_desc)
-		goto out;
-
-	if ((skb->protocol == htons(ETH_P_8021Q)) &&
-	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
-		vlan_hdr_len = VLAN_HLEN;
-
-	if (!lro_desc->active) { /* start new lro session */
-		if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
-			goto out;
-
-		skb->ip_summed = lro_mgr->ip_summed_aggr;
-		lro_init_desc(lro_desc, skb, iph, tcph);
-		LRO_INC_STATS(lro_mgr, aggregated);
-		return 0;
-	}
-
-	if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
-		goto out2;
-
-	if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
-		goto out2;
-
-	lro_add_packet(lro_desc, skb, iph, tcph);
-	LRO_INC_STATS(lro_mgr, aggregated);
-
-	if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
-	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
-		lro_flush(lro_mgr, lro_desc);
-
-	return 0;
-
-out2: /* send aggregated SKBs to stack */
-	lro_flush(lro_mgr, lro_desc);
-
-out:
-	return 1;
-}
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
-		     struct sk_buff *skb,
-		     void *priv)
-{
-	if (__lro_proc_skb(lro_mgr, skb, priv)) {
-		if (lro_mgr->features & LRO_F_NAPI)
-			netif_receive_skb(skb);
-		else
-			netif_rx(skb);
-	}
-}
-EXPORT_SYMBOL(lro_receive_skb);
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr)
-{
-	int i;
-	struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
-
-	for (i = 0; i < lro_mgr->max_desc; i++) {
-		if (lro_desc[i].active)
-			lro_flush(lro_mgr, &lro_desc[i]);
-	}
-}
-EXPORT_SYMBOL(lro_flush_all);
-- 
cgit v1.2.3


From e014860e31e2a66b1a94088504360a6ebc023564 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 17 Feb 2016 14:59:30 -0800
Subject: net: pack tc_cls_u32_knode struct slightly better

By packing the structure we can remove a few holes as Jamal
suggests.

before:

struct tc_cls_u32_knode {
	struct tcf_exts *          exts;                 /*     0     8 */
	u8                         fshift;               /*     8     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        handle;               /*    12     4 */
	u32                        val;                  /*    16     4 */
	u32                        mask;                 /*    20     4 */
	u32                        link_handle;          /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct tc_u32_sel *        sel;                  /*    32     8 */

	/* size: 40, cachelines: 1, members: 7 */
	/* sum members: 33, holes: 2, sum holes: 7 */
	/* last cacheline: 40 bytes */
};

after:

struct tc_cls_u32_knode {
	struct tcf_exts *          exts;                 /*     0     8 */
	struct tc_u32_sel *        sel;                  /*     8     8 */
	u32                        handle;               /*    16     4 */
	u32                        val;                  /*    20     4 */
	u32                        mask;                 /*    24     4 */
	u32                        link_handle;          /*    28     4 */
	u8                         fshift;               /*    32     1 */

	/* size: 40, cachelines: 1, members: 7 */
	/* padding: 7 */
	/* last cacheline: 40 bytes */
};

Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 59789ca6e2c8..2121df574262 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -360,12 +360,12 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 
 struct tc_cls_u32_knode {
 	struct tcf_exts *exts;
-	u8 fshift;
+	struct tc_u32_sel *sel;
 	u32 handle;
 	u32 val;
 	u32 mask;
 	u32 link_handle;
-	struct tc_u32_sel *sel;
+	u8 fshift;
 };
 
 struct tc_cls_u32_hnode {
-- 
cgit v1.2.3


From d4ac05ff3697e036dcb0e2e284c5f7eb77cc0966 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 16 Feb 2016 21:58:57 +0100
Subject: vxlan: introduce vxlan_hdr

Currently, a pointer to the vxlan header is kept in a local variable. It has
to be reloaded whenever a pskb pull operation is performed, which usually
happens somewhere deep in called functions.

Create a vxlan_hdr function and use it to reference the vxlan header
instead.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 17 +++++++----------
 include/net/vxlan.h |  5 +++++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ee1206d9f8df..524e3b139122 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1257,7 +1257,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct metadata_dst *tun_dst = NULL;
 	struct vxlan_sock *vs;
-	struct vxlanhdr *vxh;
 	u32 flags, vni;
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
@@ -1266,9 +1265,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
 		goto error;
 
-	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
-	flags = ntohl(vxh->vx_flags);
-	vni = ntohl(vxh->vx_vni);
+	flags = ntohl(vxlan_hdr(skb)->vx_flags);
+	vni = ntohl(vxlan_hdr(skb)->vx_vni);
 
 	if (flags & VXLAN_HF_VNI) {
 		flags &= ~VXLAN_HF_VNI;
@@ -1279,16 +1277,14 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
 		goto drop;
-	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
 
 	vs = rcu_dereference_sk_user_data(sk);
 	if (!vs)
 		goto drop;
 
 	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
-		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
-				    !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL));
-		if (!vxh)
+		if (!vxlan_remcsum(skb, vxlan_hdr(skb), sizeof(struct vxlanhdr), vni,
+				   !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)))
 			goto drop;
 
 		flags &= ~VXLAN_HF_RCO;
@@ -1313,7 +1309,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
 		struct vxlanhdr_gbp *gbp;
 
-		gbp = (struct vxlanhdr_gbp *)vxh;
+		gbp = (struct vxlanhdr_gbp *)vxlan_hdr(skb);
 		md->gbp = ntohs(gbp->policy_id);
 
 		if (tun_dst)
@@ -1351,7 +1347,8 @@ drop:
 
 bad_flags:
 	netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
-		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
+		   ntohl(vxlan_hdr(skb)->vx_flags),
+		   ntohl(vxlan_hdr(skb)->vx_vni));
 
 error:
 	if (tun_dst)
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b314e4af89c5..3f38b40ec4aa 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -262,6 +262,11 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
 /* IPv6 header + UDP + VXLAN + Ethernet header */
 #define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
 
+static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb)
+{
+	return (struct vxlanhdr *)(udp_hdr(skb) + 1);
+}
+
 #if IS_ENABLED(CONFIG_VXLAN)
 void vxlan_get_rx_port(struct net_device *netdev);
 #else
-- 
cgit v1.2.3


From 54bfd872bf16d40b61bd0cd9b769b2fef67dd272 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 16 Feb 2016 21:58:58 +0100
Subject: vxlan: keep flags and vni in network byte order

Prevent repeated conversions from and to network order in the fast path.

To achieve this, define all flag constants in big endian order and store VNI
as __be32. To prevent confusion between the actual VNI value and the VNI
field from the header (which contains an additional reserved byte), strictly
distinguish between "vni" and "vni_field".

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 115 ++++++++++++++++++++++++++--------------------------
 include/net/vxlan.h |  70 +++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 524e3b139122..4e3d3dfe2a0e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -197,9 +197,9 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 #endif
 
 /* Virtual Network hash table head */
-static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
+static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
 {
-	return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
+	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
 }
 
 /* Socket hash table head */
@@ -242,12 +242,12 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
 	return NULL;
 }
 
-static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
+static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni)
 {
 	struct vxlan_dev *vxlan;
 
-	hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
-		if (vxlan->default_dst.remote_vni == id)
+	hlist_for_each_entry_rcu(vxlan, vni_head(vs, vni), hlist) {
+		if (vxlan->default_dst.remote_vni == vni)
 			return vxlan;
 	}
 
@@ -255,7 +255,7 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
 }
 
 /* Look up VNI in a per net namespace table */
-static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
+static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni,
 					sa_family_t family, __be16 port,
 					u32 flags)
 {
@@ -265,7 +265,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
 	if (!vs)
 		return NULL;
 
-	return vxlan_vs_find_vni(vs, id);
+	return vxlan_vs_find_vni(vs, vni);
 }
 
 /* Fill in neighbour message in skbuff. */
@@ -315,7 +315,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
 		goto nla_put_failure;
 	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
-	    nla_put_u32(skb, NDA_VNI, rdst->remote_vni))
+	    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
 		goto nla_put_failure;
 	if (rdst->remote_ifindex &&
 	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
@@ -383,7 +383,7 @@ static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
 	};
 	struct vxlan_rdst remote = {
 		.remote_ip = *ipa, /* goes to NDA_DST */
-		.remote_vni = VXLAN_N_VID,
+		.remote_vni = cpu_to_be32(VXLAN_N_VID),
 	};
 
 	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
@@ -452,7 +452,7 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 /* caller should hold vxlan->hash_lock */
 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
 					      union vxlan_addr *ip, __be16 port,
-					      __u32 vni, __u32 ifindex)
+					      __be32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -469,7 +469,8 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
 
 /* Replace destination of unicast mac */
 static int vxlan_fdb_replace(struct vxlan_fdb *f,
-			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
+			     union vxlan_addr *ip, __be16 port, __be32 vni,
+			     __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -491,7 +492,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 
 /* Add/update destinations for multicast */
 static int vxlan_fdb_append(struct vxlan_fdb *f,
-			    union vxlan_addr *ip, __be16 port, __u32 vni,
+			    union vxlan_addr *ip, __be16 port, __be32 vni,
 			    __u32 ifindex, struct vxlan_rdst **rdp)
 {
 	struct vxlan_rdst *rd;
@@ -523,7 +524,8 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 					  unsigned int off,
 					  struct vxlanhdr *vh, size_t hdrlen,
-					  u32 data, struct gro_remcsum *grc,
+					  __be32 vni_field,
+					  struct gro_remcsum *grc,
 					  bool nopartial)
 {
 	size_t start, offset;
@@ -534,10 +536,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
 
-	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
-	offset = start + ((data & VXLAN_RCO_UDP) ?
-			  offsetof(struct udphdr, check) :
-			  offsetof(struct tcphdr, check));
+	start = vxlan_rco_start(vni_field);
+	offset = start + vxlan_rco_offset(vni_field);
 
 	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
 				     start, offset, grc, nopartial);
@@ -557,7 +557,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 	int flush = 1;
 	struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
 					     udp_offloads);
-	u32 flags;
+	__be32 flags;
 	struct gro_remcsum grc;
 
 	skb_gro_remcsum_init(&grc);
@@ -573,11 +573,11 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 
 	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
 
-	flags = ntohl(vh->vx_flags);
+	flags = vh->vx_flags;
 
 	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
 		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
-				       ntohl(vh->vx_vni), &grc,
+				       vh->vx_vni, &grc,
 				       !!(vs->flags &
 					  VXLAN_F_REMCSUM_NOPARTIAL));
 
@@ -668,7 +668,7 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 			    const u8 *mac, union vxlan_addr *ip,
 			    __u16 state, __u16 flags,
-			    __be16 port, __u32 vni, __u32 ifindex,
+			    __be16 port, __be32 vni, __u32 ifindex,
 			    __u8 ndm_flags)
 {
 	struct vxlan_rdst *rd = NULL;
@@ -777,7 +777,8 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 }
 
 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
-			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
+			   union vxlan_addr *ip, __be16 *port, __be32 *vni,
+			   u32 *ifindex)
 {
 	struct net *net = dev_net(vxlan->dev);
 	int err;
@@ -810,7 +811,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 	if (tb[NDA_VNI]) {
 		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
 			return -EINVAL;
-		*vni = nla_get_u32(tb[NDA_VNI]);
+		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
 	} else {
 		*vni = vxlan->default_dst.remote_vni;
 	}
@@ -840,7 +841,8 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	/* struct net *net = dev_net(vxlan->dev); */
 	union vxlan_addr ip;
 	__be16 port;
-	u32 vni, ifindex;
+	__be32 vni;
+	u32 ifindex;
 	int err;
 
 	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
@@ -877,7 +879,8 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	struct vxlan_rdst *rd = NULL;
 	union vxlan_addr ip;
 	__be16 port;
-	u32 vni, ifindex;
+	__be32 vni;
+	u32 ifindex;
 	int err;
 
 	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
@@ -1133,17 +1136,16 @@ static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
 }
 
 static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
-				      size_t hdrlen, u32 data, bool nopartial)
+				      size_t hdrlen, __be32 vni_field,
+				      bool nopartial)
 {
 	size_t start, offset, plen;
 
 	if (skb->remcsum_offload)
 		return vh;
 
-	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
-	offset = start + ((data & VXLAN_RCO_UDP) ?
-			  offsetof(struct udphdr, check) :
-			  offsetof(struct tcphdr, check));
+	start = vxlan_rco_start(vni_field);
+	offset = start + vxlan_rco_offset(vni_field);
 
 	plen = hdrlen + offset + sizeof(u16);
 
@@ -1159,7 +1161,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 }
 
 static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
-		      struct vxlan_metadata *md, u32 vni,
+		      struct vxlan_metadata *md, __be32 vni,
 		      struct metadata_dst *tun_dst)
 {
 	struct iphdr *oip = NULL;
@@ -1257,7 +1259,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct metadata_dst *tun_dst = NULL;
 	struct vxlan_sock *vs;
-	u32 flags, vni;
+	__be32 flags, vni_field;
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 
@@ -1265,8 +1267,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
 		goto error;
 
-	flags = ntohl(vxlan_hdr(skb)->vx_flags);
-	vni = ntohl(vxlan_hdr(skb)->vx_vni);
+	flags = vxlan_hdr(skb)->vx_flags;
+	vni_field = vxlan_hdr(skb)->vx_vni;
 
 	if (flags & VXLAN_HF_VNI) {
 		flags &= ~VXLAN_HF_VNI;
@@ -1283,17 +1285,18 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 
 	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
-		if (!vxlan_remcsum(skb, vxlan_hdr(skb), sizeof(struct vxlanhdr), vni,
+		if (!vxlan_remcsum(skb, vxlan_hdr(skb), sizeof(struct vxlanhdr),
+				   vni_field,
 				   !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)))
 			goto drop;
 
 		flags &= ~VXLAN_HF_RCO;
-		vni &= VXLAN_VNI_MASK;
+		vni_field &= VXLAN_VNI_MASK;
 	}
 
 	if (vxlan_collect_metadata(vs)) {
 		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
-					 cpu_to_be64(vni >> 8), sizeof(*md));
+					 vxlan_vni(vni_field), sizeof(*md));
 
 		if (!tun_dst)
 			goto drop;
@@ -1324,7 +1327,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		flags &= ~VXLAN_GBP_USED_BITS;
 	}
 
-	if (flags || vni & ~VXLAN_VNI_MASK) {
+	if (flags || vni_field & ~VXLAN_VNI_MASK) {
 		/* If there are any unprocessed flags remaining treat
 		 * this as a malformed packet. This behavior diverges from
 		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
@@ -1337,7 +1340,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	vxlan_rcv(vs, skb, md, vni >> 8, tun_dst);
+	vxlan_rcv(vs, skb, md, vxlan_vni(vni_field), tun_dst);
 	return 0;
 
 drop:
@@ -1680,7 +1683,7 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
 		return;
 
 	gbp = (struct vxlanhdr_gbp *)vxh;
-	vxh->vx_flags |= htonl(VXLAN_HF_GBP);
+	vxh->vx_flags |= VXLAN_HF_GBP;
 
 	if (md->gbp & VXLAN_GBP_DONT_LEARN)
 		gbp->dont_learn = 1;
@@ -1700,7 +1703,6 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 	int min_headroom;
 	int err;
 	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
-	u16 hdrlen = sizeof(struct vxlanhdr);
 
 	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
 	    skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -1733,18 +1735,15 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 		return PTR_ERR(skb);
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
-	vxh->vx_flags = htonl(VXLAN_HF_VNI);
-	vxh->vx_vni = vni;
+	vxh->vx_flags = VXLAN_HF_VNI;
+	vxh->vx_vni = vxlan_vni_field(vni);
 
 	if (type & SKB_GSO_TUNNEL_REMCSUM) {
-		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
-			   VXLAN_RCO_SHIFT;
+		unsigned int start;
 
-		if (skb->csum_offset == offsetof(struct udphdr, check))
-			data |= VXLAN_RCO_UDP;
-
-		vxh->vx_vni |= htonl(data);
-		vxh->vx_flags |= htonl(VXLAN_HF_RCO);
+		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
+		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
+		vxh->vx_flags |= VXLAN_HF_RCO;
 
 		if (!skb_is_gso(skb)) {
 			skb->ip_summed = CHECKSUM_NONE;
@@ -1892,7 +1891,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
-	u32 vni;
+	__be32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
@@ -1914,7 +1913,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto drop;
 		}
 		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
-		vni = be64_to_cpu(info->key.tun_id);
+		vni = vxlan_tun_id_to_vni(info->key.tun_id);
 		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
 		if (remote_ip.sa.sa_family == AF_INET)
 			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
@@ -2007,7 +2006,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 		err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
-				      htonl(vni << 8), md, flags, udp_sum);
+				      vni, md, flags, udp_sum);
 		if (err < 0)
 			goto xmit_tx_error;
 
@@ -2065,7 +2064,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
 		skb_scrub_packet(skb, xnet);
 		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
-				      htonl(vni << 8), md, flags, udp_sum);
+				      vni, md, flags, udp_sum);
 		if (err < 0) {
 			dst_release(ndst);
 			return;
@@ -2222,7 +2221,7 @@ static void vxlan_cleanup(unsigned long arg)
 static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
 {
 	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
-	__u32 vni = vxlan->default_dst.remote_vni;
+	__be32 vni = vxlan->default_dst.remote_vni;
 
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
@@ -2837,7 +2836,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	memset(&conf, 0, sizeof(conf));
 
 	if (data[IFLA_VXLAN_ID])
-		conf.vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+		conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
 
 	if (data[IFLA_VXLAN_GROUP]) {
 		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
@@ -2941,7 +2940,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 		break;
 
 	case -EEXIST:
-		pr_info("duplicate VNI %u\n", conf.vni);
+		pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
 		break;
 	}
 
@@ -2999,7 +2998,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		.high = htons(vxlan->cfg.port_max),
 	};
 
-	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
 		goto nla_put_failure;
 
 	if (!vxlan_addr_any(&dst->remote_ip)) {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3f38b40ec4aa..1b85a3b40c5a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -24,11 +24,11 @@ struct vxlanhdr {
 };
 
 /* VXLAN header flags. */
-#define VXLAN_HF_VNI BIT(27)
+#define VXLAN_HF_VNI	cpu_to_be32(BIT(27))
 
 #define VXLAN_N_VID     (1u << 24)
 #define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
-#define VXLAN_VNI_MASK  (VXLAN_VID_MASK << 8)
+#define VXLAN_VNI_MASK	cpu_to_be32(VXLAN_VID_MASK << 8)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
 #define VNI_HASH_BITS	10
@@ -55,14 +55,14 @@ struct vxlanhdr {
  */
 
 /* VXLAN-RCO header flags. */
-#define VXLAN_HF_RCO BIT(21)
+#define VXLAN_HF_RCO	cpu_to_be32(BIT(21))
 
 /* Remote checksum offload header option */
-#define VXLAN_RCO_MASK  0x7f    /* Last byte of vni field */
-#define VXLAN_RCO_UDP   0x80    /* Indicate UDP RCO (TCP when not set *) */
-#define VXLAN_RCO_SHIFT 1       /* Left shift of start */
+#define VXLAN_RCO_MASK	cpu_to_be32(0x7f)  /* Last byte of vni field */
+#define VXLAN_RCO_UDP	cpu_to_be32(0x80)  /* Indicate UDP RCO (TCP when not set *) */
+#define VXLAN_RCO_SHIFT	1		   /* Left shift of start */
 #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1)
-#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT)
+#define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT)
 
 /*
  * VXLAN Group Based Policy Extension (VXLAN_F_GBP):
@@ -105,9 +105,9 @@ struct vxlanhdr_gbp {
 };
 
 /* VXLAN-GBP header flags. */
-#define VXLAN_HF_GBP BIT(31)
+#define VXLAN_HF_GBP	cpu_to_be32(BIT(31))
 
-#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF)
+#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF))
 
 /* skb->mark mapping
  *
@@ -144,7 +144,7 @@ union vxlan_addr {
 struct vxlan_rdst {
 	union vxlan_addr	 remote_ip;
 	__be16			 remote_port;
-	u32			 remote_vni;
+	__be32			 remote_vni;
 	u32			 remote_ifindex;
 	struct list_head	 list;
 	struct rcu_head		 rcu;
@@ -154,7 +154,7 @@ struct vxlan_rdst {
 struct vxlan_config {
 	union vxlan_addr	remote_ip;
 	union vxlan_addr	saddr;
-	u32			vni;
+	__be32			vni;
 	int			remote_ifindex;
 	int			mtu;
 	__be16			dst_port;
@@ -267,6 +267,54 @@ static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb)
 	return (struct vxlanhdr *)(udp_hdr(skb) + 1);
 }
 
+static inline __be32 vxlan_vni(__be32 vni_field)
+{
+#if defined(__BIG_ENDIAN)
+	return vni_field >> 8;
+#else
+	return (vni_field & VXLAN_VNI_MASK) << 8;
+#endif
+}
+
+static inline __be32 vxlan_vni_field(__be32 vni)
+{
+#if defined(__BIG_ENDIAN)
+	return vni << 8;
+#else
+	return vni >> 8;
+#endif
+}
+
+static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id)
+{
+#if defined(__BIG_ENDIAN)
+	return tun_id;
+#else
+	return tun_id >> 32;
+#endif
+}
+
+static inline size_t vxlan_rco_start(__be32 vni_field)
+{
+	return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
+}
+
+static inline size_t vxlan_rco_offset(__be32 vni_field)
+{
+	return (vni_field & VXLAN_RCO_UDP) ?
+		offsetof(struct udphdr, check) :
+		offsetof(struct tcphdr, check);
+}
+
+static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset)
+{
+	__be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT);
+
+	if (offset == offsetof(struct udphdr, check))
+		vni_field |= VXLAN_RCO_UDP;
+	return vni_field;
+}
+
 #if IS_ENABLED(CONFIG_VXLAN)
 void vxlan_get_rx_port(struct net_device *netdev);
 #else
-- 
cgit v1.2.3


From d1b4c689d4130bcfd3532680b64db562300716b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:24 +0100
Subject: netlink: remove mmapped netlink support

mmapped netlink has a number of unresolved issues:

- TX zerocopy support had to be disabled more than a year ago via
  commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.")
  because the content of the mmapped area can change after netlink
  attribute validation but before message processing.

- RX support was implemented mainly to speed up nfqueue dumping packet
  payload to userspace.  However, since commit ae08ce0021087a5d812d2
  ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy
  with the socket-based interface too (via the skb_zerocopy helper).

The other problem is that skbs attached to a mmaped netlink socket
behave differently from normal skbs:

- they don't have a shinfo area, so all functions that use skb_shinfo()
(e.g. skb_clone) cannot be used.

- reserving headroom prevents userspace from seeing the content as
it expects message to start at skb->head.
See for instance
commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump").

- skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we
crash because it needs the sk to check if a tx ring is attached.

This is also not obvious and leads to non-intuitive bug fixes such as
7c7bdf359 ("netfilter: nfnetlink: use original skbuff when acking
batches").

mmaped netlink also didn't play nicely with the skb_zerocopy helper
used by nfqueue and openvswitch.  Daniel Borkmann fixed this via
commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue
zero-copy"), but at the cost of also needing to provide remaining
length to the allocation function.

nfqueue also has problems when used with mmaped rx netlink:
- mmaped netlink doesn't allow use of nfqueue batch verdict messages.
  Problem is that in the mmap case, the allocation time also determines
  the ordering in which the frame will be seen by userspace (A
  allocating before B means that A is located in an earlier ring slot,
  but this also means that B might get a lower sequence number than A
  since seqno is decided later).  To fix this we would need to extend the
  spinlocked region to also cover the allocation and message setup which
  isn't desirable.
- nfqueue can now be configured to queue large (GSO) skbs to userspace.
  Queueing GSO packets is faster than having to force a software segmentation
  in the kernel, so this is a desirable option.  However, with a mmap based
  ring one has to use 64kb per ring slot element, else mmap has to fall back
  to the socket path (NL_MMAP_STATUS_COPY) for all large packets.

To use the mmap interface, userspace not only has to probe for mmap netlink
support, it also has to implement a recv/socket receive path in order to
handle messages that exceed the size of an rx ring element.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netlink_mmap.txt | 332 -------------
 include/uapi/linux/netlink.h              |   4 +
 include/uapi/linux/netlink_diag.h         |   2 +
 net/netlink/Kconfig                       |   9 -
 net/netlink/af_netlink.c                  | 754 +-----------------------------
 net/netlink/af_netlink.h                  |  15 -
 net/netlink/diag.c                        |  39 --
 7 files changed, 15 insertions(+), 1140 deletions(-)
 delete mode 100644 Documentation/networking/netlink_mmap.txt

(limited to 'include')

diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
deleted file mode 100644
index 54f10478e8e3..000000000000
--- a/Documentation/networking/netlink_mmap.txt
+++ /dev/null
@@ -1,332 +0,0 @@
-This file documents how to use memory mapped I/O with netlink.
-
-Author: Patrick McHardy <kaber@trash.net>
-
-Overview
---------
-
-Memory mapped netlink I/O can be used to increase throughput and decrease
-overhead of unicast receive and transmit operations. Some netlink subsystems
-require high throughput, these are mainly the netfilter subsystems
-nfnetlink_queue and nfnetlink_log, but it can also help speed up large
-dump operations of f.i. the routing database.
-
-Memory mapped netlink I/O used two circular ring buffers for RX and TX which
-are mapped into the processes address space.
-
-The RX ring is used by the kernel to directly construct netlink messages into
-user-space memory without copying them as done with regular socket I/O,
-additionally as long as the ring contains messages no recvmsg() or poll()
-syscalls have to be issued by user-space to get more message.
-
-The TX ring is used to process messages directly from user-space memory, the
-kernel processes all messages contained in the ring using a single sendmsg()
-call.
-
-Usage overview
---------------
-
-In order to use memory mapped netlink I/O, user-space needs three main changes:
-
-- ring setup
-- conversion of the RX path to get messages from the ring instead of recvmsg()
-- conversion of the TX path to construct messages into the ring
-
-Ring setup is done using setsockopt() to provide the ring parameters to the
-kernel, then a call to mmap() to map the ring into the processes address space:
-
-- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
-- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
-- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
-
-Usage of either ring is optional, but even if only the RX ring is used the
-mapping still needs to be writable in order to update the frame status after
-processing.
-
-Conversion of the reception path involves calling poll() on the file
-descriptor, once the socket is readable the frames from the ring are
-processed in order until no more messages are available, as indicated by
-a status word in the frame header.
-
-On kernel side, in order to make use of memory mapped I/O on receive, the
-originating netlink subsystem needs to support memory mapped I/O, otherwise
-it will use an allocated socket buffer as usual and the contents will be
- copied to the ring on transmission, nullifying most of the performance gains.
-Dumps of kernel databases automatically support memory mapped I/O.
-
-Conversion of the transmit path involves changing message construction to
-use memory from the TX ring instead of (usually) a buffer declared on the
-stack and setting up the frame header appropriately. Optionally poll() can
-be used to wait for free frames in the TX ring.
-
-Structured and definitions for using memory mapped I/O are contained in
-<linux/netlink.h>.
-
-RX and TX rings
-----------------
-
-Each ring contains a number of continuous memory blocks, containing frames of
-fixed size dependent on the parameters used for ring setup.
-
-Ring:	[ block 0 ]
-		[ frame 0 ]
-		[ frame 1 ]
-	[ block 1 ]
-		[ frame 2 ]
-		[ frame 3 ]
-	...
-	[ block n ]
-		[ frame 2 * n ]
-		[ frame 2 * n + 1 ]
-
-The blocks are only visible to the kernel, from the point of view of user-space
-the ring just contains the frames in a continuous memory zone.
-
-The ring parameters used for setting up the ring are defined as follows:
-
-struct nl_mmap_req {
-	unsigned int	nm_block_size;
-	unsigned int	nm_block_nr;
-	unsigned int	nm_frame_size;
-	unsigned int	nm_frame_nr;
-};
-
-Frames are grouped into blocks, where each block is a continuous region of memory
-and holds nm_block_size / nm_frame_size frames. The total number of frames in
-the ring is nm_frame_nr. The following invariants hold:
-
-- frames_per_block = nm_block_size / nm_frame_size
-
-- nm_frame_nr = frames_per_block * nm_block_nr
-
-Some parameters are constrained, specifically:
-
-- nm_block_size must be a multiple of the architectures memory page size.
-  The getpagesize() function can be used to get the page size.
-
-- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be
-  able to hold at least the frame header
-
-- nm_frame_size must be smaller or equal to nm_block_size
-
-- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
-
-- nm_frame_nr must equal the actual number of frames as specified above.
-
-When the kernel can't allocate physically continuous memory for a ring block,
-it will fall back to use physically discontinuous memory. This might affect
-performance negatively, in order to avoid this the nm_frame_size parameter
-should be chosen to be as small as possible for the required frame size and
-the number of blocks should be increased instead.
-
-Ring frames
-------------
-
-Each frames contain a frame header, consisting of a synchronization word and some
-meta-data, and the message itself.
-
-Frame:	[ header message ]
-
-The frame header is defined as follows:
-
-struct nl_mmap_hdr {
-	unsigned int	nm_status;
-	unsigned int	nm_len;
-	__u32		nm_group;
-	/* credentials */
-	__u32		nm_pid;
-	__u32		nm_uid;
-	__u32		nm_gid;
-};
-
-- nm_status is used for synchronizing processing between the kernel and user-
-  space and specifies ownership of the frame as well as the operation to perform
-
-- nm_len contains the length of the message contained in the data area
-
-- nm_group specified the destination multicast group of message
-
-- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
-  process. These values correspond to the data available using SOCK_PASSCRED in
-  the SCM_CREDENTIALS cmsg.
-
-The possible values in the status word are:
-
-- NL_MMAP_STATUS_UNUSED:
-	RX ring:	frame belongs to the kernel and contains no message
-			for user-space. Approriate action is to invoke poll()
-			to wait for new messages.
-
-	TX ring:	frame belongs to user-space and can be used for
-			message construction.
-
-- NL_MMAP_STATUS_RESERVED:
-	RX ring only:	frame is currently used by the kernel for message
-			construction and contains no valid message yet.
-			Appropriate action is to invoke poll() to wait for
-			new messages.
-
-- NL_MMAP_STATUS_VALID:
-	RX ring:	frame contains a valid message. Approriate action is
-			to process the message and release the frame back to
-			the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED or queue the frame by setting the
-			status to NL_MMAP_STATUS_SKIP.
-
-	TX ring:	the frame contains a valid message from user-space to
-			be processed by the kernel. After completing processing
-			the kernel will release the frame back to user-space by
-			setting the status to NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_COPY:
-	RX ring only:	a message is ready to be processed but could not be
-			stored in the ring, either because it exceeded the
-			frame size or because the originating subsystem does
-			not support memory mapped I/O. Appropriate action is
-			to invoke recvmsg() to receive the message and release
-			the frame back to the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_SKIP:
-	RX ring only:	user-space queued the message for later processing, but
-			processed some messages following it in the ring. The
-			kernel should skip this frame when looking for unused
-			frames.
-
-The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the
-frame header.
-
-TX limitations
---------------
-
-As of Jan 2015 the message is always copied from the ring frame to an
-allocated buffer due to unresolved security concerns.
-See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.").
-
-Example
--------
-
-Ring setup:
-
-	unsigned int block_size = 16 * getpagesize();
-	struct nl_mmap_req req = {
-		.nm_block_size		= block_size,
-		.nm_block_nr		= 64,
-		.nm_frame_size		= 16384,
-		.nm_frame_nr		= 64 * block_size / 16384,
-	};
-	unsigned int ring_size;
-	void *rx_ring, *tx_ring;
-
-	/* Configure ring parameters */
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
-		exit(1);
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
-		exit(1)
-
-	/* Calculate size of each individual ring */
-	ring_size = req.nm_block_nr * req.nm_block_size;
-
-	/* Map RX/TX rings. The TX ring is located after the RX ring */
-	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
-		       MAP_SHARED, fd, 0);
-	if ((long)rx_ring == -1L)
-		exit(1);
-	tx_ring = rx_ring + ring_size:
-
-Message reception:
-
-This example assumes some ring parameters of the ring setup are available.
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	unsigned char buf[16384];
-	ssize_t len;
-
-	while (1) {
-		struct pollfd pfds[1];
-
-		pfds[0].fd	= fd;
-		pfds[0].events	= POLLIN | POLLERR;
-		pfds[0].revents	= 0;
-
-		if (poll(pfds, 1, -1) < 0 && errno != -EINTR)
-			exit(1);
-
-		/* Check for errors. Error handling omitted */
-		if (pfds[0].revents & POLLERR)
-			<handle error>
-
-		/* If no new messages, poll again */
-		if (!(pfds[0].revents & POLLIN))
-			continue;
-
-		/* Process all frames */
-		while (1) {
-			/* Get next frame header */
-			hdr = rx_ring + frame_offset;
-
-			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
-				/* Regular memory mapped frame */
-				nlh = (void *)hdr + NL_MMAP_HDRLEN;
-				len = hdr->nm_len;
-
-				/* Release empty message immediately. May happen
-				 * on error during message construction.
-				 */
-				if (len == 0)
-					goto release;
-			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
-				/* Frame queued to socket receive queue */
-				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
-				if (len <= 0)
-					break;
-				nlh = buf;
-			} else
-				/* No more messages to process, continue polling */
-				break;
-
-			process_msg(nlh);
-release:
-			/* Release frame back to the kernel */
-			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
-
-			/* Advance frame offset to next frame */
-			frame_offset = (frame_offset + frame_size) % ring_size;
-		}
-	}
-
-Message transmission:
-
-This example assumes some ring parameters of the ring setup are available.
-A single message is constructed and transmitted, to send multiple messages
-at once they would be constructed in consecutive frames before a final call
-to sendto().
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	struct sockaddr_nl addr = {
-		.nl_family	= AF_NETLINK,
-	};
-
-	hdr = tx_ring + frame_offset;
-	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
-		/* No frame available. Use poll() to avoid. */
-		exit(1);
-
-	nlh = (void *)hdr + NL_MMAP_HDRLEN;
-
-	/* Build message */
-	build_message(nlh);
-
-	/* Fill frame header: length and status need to be set */
-	hdr->nm_len	= nlh->nlmsg_len;
-	hdr->nm_status	= NL_MMAP_STATUS_VALID;
-
-	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
-		exit(1);
-
-	/* Advance frame offset to next frame */
-	frame_offset = (frame_offset + frame_size) % ring_size;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f095155d8749..0dba4e4ed2be 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -107,8 +107,10 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO			3
 #define NETLINK_BROADCAST_ERROR		4
 #define NETLINK_NO_ENOBUFS		5
+#ifndef __KERNEL__
 #define NETLINK_RX_RING			6
 #define NETLINK_TX_RING			7
+#endif
 #define NETLINK_LISTEN_ALL_NSID		8
 #define NETLINK_LIST_MEMBERSHIPS	9
 #define NETLINK_CAP_ACK			10
@@ -134,6 +136,7 @@ struct nl_mmap_hdr {
 	__u32		nm_gid;
 };
 
+#ifndef __KERNEL__
 enum nl_mmap_status {
 	NL_MMAP_STATUS_UNUSED,
 	NL_MMAP_STATUS_RESERVED,
@@ -145,6 +148,7 @@ enum nl_mmap_status {
 #define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
 #define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
 #define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif
 
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index f2159d30d1f5..d79399394b46 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -48,6 +48,8 @@ enum {
 
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
+#ifndef __KERNEL__
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#endif
 
 #endif
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
 # Netlink Sockets
 #
 
-config NETLINK_MMAP
-	bool "NETLINK: mmaped IO"
-	---help---
-	  This option enables support for memory mapped netlink IO. This
-	  reduces overhead by avoiding copying data between kernel- and
-	  userspace.
-
-	  If unsure, say N.
-
 config NETLINK_DIAG
 	tristate "NETLINK: socket monitoring interface"
 	default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1ffb34e253f..85aa6ef86dfd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 	dev_hold(dev);
 
-	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+	if (is_vmalloc_addr(skb->head))
 		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
 		wake_up_interruptible(&nlk->wait);
 }
 
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	else
-		return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
-	unsigned int i;
-
-	for (i = 0; i < len; i++) {
-		if (pg_vec[i] != NULL) {
-			if (is_vmalloc_addr(pg_vec[i]))
-				vfree(pg_vec[i]);
-			else
-				free_pages((unsigned long)pg_vec[i], order);
-		}
-	}
-	kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
-	void *buffer;
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
-			  __GFP_NOWARN | __GFP_NORETRY;
-
-	buffer = (void *)__get_free_pages(gfp_flags, order);
-	if (buffer != NULL)
-		return buffer;
-
-	buffer = vzalloc((1 << order) * PAGE_SIZE);
-	if (buffer != NULL)
-		return buffer;
-
-	gfp_flags &= ~__GFP_NORETRY;
-	return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
-			   struct nl_mmap_req *req, unsigned int order)
-{
-	unsigned int block_nr = req->nm_block_nr;
-	unsigned int i;
-	void **pg_vec;
-
-	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
-	if (pg_vec == NULL)
-		return NULL;
-
-	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (pg_vec[i] == NULL)
-			goto err1;
-	}
-
-	return pg_vec;
-err1:
-	free_pg_vec(pg_vec, order, block_nr);
-	return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
-		   unsigned int order)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct sk_buff_head *queue;
-	struct netlink_ring *ring;
-
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	spin_lock_bh(&queue->lock);
-
-	ring->frame_max		= req->nm_frame_nr - 1;
-	ring->head		= 0;
-	ring->frame_size	= req->nm_frame_size;
-	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-	swap(ring->pg_vec_len, req->nm_block_nr);
-	swap(ring->pg_vec_order, order);
-	swap(ring->pg_vec, pg_vec);
-
-	__skb_queue_purge(queue);
-	spin_unlock_bh(&queue->lock);
-
-	WARN_ON(atomic_read(&nlk->mapped));
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool tx_ring)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	void **pg_vec = NULL;
-	unsigned int order = 0;
-
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	if (atomic_read(&nlk->mapped))
-		return -EBUSY;
-	if (atomic_read(&ring->pending))
-		return -EBUSY;
-
-	if (req->nm_block_nr) {
-		if (ring->pg_vec != NULL)
-			return -EBUSY;
-
-		if ((int)req->nm_block_size <= 0)
-			return -EINVAL;
-		if (!PAGE_ALIGNED(req->nm_block_size))
-			return -EINVAL;
-		if (req->nm_frame_size < NL_MMAP_HDRLEN)
-			return -EINVAL;
-		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
-			return -EINVAL;
-
-		ring->frames_per_block = req->nm_block_size /
-					 req->nm_frame_size;
-		if (ring->frames_per_block == 0)
-			return -EINVAL;
-		if (ring->frames_per_block * req->nm_block_nr !=
-		    req->nm_frame_nr)
-			return -EINVAL;
-
-		order = get_order(req->nm_block_size);
-		pg_vec = alloc_pg_vec(nlk, req, order);
-		if (pg_vec == NULL)
-			return -ENOMEM;
-	} else {
-		if (req->nm_frame_nr)
-			return -EINVAL;
-	}
-
-	mutex_lock(&nlk->pg_vec_lock);
-	if (atomic_read(&nlk->mapped) == 0) {
-		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
-		mutex_unlock(&nlk->pg_vec_lock);
-		return 0;
-	}
-
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-
-	return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
-	.open	= netlink_mm_open,
-	.close	= netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
-			struct vm_area_struct *vma)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	unsigned long start, size, expected;
-	unsigned int i;
-	int err = -EINVAL;
-
-	if (vma->vm_pgoff)
-		return -EINVAL;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
-
-	if (expected == 0)
-		goto out;
-
-	size = vma->vm_end - vma->vm_start;
-	if (size != expected)
-		goto out;
-
-	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
-		for (i = 0; i < ring->pg_vec_len; i++) {
-			struct page *page;
-			void *kaddr = ring->pg_vec[i];
-			unsigned int pg_num;
-
-			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
-				page = pgvec_to_page(kaddr);
-				err = vm_insert_page(vma, start, page);
-				if (err < 0)
-					goto out;
-				start += PAGE_SIZE;
-				kaddr += PAGE_SIZE;
-			}
-		}
-	}
-
-	atomic_inc(&nlk->mapped);
-	vma->vm_ops = &netlink_mmap_ops;
-	err = 0;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	struct page *p_start, *p_end;
-
-	/* First page is flushed through netlink_{get,set}_status */
-	p_start = pgvec_to_page(hdr + PAGE_SIZE);
-	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
-	while (p_start <= p_end) {
-		flush_dcache_page(p_start);
-		p_start++;
-	}
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
-	smp_rmb();
-	flush_dcache_page(pgvec_to_page(hdr));
-	return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
-			       enum nl_mmap_status status)
-{
-	smp_mb();
-	hdr->nm_status = status;
-	flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
-	unsigned int pg_vec_pos, frame_off;
-
-	pg_vec_pos = pos / ring->frames_per_block;
-	frame_off  = pos % ring->frames_per_block;
-
-	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
-		     enum nl_mmap_status status)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = __netlink_lookup_frame(ring, pos);
-	if (netlink_get_status(hdr) != status)
-		return NULL;
-
-	return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
-		      enum nl_mmap_status status)
-{
-	return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
-	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, ring->head);
-		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
-			break;
-		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-			break;
-		netlink_increment_head(ring);
-	} while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head, pos = head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, pos);
-		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
-			return true;
-		pos = pos != 0 ? pos - 1 : ring->frame_max;
-	} while (pos != head);
-
-	return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-	unsigned int n;
-
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		return false;
-
-	n = ring->head + ring->frame_max / 2;
-	if (n > ring->frame_max)
-		n -= ring->frame_max;
-
-	hdr = __netlink_lookup_frame(ring, n);
-
-	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
-				 poll_table *wait)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	unsigned int mask;
-	int err;
-
-	if (nlk->rx_ring.pg_vec != NULL) {
-		/* Memory mapped sockets don't call recvmsg(), so flow control
-		 * for dumps is performed here. A dump is allowed to continue
-		 * if at least half the ring is unused.
-		 */
-		while (nlk->cb_running && netlink_dump_space(nlk)) {
-			err = netlink_dump(sk);
-			if (err < 0) {
-				sk->sk_err = -err;
-				sk->sk_error_report(sk);
-				break;
-			}
-		}
-		netlink_rcv_wake(sk);
-	}
-
-	mask = datagram_poll(file, sock, wait);
-
-	/* We could already have received frames in the normal receive
-	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
-	 * so if mask contains pollin/etc already, there's no point
-	 * walking the ring.
-	 */
-	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (nlk->rx_ring.pg_vec) {
-			if (netlink_has_valid_frame(&nlk->rx_ring))
-				mask |= POLLIN | POLLRDNORM;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
-
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
-	return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
-	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
-				   struct netlink_ring *ring,
-				   struct nl_mmap_hdr *hdr)
-{
-	unsigned int size;
-	void *data;
-
-	size = ring->frame_size - NL_MMAP_HDRLEN;
-	data = (void *)hdr + NL_MMAP_HDRLEN;
-
-	skb->head	= data;
-	skb->data	= data;
-	skb_reset_tail_pointer(skb);
-	skb->end	= skb->tail + size;
-	skb->len	= 0;
-
-	skb->destructor	= netlink_skb_destructor;
-	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
-	NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct scm_cookie *scm)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	int err = 0, len = 0;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		unsigned int nm_len;
-
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-
-		nm_len = ACCESS_ONCE(hdr->nm_len);
-		if (nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr, nm_len);
-
-		skb = alloc_skb(nm_len, GFP_KERNEL);
-		if (skb == NULL) {
-			err = -ENOBUFS;
-			goto out;
-		}
-		__skb_put(skb, nm_len);
-		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
-		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = netlink_mmap_hdr(skb);
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_frame_flush_dcache(hdr, hdr->nm_len);
-	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
-	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
-	kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL) {
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		kfree_skb(skb);
-		netlink_overrun(sk);
-		return;
-	}
-	netlink_increment_head(ring);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
-#define netlink_mmap			sock_no_mmap
-#define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
-#endif /* CONFIG_NETLINK_MMAP */
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	struct nl_mmap_hdr *hdr;
-	struct netlink_ring *ring;
-	struct sock *sk;
-
-	/* If a packet from the kernel to userspace was freed because of an
-	 * error without being delivered to userspace, the kernel must reset
-	 * the status. In the direction userspace to kernel, the status is
-	 * always reset here after the packet was processed and freed.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
-
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
-
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
-
-		skb->head = NULL;
-	}
-#endif
 	if (is_vmalloc_addr(skb->head)) {
 		if (!skb->cloned ||
 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
-	if (1) {
-		struct nl_mmap_req req;
-
-		memset(&req, 0, sizeof(req));
-		if (nlk->rx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, false, NULL, 0);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, true, NULL, 0);
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
-	mutex_init(&nlk->pg_vec_lock);
-#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
-	    !netlink_skb_is_mmaped(skb)) {
+	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 
 	netlink_deliver_tap(skb);
 
-#ifdef CONFIG_NETLINK_MMAP
-	if (netlink_skb_is_mmaped(skb))
-		netlink_queue_mmaped_skb(sk, skb);
-	else if (netlink_rx_is_mmaped(sk))
-		netlink_ring_set_copied(sk, skb);
-	else
-#endif /* CONFIG_NETLINK_MMAP */
-		skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
 	return len;
 }
@@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
-	if (netlink_skb_is_mmaped(skb))
-		return skb;
-
 	delta = skb->end - skb->tail;
 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
@@ -1880,71 +1252,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
 				    unsigned int ldiff, u32 dst_portid,
 				    gfp_t gfp_mask)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	unsigned int maxlen, linear_size;
-	struct sock *sk = NULL;
-	struct sk_buff *skb;
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-
-	sk = netlink_getsockbyportid(ssk, dst_portid);
-	if (IS_ERR(sk))
-		goto out;
-
-	ring = &nlk_sk(sk)->rx_ring;
-	/* fast-path without atomic ops for common case: non-mmaped receiver */
-	if (ring->pg_vec == NULL)
-		goto out_put;
-
-	/* We need to account the full linear size needed as a ring
-	 * slot cannot have non-linear parts.
-	 */
-	linear_size = size + ldiff;
-	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
-		goto out_put;
-
-	skb = alloc_skb_head(gfp_mask);
-	if (skb == NULL)
-		goto err1;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	/* check again under lock */
-	if (ring->pg_vec == NULL)
-		goto out_free;
-
-	/* check again under lock */
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < linear_size)
-		goto out_free;
-
-	netlink_forward_ring(ring);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		goto err2;
-
-	netlink_ring_setup_skb(skb, sk, ring, hdr);
-	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-	atomic_inc(&ring->pending);
-	netlink_increment_head(ring);
-
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	return skb;
-
-err2:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	netlink_overrun(sk);
-err1:
-	sock_put(sk);
-	return NULL;
-
-out_free:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
-	sock_put(sk);
-out:
-#endif
 	return alloc_skb(size, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
-	    optlen >= sizeof(int) &&
+	if (optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
-#ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
-		struct nl_mmap_req req;
-
-		/* Rings might consume more memory than queue limits, require
-		 * CAP_NET_ADMIN.
-		 */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (optlen < sizeof(req))
-			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
-			return -EFAULT;
-		err = netlink_set_ring(sk, &req,
-				       optname == NETLINK_TX_RING);
-		break;
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 	case NETLINK_LISTEN_ALL_NSID:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
 			return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		smp_rmb();
 	}
 
-	/* It's a really convoluted way for userland to ask for mmaped
-	 * sendmsg(), but that's what we've got...
-	 */
-	if (netlink_tx_is_mmaped(sk) &&
-	    iter_is_iovec(&msg->msg_iter) &&
-	    msg->msg_iter.nr_segs == 1 &&
-	    msg->msg_iter.iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   &scm);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
-	if (!netlink_rx_is_mmaped(sk) &&
-	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto errout_skb;
 
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2831,8 +2105,7 @@ static int netlink_dump(struct sock *sk)
 	 * reasonable static buffer based on the expected largest dump of a
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
-	if (!netlink_rx_is_mmaped(sk))
-		skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
@@ -2884,16 +2157,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	/* Memory mapped dump requests need to be copied to avoid looping
-	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
-	 * a reference to the skb.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL)
-			return -ENOBUFS;
-	} else
-		atomic_inc(&skb->users);
+	atomic_inc(&skb->users);
 
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
@@ -3241,7 +2505,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		netlink_poll,
+	.poll =		datagram_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -3249,7 +2513,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		netlink_mmap,
+	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
 	int			(*netlink_bind)(struct net *net, int group);
 	void			(*netlink_unbind)(struct net *net, int group);
 	struct module		*module;
-#ifdef CONFIG_NETLINK_MMAP
-	struct mutex		pg_vec_lock;
-	struct netlink_ring	rx_ring;
-	struct netlink_ring	tx_ring;
-	atomic_t		mapped;
-#endif /* CONFIG_NETLINK_MMAP */
 
 	struct rhash_head	node;
 	struct rcu_head		rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }
 
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
-	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
-	return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
 struct netlink_table {
 	struct rhashtable	hash;
 	struct hlist_head	mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
 
 #include "af_netlink.h"
 
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
-			    struct sk_buff *nlskb)
-{
-	struct netlink_diag_ring ndr;
-
-	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-	ndr.ndr_block_nr   = ring->pg_vec_len;
-	ndr.ndr_frame_size = ring->frame_size;
-	ndr.ndr_frame_nr   = ring->frame_max + 1;
-
-	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	int ret;
-
-	mutex_lock(&nlk->pg_vec_lock);
-	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
-	if (!ret)
-		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
-				       nlskb);
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	return 0;
-}
-#endif
-
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
-	    sk_diag_put_rings_cfg(sk, skb))
-		goto out_nlmsg_trim;
-
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3


From 263ea09084d172cac6e40459a690babe8de8e448 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:26 +0100
Subject: Revert "genl: Add genlmsg_new_unicast() for unicast message
 allocation"

This reverts commit bb9b18fb55b0 ("genl: Add genlmsg_new_unicast() for
unicast message allocation").

Nothing wrong with it; it's no longer needed since this was only for
mmapped netlink support.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h    |  4 ----
 net/netlink/genetlink.c    | 21 ---------------------
 net/openvswitch/datapath.c | 10 +++++-----
 net/tipc/netlink_compat.c  |  1 -
 4 files changed, 5 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 43c0e771f417..8d4608ce8716 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -83,7 +83,6 @@ struct genl_family {
  * @attrs: netlink attributes
  * @_net: network namespace
  * @user_ptr: user pointers
- * @dst_sk: destination socket
  */
 struct genl_info {
 	u32			snd_seq;
@@ -94,7 +93,6 @@ struct genl_info {
 	struct nlattr **	attrs;
 	possible_net_t		_net;
 	void *			user_ptr[2];
-	struct sock *		dst_sk;
 };
 
 static inline struct net *genl_info_net(struct genl_info *info)
@@ -188,8 +186,6 @@ int genl_unregister_family(struct genl_family *family);
 void genl_notify(struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
 
-struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
-				    gfp_t flags);
 void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
 		  struct genl_family *family, int flags, u8 cmd);
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 0ffd721126e7..a09132a69869 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -462,26 +462,6 @@ int genl_unregister_family(struct genl_family *family)
 }
 EXPORT_SYMBOL(genl_unregister_family);
 
-/**
- * genlmsg_new_unicast - Allocate generic netlink message for unicast
- * @payload: size of the message payload
- * @info: information on destination
- * @flags: the type of memory to allocate
- *
- * Allocates a new sk_buff large enough to cover the specified payload
- * plus required Netlink headers. Will check receiving socket for
- * memory mapped i/o capability and use it if enabled. Will fall back
- * to non-mapped skb if message size exceeds the frame size of the ring.
- */
-struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
-				    gfp_t flags)
-{
-	size_t len = nlmsg_total_size(genlmsg_total_size(payload));
-
-	return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags);
-}
-EXPORT_SYMBOL_GPL(genlmsg_new_unicast);
-
 /**
  * genlmsg_put - Add generic netlink header to netlink message
  * @skb: socket buffer holding the message
@@ -642,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	info.genlhdr = nlmsg_data(nlh);
 	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
 	info.attrs = attrbuf;
-	info.dst_sk = skb->sk;
 	genl_info_net_set(&info, net);
 	memset(&info.user_ptr, 0, sizeof(info.user_ptr));
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 35a2659a277e..c4e8455d5d56 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1477,7 +1477,7 @@ error:
 	return -EMSGSIZE;
 }
 
-static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
+static struct sk_buff *ovs_dp_cmd_alloc_info(void)
 {
 	return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
 }
@@ -1532,7 +1532,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
 		goto err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1653,7 +1653,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1686,7 +1686,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1719,7 +1719,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 2c016fdefe97..de66d8f945ed 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1104,7 +1104,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
 	req_nlh = (struct nlmsghdr *)skb->data;
 	msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
 	msg.cmd = req_userhdr->cmd;
-	msg.dst_sk = info->dst_sk;
 	msg.net = genl_info_net(info);
 
 	if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
-- 
cgit v1.2.3


From 905f0a739ad82c6371fb0cb0e71db14a750702ad Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:27 +0100
Subject: nfnetlink: remove nfnetlink_alloc_skb

Following mmapped netlink removal this code can be simplified by
removing the alloc wrapper.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h | 2 --
 net/netfilter/nfnetlink.c           | 7 -------
 net/netfilter/nfnetlink_log.c       | 5 ++---
 3 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index ba0d9789eb6e..1d82dd5e9a08 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -34,8 +34,6 @@ int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
 int nfnetlink_has_listeners(struct net *net, unsigned int group);
-struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
-				    u32 dst_portid, gfp_t gfp_mask);
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags);
 int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a7ba23353dab..9a99f686d06f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
 
-struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
-				    u32 dst_portid, gfp_t gfp_mask)
-{
-	return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
-
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags)
 {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 8ca932057c13..11f81c8385fc 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
 	 * message.  WARNING: has to be <= 128k due to slab restrictions */
 
 	n = max(inst_size, pkt_size);
-	skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
+	skb = alloc_skb(n, GFP_ATOMIC);
 	if (!skb) {
 		if (n > pkt_size) {
 			/* try to allocate only as much as we need for current
 			 * packet */
 
-			skb = nfnetlink_alloc_skb(net, pkt_size,
-						  peer_portid, GFP_ATOMIC);
+			skb = alloc_skb(pkt_size, GFP_ATOMIC);
 		}
 	}
 
-- 
cgit v1.2.3


From c5b0db3263b92526bc0c1b6380c0c99f91f069fc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:28 +0100
Subject: nfnetlink: Revert "nfnetlink: add support for memory mapped netlink"

Reverts commit 3ab1f683bf8b ("nfnetlink: add support for memory mapped
netlink").

Like previous commits in the series, remove wrappers that are not needed
after mmapped netlink removal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h         | 10 ----------
 net/netfilter/nfnetlink_queue.c |  6 ++----
 net/netlink/af_netlink.c        | 20 ++++----------------
 3 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 0b41959aab9f..da14ab61f363 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -69,16 +69,6 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group)
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
 
-extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
-					   unsigned int ldiff, u32 dst_portid,
-					   gfp_t gfp_mask);
-static inline struct sk_buff *
-netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
-		  gfp_t gfp_mask)
-{
-	return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
-}
-
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 			     __u32 group, gfp_t allocation);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 1d3936587ace..75429997ed41 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			   __be32 **packet_id_ptr)
 {
 	size_t size;
-	size_t data_len = 0, cap_len = 0, rem_len = 0;
+	size_t data_len = 0, cap_len = 0;
 	unsigned int hlen = 0;
 	struct sk_buff *skb;
 	struct nlattr *nla;
@@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		hlen = min_t(unsigned int, hlen, data_len);
 		size += sizeof(struct nlattr) + hlen;
 		cap_len = entskb->len;
-		rem_len = data_len - hlen;
 		break;
 	}
 
@@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			size += nla_total_size(seclen);
 	}
 
-	skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
-				  GFP_ATOMIC);
+	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb) {
 		skb_tx_error(entskb);
 		return NULL;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 85aa6ef86dfd..c8416792cce0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1248,14 +1248,6 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
-struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
-				    unsigned int ldiff, u32 dst_portid,
-				    gfp_t gfp_mask)
-{
-	return alloc_skb(size, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
-
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
 	int res = 0;
@@ -2082,15 +2074,12 @@ static int netlink_dump(struct sock *sk)
 
 	if (alloc_min_size < nlk->max_recvmsg_len) {
 		alloc_size = nlk->max_recvmsg_len;
-		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
-					GFP_KERNEL |
-					__GFP_NOWARN |
-					__GFP_NORETRY);
+		skb = alloc_skb(alloc_size, GFP_KERNEL |
+					    __GFP_NOWARN | __GFP_NORETRY);
 	}
 	if (!skb) {
 		alloc_size = alloc_min_size;
-		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
-					GFP_KERNEL);
+		skb = alloc_skb(alloc_size, GFP_KERNEL);
 	}
 	if (!skb)
 		goto errout_skb;
@@ -2230,8 +2219,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
 	if (!(nlk->flags & NETLINK_F_CAP_ACK) && err)
 		payload += nlmsg_len(nlh);
 
-	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
-				NETLINK_CB(in_skb).portid, GFP_KERNEL);
+	skb = nlmsg_new(payload, GFP_KERNEL);
 	if (!skb) {
 		struct sock *sk;
 
-- 
cgit v1.2.3


From 07dabf20d9867710b90b91108b2adcd448773e25 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 18 Feb 2016 19:19:29 +0100
Subject: vxlan: tun_id is 64bit, not 32bit

The tun_id field in struct ip_tunnel_key is __be64, not __be32. We need to
convert the vni to tun_id correctly.

Fixes: 54bfd872bf16 ("vxlan: keep flags and vni in network byte order")
Reported-by: Paolo Abeni <pabeni@redhat.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 5 +++--
 include/net/vxlan.h | 9 +++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3a84680b5117..75bccb360599 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1310,9 +1310,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 
 	if (vxlan_collect_metadata(vs)) {
+		__be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
+
 		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
-					 vxlan_vni(vxlan_hdr(skb)->vx_vni),
-					 sizeof(*md));
+					 vxlan_vni_to_tun_id(vni), sizeof(*md));
 
 		if (!tun_dst)
 			goto drop;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 1b85a3b40c5a..748083de367a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -294,6 +294,15 @@ static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id)
 #endif
 }
 
+static inline __be64 vxlan_vni_to_tun_id(__be32 vni)
+{
+#if defined(__BIG_ENDIAN)
+	return (__be64)vni;
+#else
+	return (__be64)vni << 32;
+#endif
+}
+
 static inline size_t vxlan_rco_start(__be32 vni_field)
 {
 	return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
-- 
cgit v1.2.3


From 7f290c94352e59b1d720055fce760a69a63bd0a1 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 18 Feb 2016 11:22:52 +0100
Subject: iptunnel: scrub packet in iptunnel_pull_header

Part of skb_scrub_packet was open coded in iptunnel_pull_header. Let it call
skb_scrub_packet directly instead.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      | 4 ++--
 drivers/net/vxlan.c       | 4 ++--
 include/net/ip_tunnels.h  | 3 ++-
 net/ipv4/ip_gre.c         | 2 +-
 net/ipv4/ip_tunnel_core.c | 8 +++-----
 net/ipv4/ipip.c           | 2 +-
 net/ipv6/sit.c            | 2 +-
 7 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4ceccf871b3f..dfbe3ca687f7 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -237,7 +237,6 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
 	}
 
 	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
 	skb->protocol = eth_type_trans(skb, geneve->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
@@ -356,7 +355,8 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	opts_len = geneveh->opt_len * 4;
 	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
-				 htons(ETH_P_TEB)))
+				 htons(ETH_P_TEB),
+				 !net_eq(geneve->net, dev_net(geneve->dev))))
 		goto drop;
 
 	geneve_rx(geneve, gs, skb);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 16a176cd0dad..c963897e713d 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1198,7 +1198,6 @@ static void vxlan_rcv(struct vxlan_dev *vxlan, struct vxlan_sock *vs,
 	int err = 0;
 
 	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
 	skb->protocol = eth_type_trans(skb, vxlan->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
@@ -1305,7 +1304,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!vxlan)
 		goto drop;
 
-	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
+	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB),
+				 !net_eq(vxlan->net, dev_net(vxlan->dev))))
 		goto drop;
 
 	if (vxlan_collect_metadata(vs)) {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 87408ab80856..4dd616376fec 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -270,7 +270,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 	return INET_ECN_encapsulate(tos, inner);
 }
 
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+			 bool xnet);
 void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, u8 proto,
 		   u8 tos, u8 ttl, __be16 df, bool xnet);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 917c2c1bfadd..12071e28d958 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return -EINVAL;
 		}
 	}
-	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+	return iptunnel_pull_header(skb, hdr_len, tpi->proto, false);
 }
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index a6e58b6141cd..eaca2449a09a 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(iptunnel_xmit);
 
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+			 bool xnet)
 {
 	if (unlikely(!pskb_may_pull(skb, hdr_len)))
 		return -ENOMEM;
@@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 		skb->protocol = inner_proto;
 	}
 
-	nf_reset(skb);
-	secpath_reset(skb);
 	skb_clear_hash_if_not_l4(skb);
-	skb_dst_drop(skb);
 	skb->vlan_tci = 0;
 	skb_set_queue_mapping(skb, 0);
-	skb->pkt_type = PACKET_HOST;
+	skb_scrub_packet(skb, xnet);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iptunnel_pull_header);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 6ec5b42fd172..ec51d02166de 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
-		if (iptunnel_pull_header(skb, 0, tpi.proto))
+		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
 			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0625ac6356b5..f45b8ffc2840 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb)
 
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
-		if (iptunnel_pull_header(skb, 0, tpi.proto))
+		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
 			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
-- 
cgit v1.2.3


From e550785c30f639b3cc6ca70c489a6463ff298453 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Wed, 17 Feb 2016 16:20:33 -0800
Subject: ipv6: Annotate change of locking mechanism for np->opt

Follows up commit 45f6fad84cc3 ("ipv6: add complete rcu protection around
np->opt"), which added mixed rcu/refcount protection to np->opt.

Given the current implementation of rcu_pointer_handoff(), this has no
effect at runtime.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 6570f379aba2..f3c9857c645d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -259,8 +259,12 @@ static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
 
 	rcu_read_lock();
 	opt = rcu_dereference(np->opt);
-	if (opt && !atomic_inc_not_zero(&opt->refcnt))
-		opt = NULL;
+	if (opt) {
+		if (!atomic_inc_not_zero(&opt->refcnt))
+			opt = NULL;
+		else
+			opt = rcu_pointer_handoff(opt);
+	}
 	rcu_read_unlock();
 	return opt;
 }
-- 
cgit v1.2.3


From 9e74a6dadbbf31ac18a2712048bf866c8e32aab2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 17 Feb 2016 11:23:55 -0800
Subject: net: Optimize local checksum offload

This patch takes advantage of several assumptions we can make about the
headers of the frame in order to reduce overall processing overhead for
computing the outer header checksum.

First we can assume the entire header is in the region pointed to by
skb->head as this is what csum_start is based on.

Second, as a result of our first assumption, we can just call csum_partial
instead of making a call to skb_checksum which would end up having to
configure things so that we could walk through the frags list.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39206751463e..89b536796e53 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3713,19 +3713,18 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
  */
 static inline __wsum lco_csum(struct sk_buff *skb)
 {
-	char *inner_csum_field;
-	__wsum csum;
+	unsigned char *csum_start = skb_checksum_start(skb);
+	unsigned char *l4_hdr = skb_transport_header(skb);
+	__wsum partial;
 
 	/* Start with complement of inner checksum adjustment */
-	inner_csum_field = skb->data + skb_checksum_start_offset(skb) +
-				skb->csum_offset;
-	csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field);
+	partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
+						    skb->csum_offset));
+
 	/* Add in checksum of our headers (incl. outer checksum
-	 * adjustment filled in by caller)
+	 * adjustment filled in by caller) and return result.
 	 */
-	csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum);
-	/* The result is the checksum from skb->data to end of packet */
-	return csum;
+	return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
 }
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 3f9b4a6972d50562613daa649ed064244e6bc7bb Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Thu, 18 Feb 2016 17:00:39 +0200
Subject: qed: Lay infrastructure for vlan filtering offload

Today, interfaces are working in vlan-promisc mode. But once
vlan filtering offload is supported, we'll need a method to
control it directly [e.g., when setting the device to PROMISC, or when
running out of vlan credits].

This adds the necessary API for L2 client to manually choose whether to
accept all vlans or only those for which filters were configured.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c | 23 +++++++++++++++++++----
 include/linux/qed/qed_eth_if.h           |  2 ++
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 978d07a61bbf..73feaf7eedb8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -124,6 +124,8 @@ struct qed_sp_vport_update_params {
 	u8				update_vport_active_tx_flg;
 	u8				vport_active_tx_flg;
 	u8				update_approx_mcast_flg;
+	u8				update_accept_any_vlan_flg;
+	u8				accept_any_vlan;
 	unsigned long			bins[8];
 	struct qed_rss_params		*rss_params;
 	struct qed_filter_accept_flags	accept_flags;
@@ -393,7 +395,9 @@ qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 	p_cmn->update_rx_active_flg = p_params->update_vport_active_rx_flg;
 	p_cmn->tx_active_flg = p_params->vport_active_tx_flg;
 	p_cmn->update_tx_active_flg = p_params->update_vport_active_tx_flg;
-
+	p_cmn->accept_any_vlan = p_params->accept_any_vlan;
+	p_cmn->update_accept_any_vlan_flg =
+			p_params->update_accept_any_vlan_flg;
 	rc = qed_sp_vport_update_rss(p_hwfn, p_ramrod, p_rss_params);
 	if (rc) {
 		/* Return spq entry which is taken in qed_sp_init_request()*/
@@ -444,8 +448,10 @@ static int qed_sp_vport_stop(struct qed_hwfn *p_hwfn,
 static int qed_filter_accept_cmd(struct qed_dev *cdev,
 				 u8 vport,
 				 struct qed_filter_accept_flags accept_flags,
-				 enum spq_mode comp_mode,
-				 struct qed_spq_comp_cb *p_comp_data)
+				 u8 update_accept_any_vlan,
+				 u8 accept_any_vlan,
+				enum spq_mode comp_mode,
+				struct qed_spq_comp_cb *p_comp_data)
 {
 	struct qed_sp_vport_update_params vport_update_params;
 	int i, rc;
@@ -454,6 +460,8 @@ static int qed_filter_accept_cmd(struct qed_dev *cdev,
 	memset(&vport_update_params, 0, sizeof(vport_update_params));
 	vport_update_params.vport_id = vport;
 	vport_update_params.accept_flags = accept_flags;
+	vport_update_params.update_accept_any_vlan_flg = update_accept_any_vlan;
+	vport_update_params.accept_any_vlan = accept_any_vlan;
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
@@ -471,6 +479,10 @@ static int qed_filter_accept_cmd(struct qed_dev *cdev,
 			   "Accept filter configured, flags = [Rx]%x [Tx]%x\n",
 			   accept_flags.rx_accept_filter,
 			   accept_flags.tx_accept_filter);
+		if (update_accept_any_vlan)
+			DP_VERBOSE(p_hwfn, QED_MSG_SP,
+				   "accept_any_vlan=%d configured\n",
+				   accept_any_vlan);
 	}
 
 	return 0;
@@ -1347,6 +1359,9 @@ static int qed_update_vport(struct qed_dev *cdev,
 		params->update_vport_active_flg;
 	sp_params.vport_active_rx_flg = params->vport_active_flg;
 	sp_params.vport_active_tx_flg = params->vport_active_flg;
+	sp_params.accept_any_vlan = params->accept_any_vlan;
+	sp_params.update_accept_any_vlan_flg =
+		params->update_accept_any_vlan_flg;
 
 	/* RSS - is a bit tricky, since upper-layer isn't familiar with hwfns.
 	 * We need to re-fix the rss values per engine for CMT.
@@ -1566,7 +1581,7 @@ static int qed_configure_filter_rx_mode(struct qed_dev *cdev,
 	else if (type == QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC)
 		accept_flags.rx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED;
 
-	return qed_filter_accept_cmd(cdev, 0, accept_flags,
+	return qed_filter_accept_cmd(cdev, 0, accept_flags, false, false,
 				     QED_SPQ_MODE_CB, NULL);
 }
 
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 81ab178e31c1..e53b0ca49e41 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -33,6 +33,8 @@ struct qed_update_vport_params {
 	u8 vport_id;
 	u8 update_vport_active_flg;
 	u8 vport_active_flg;
+	u8 update_accept_any_vlan_flg;
+	u8 accept_any_vlan;
 	u8 update_rss_flg;
 	struct qed_update_vport_rss_params rss_params;
 };
-- 
cgit v1.2.3


From 2125715635053d4207a756a35aa718f548824e58 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 16 Feb 2016 12:46:54 +0100
Subject: bridge: mdb: add support for more attributes and export timer

Currently mdb entries are exported directly as a structure inside
MDBA_MDB_ENTRY_INFO attribute, we can't really extend it without
breaking user-space. In order to export new mdb fields, I've converted
the MDBA_MDB_ENTRY_INFO into a nested attribute which starts like before
with struct br_mdb_entry (without header, as it's cast directly in
iproute2) and continues with MDBA_MDB_EATTR_ attributes. This way we
keep compatibility with older users and can export new data.
I've tested this with iproute2, both with and without support for the
added attribute and it works fine.
So basically we again have MDBA_MDB_ENTRY_INFO with struct br_mdb_entry
inside but it may contain also some additional MDBA_MDB_EATTR_ attributes
such as MDBA_MDB_EATTR_TIMER which can be parsed by user-space.

So the new structure is:
[MDBA_MDB] = {
     [MDBA_MDB_ENTRY] = {
         [MDBA_MDB_ENTRY_INFO]
         [MDBA_MDB_ENTRY_INFO] { <- Nested attribute
             struct br_mdb_entry <- nla_put_nohdr()
             [MDBA_MDB_EATTR attributes] <- normal netlink attributes
         }
     }
}

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 13 ++++++++++++-
 net/bridge/br_mdb.c            | 16 +++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ec3547234998..0890b217580d 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -137,7 +137,10 @@ struct bridge_vlan_info {
 /* Bridge multicast database attributes
  * [MDBA_MDB] = {
  *     [MDBA_MDB_ENTRY] = {
- *         [MDBA_MDB_ENTRY_INFO]
+ *         [MDBA_MDB_ENTRY_INFO] {
+ *		struct br_mdb_entry
+ *		[MDBA_MDB_EATTR attributes]
+ *         }
  *     }
  * }
  * [MDBA_ROUTER] = {
@@ -166,6 +169,14 @@ enum {
 };
 #define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
 
+/* per mdb entry additional attributes */
+enum {
+	MDBA_MDB_EATTR_UNSPEC,
+	MDBA_MDB_EATTR_TIMER,
+	__MDBA_MDB_EATTR_MAX
+};
+#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e66619171386..cf51b7bcb5d5 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -88,11 +88,13 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			for (pp = &mp->ports;
 			     (p = rcu_dereference(*pp)) != NULL;
 			      pp = &p->next) {
+				struct nlattr *nest_ent;
 				struct br_mdb_entry e;
 
 				port = p->port;
 				if (!port)
 					continue;
+
 				memset(&e, 0, sizeof(e));
 				e.ifindex = port->dev->ifindex;
 				e.vid = p->addr.vid;
@@ -104,11 +106,23 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 					e.addr.u.ip6 = p->addr.u.ip6;
 #endif
 				e.addr.proto = p->addr.proto;
-				if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+				nest_ent = nla_nest_start(skb,
+							  MDBA_MDB_ENTRY_INFO);
+				if (!nest_ent) {
+					nla_nest_cancel(skb, nest2);
+					err = -EMSGSIZE;
+					goto out;
+				}
+				if (nla_put_nohdr(skb, sizeof(e), &e) ||
+				    nla_put_u32(skb,
+						MDBA_MDB_EATTR_TIMER,
+						br_timer_value(&p->timer))) {
+					nla_nest_cancel(skb, nest_ent);
 					nla_nest_cancel(skb, nest2);
 					err = -EMSGSIZE;
 					goto out;
 				}
+				nla_nest_end(skb, nest_ent);
 			}
 			nla_nest_end(skb, nest2);
 		skip:
-- 
cgit v1.2.3


From e52bc7c28ac9f54db6f86b19ed65c599def18c98 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Fri, 19 Feb 2016 09:23:59 -0500
Subject: lib/bitmap.c: conversion routines to/from u32 array

Aimed at transferring bitmaps to/from user-space in a 32/64-bit agnostic
way.

Tested:
  unit tests (next patch) on qemu i386, x86_64, ppc, ppc64 BE and LE,
  ARM.

Signed-off-by: David Decotigny <decot@googlers.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitmap.h | 10 ++++++
 lib/bitmap.c           | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 9653fdb76a42..e9b0b9ab07e5 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -59,6 +59,8 @@
  * bitmap_find_free_region(bitmap, bits, order)	Find and allocate bit region
  * bitmap_release_region(bitmap, pos, order)	Free specified bit region
  * bitmap_allocate_region(bitmap, pos, order)	Allocate specified bit region
+ * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words)
+ * bitmap_to_u32array(buf, nwords, src, nbits)	*buf = *src (nwords 32b words)
  */
 
 /*
@@ -163,6 +165,14 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
+extern unsigned int bitmap_from_u32array(unsigned long *bitmap,
+					 unsigned int nbits,
+					 const u32 *buf,
+					 unsigned int nwords);
+extern unsigned int bitmap_to_u32array(u32 *buf,
+				       unsigned int nwords,
+				       const unsigned long *bitmap,
+				       unsigned int nbits);
 #ifdef __BIG_ENDIAN
 extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
 #else
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 814814397cce..c66da508cbf7 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -12,6 +12,8 @@
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1059,6 +1061,93 @@ int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
 }
 EXPORT_SYMBOL(bitmap_allocate_region);
 
+/**
+ * bitmap_from_u32array - copy the contents of a u32 array of bits to bitmap
+ *	@bitmap: array of unsigned longs, the destination bitmap, non NULL
+ *	@nbits: number of bits in @bitmap
+ *	@buf: array of u32 (in host byte order), the source bitmap, non NULL
+ *	@nwords: number of u32 words in @buf
+ *
+ * copy min(nbits, 32*nwords) bits from @buf to @bitmap, remaining
+ * bits between 32*nwords and nbits in @bitmap (if any) are cleared. In
+ * last word of @bitmap, the bits beyond nbits (if any) are kept
+ * unchanged.
+ *
+ * Return the number of bits effectively copied.
+ */
+unsigned int
+bitmap_from_u32array(unsigned long *bitmap, unsigned int nbits,
+		     const u32 *buf, unsigned int nwords)
+{
+	unsigned int dst_idx, src_idx;
+
+	for (src_idx = dst_idx = 0; dst_idx < BITS_TO_LONGS(nbits); ++dst_idx) {
+		unsigned long part = 0;
+
+		if (src_idx < nwords)
+			part = buf[src_idx++];
+
+#if BITS_PER_LONG == 64
+		if (src_idx < nwords)
+			part |= ((unsigned long) buf[src_idx++]) << 32;
+#endif
+
+		if (dst_idx < nbits/BITS_PER_LONG)
+			bitmap[dst_idx] = part;
+		else {
+			unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
+
+			bitmap[dst_idx] = (bitmap[dst_idx] & ~mask)
+				| (part & mask);
+		}
+	}
+
+	return min_t(unsigned int, nbits, 32*nwords);
+}
+EXPORT_SYMBOL(bitmap_from_u32array);
+
+/**
+ * bitmap_to_u32array - copy the contents of bitmap to a u32 array of bits
+ *	@buf: array of u32 (in host byte order), the dest bitmap, non NULL
+ *	@nwords: number of u32 words in @buf
+ *	@bitmap: array of unsigned longs, the source bitmap, non NULL
+ *	@nbits: number of bits in @bitmap
+ *
+ * copy min(nbits, 32*nwords) bits from @bitmap to @buf. Remaining
+ * bits after nbits in @buf (if any) are cleared.
+ *
+ * Return the number of bits effectively copied.
+ */
+unsigned int
+bitmap_to_u32array(u32 *buf, unsigned int nwords,
+		   const unsigned long *bitmap, unsigned int nbits)
+{
+	unsigned int dst_idx = 0, src_idx = 0;
+
+	while (dst_idx < nwords) {
+		unsigned long part = 0;
+
+		if (src_idx < BITS_TO_LONGS(nbits)) {
+			part = bitmap[src_idx];
+			if (src_idx >= nbits/BITS_PER_LONG)
+				part &= BITMAP_LAST_WORD_MASK(nbits);
+			src_idx++;
+		}
+
+		buf[dst_idx++] = part & 0xffffffffUL;
+
+#if BITS_PER_LONG == 64
+		if (dst_idx < nwords) {
+			part >>= 32;
+			buf[dst_idx++] = part & 0xffffffffUL;
+		}
+#endif
+	}
+
+	return min_t(unsigned int, nbits, 32*nwords);
+}
+EXPORT_SYMBOL(bitmap_to_u32array);
+
 /**
  * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
  * @dst:   destination buffer
-- 
cgit v1.2.3


From ac2c7ad0e5d6030452c9af2fafd192e17fd04264 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:01 -0500
Subject: net/ethtool: introduce a new ioctl for per queue setting

Introduce a new ioctl ETHTOOL_PERQUEUE for per queue parameters setting.
The following patches will enable some SUB_COMMANDs for per queue
setting.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 17 +++++++++++++++++
 net/core/ethtool.c           | 27 +++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190aea0faaf4..f15ae02621a1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1202,6 +1202,21 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_F_WISH          (1 << ETHTOOL_F_WISH__BIT)
 #define ETHTOOL_F_COMPAT        (1 << ETHTOOL_F_COMPAT__BIT)
 
+#define MAX_NUM_QUEUE		4096
+
+/**
+ * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub command which applies to each queue
+ * @queue_mask: Bitmap of the queues which the sub command applies to
+ * @data: A complete command structure following for each of the queues addressed
+ */
+struct ethtool_per_queue_op {
+	__u32	cmd;
+	__u32	sub_command;
+	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	char	data[];
+};
 
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
@@ -1285,6 +1300,8 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_STUNABLE	0x00000049 /* Set tunable configuration */
 #define ETHTOOL_GPHYSTATS	0x0000004a /* get PHY-specific statistics */
 
+#define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c2d3118b1395..d640ecf71e74 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,13 +1888,27 @@ out:
 	return ret;
 }
 
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_per_queue_op per_queue_opt;
+
+	if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+		return -EFAULT;
+
+	switch (per_queue_opt.sub_command) {
+
+	default:
+		return -EOPNOTSUPP;
+	};
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
 {
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
 	void __user *useraddr = ifr->ifr_data;
-	u32 ethcmd;
+	u32 ethcmd, sub_cmd;
 	int rc;
 	netdev_features_t old_features;
 
@@ -1904,8 +1918,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (ethcmd == ETHTOOL_PERQUEUE) {
+		if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+			return -EFAULT;
+	} else {
+		sub_cmd = ethcmd;
+	}
 	/* Allow some commands to be done by anyone */
-	switch (ethcmd) {
+	switch (sub_cmd) {
 	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
@@ -2135,6 +2155,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GPHYSTATS:
 		rc = ethtool_get_phy_stats(dev, useraddr);
 		break;
+	case ETHTOOL_PERQUEUE:
+		rc = ethtool_set_per_queue(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 421797b1aa363cb897f29f7d365e068dc9d9db81 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:02 -0500
Subject: net/ethtool: support get coalesce per queue

This patch implements sub command ETHTOOL_GCOALESCE for ioctl
ETHTOOL_PERQUEUE. It introduces an interface get_per_queue_coalesce to
get coalesce of each masked queue from device driver. Then the interrupt
coalescing parameters will be copied back to user space one by one.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  8 +++++++-
 net/core/ethtool.c      | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 653dc9c4ebac..de56600023a7 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -201,6 +201,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  * @get_module_eeprom: Get the eeprom information from the plug-in module
  * @get_eee: Get Energy-Efficient (EEE) supported and status.
  * @set_eee: Set EEE status (enable/disable) as well as LPI timers.
+ * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue.
+ *	It must check that the given queue number is valid. If neither a RX nor
+ *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
+ *	queue has this number, set the inapplicable fields to ~0 and return 0.
+ *	Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -279,7 +284,8 @@ struct ethtool_ops {
 			       const struct ethtool_tunable *, void *);
 	int	(*set_tunable)(struct net_device *,
 			       const struct ethtool_tunable *, const void *);
-
+	int	(*get_per_queue_coalesce)(struct net_device *, u32,
+					  struct ethtool_coalesce *);
 
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d640ecf71e74..2a6c3a26f63f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,6 +1888,38 @@ out:
 	return ret;
 }
 
+static int ethtool_get_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	u32 bit;
+	int ret;
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+	if (!dev->ethtool_ops->get_per_queue_coalesce)
+		return -EOPNOTSUPP;
+
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask,
+			     MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+
+	for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+		if (ret != 0)
+			return ret;
+		if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+			return -EFAULT;
+		useraddr += sizeof(coalesce);
+	}
+
+	return 0;
+}
+
 static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_per_queue_op per_queue_opt;
@@ -1896,7 +1928,8 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 		return -EFAULT;
 
 	switch (per_queue_opt.sub_command) {
-
+	case ETHTOOL_GCOALESCE:
+		return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
 	default:
 		return -EOPNOTSUPP;
 	};
-- 
cgit v1.2.3


From f38d138a7da6510a1184e3bc5f425deb187c3265 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:03 -0500
Subject: net/ethtool: support set coalesce per queue

This patch implements sub command ETHTOOL_SCOALESCE for ioctl
ETHTOOL_PERQUEUE. It introduces an interface set_per_queue_coalesce to
set coalesce of each masked queue to device driver. The wanted coalesce
information is stored in "data" for each masked queue, and is copied
from userspace.
If it fails to set coalesce to device driver, the values already
set on specific queues are rolled back on a best-effort basis.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  7 ++++++
 net/core/ethtool.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index de56600023a7..472d7d7b01c2 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -206,6 +206,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, set the inapplicable fields to ~0 and return 0.
  *	Returns a negative error code or zero.
+ * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue.
+ *	It must check that the given queue number is valid. If neither a RX nor
+ *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
+ *	queue has this number, ignore the inapplicable fields.
+ *	Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -286,6 +291,8 @@ struct ethtool_ops {
 			       const struct ethtool_tunable *, const void *);
 	int	(*get_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
+	int	(*set_per_queue_coalesce)(struct net_device *, u32,
+					  struct ethtool_coalesce *);
 
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2a6c3a26f63f..2406101002b1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1920,6 +1920,65 @@ static int ethtool_get_per_queue_coalesce(struct net_device *dev,
 	return 0;
 }
 
+static int ethtool_set_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	u32 bit;
+	int i, ret = 0;
+	int n_queue;
+	struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+	if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+	    (!dev->ethtool_ops->get_per_queue_coalesce))
+		return -EOPNOTSUPP;
+
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask,
+			     MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+	n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+	tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+	if (!backup)
+		return -ENOMEM;
+
+	for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce coalesce;
+
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+		if (ret != 0)
+			goto roll_back;
+
+		tmp++;
+
+		if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+			ret = -EFAULT;
+			goto roll_back;
+		}
+
+		ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+		if (ret != 0)
+			goto roll_back;
+
+		useraddr += sizeof(coalesce);
+	}
+
+roll_back:
+	if (ret != 0) {
+		tmp = backup;
+		for_each_set_bit(i, queue_mask, bit) {
+			dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+			tmp++;
+		}
+	}
+	kfree(backup);
+
+	return ret;
+}
+
 static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_per_queue_op per_queue_opt;
@@ -1930,6 +1989,8 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 	switch (per_queue_opt.sub_command) {
 	case ETHTOOL_GCOALESCE:
 		return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+	case ETHTOOL_SCOALESCE:
+		return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
 	default:
 		return -EOPNOTSUPP;
 	};
-- 
cgit v1.2.3


From 568b329a02f75ed3aaae5eb2cca384cb9e09cb29 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 17 Feb 2016 19:58:57 -0800
Subject: perf: generalize perf_callchain

. avoid walking the stack when there is no room left in the buffer
. generalize get_perf_callchain() to be called from bpf helper

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/include/asm/stacktrace.h |  2 +-
 arch/x86/kernel/cpu/perf_event.c  |  4 ++--
 arch/x86/kernel/dumpstack.c       |  6 ++++--
 arch/x86/kernel/stacktrace.c      | 18 +++++++++++-------
 arch/x86/oprofile/backtrace.c     |  3 ++-
 include/linux/perf_event.h        | 13 +++++++++++--
 kernel/events/callchain.c         | 32 ++++++++++++++++++++------------
 kernel/events/internal.h          |  2 --
 8 files changed, 51 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 70bbe39043a9..7c247e7404be 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo,
 /* Generic stack tracer with callbacks */
 
 struct stacktrace_ops {
-	void (*address)(void *data, unsigned long address, int reliable);
+	int (*address)(void *data, unsigned long address, int reliable);
 	/* On negative return stop dumping */
 	int (*stack)(void *data, char *name);
 	walk_stack_t	walk_stack;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1b443db2db50..d276b31ca473 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2180,11 +2180,11 @@ static int backtrace_stack(void *data, char *name)
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	perf_callchain_store(entry, addr);
+	return perf_callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9c30acfadae2..0d1ff4b407d4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo,
 		if (!__kernel_text_address(addr))
 			break;
 
-		ops->address(data, addr, 1);
+		if (ops->address(data, addr, 1))
+			break;
 		frame = frame->next_frame;
 		ret_addr = &frame->return_address;
 		print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
@@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name)
 /*
  * Print one address/symbol entries per line.
  */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+static int print_trace_address(void *data, unsigned long addr, int reliable)
 {
 	touch_nmi_watchdog();
 	printk_stack_address(addr, reliable, data);
+	return 0;
 }
 
 static const struct stacktrace_ops print_trace_ops = {
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index fdd0c6430e5a..9ee98eefc44d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name)
 	return 0;
 }
 
-static void
+static int
 __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
 {
 	struct stack_trace *trace = data;
 #ifdef CONFIG_FRAME_POINTER
 	if (!reliable)
-		return;
+		return 0;
 #endif
 	if (nosched && in_sched_functions(addr))
-		return;
+		return 0;
 	if (trace->skip > 0) {
 		trace->skip--;
-		return;
+		return 0;
 	}
-	if (trace->nr_entries < trace->max_entries)
+	if (trace->nr_entries < trace->max_entries) {
 		trace->entries[trace->nr_entries++] = addr;
+		return 0;
+	} else {
+		return -1; /* no more room, stop walking the stack */
+	}
 }
 
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static int save_stack_address(void *data, unsigned long addr, int reliable)
 {
 	return __save_stack_address(data, addr, reliable, false);
 }
 
-static void
+static int
 save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
 	return __save_stack_address(data, addr, reliable, true);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 4e664bdb535a..cb31a4440e58 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name)
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	unsigned int *depth = data;
 
 	if ((*depth)--)
 		oprofile_add_trace(addr);
+	return 0;
 }
 
 static struct stacktrace_ops backtrace_ops = {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b35a61a481fa..7da3c25999df 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -964,11 +964,20 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
 extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+		   bool crosstask, bool add_mark);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
 
-static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
+	if (entry->nr < PERF_MAX_STACK_DEPTH) {
 		entry->ip[entry->nr++] = ip;
+		return 0;
+	} else {
+		return -1; /* no more room, stop walking the stack */
+	}
 }
 
 extern int sysctl_perf_event_paranoid;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 9c418002b8c1..343c22f5e867 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -159,15 +159,24 @@ put_callchain_entry(int rctx)
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
-	int rctx;
-	struct perf_callchain_entry *entry;
-
-	int kernel = !event->attr.exclude_callchain_kernel;
-	int user   = !event->attr.exclude_callchain_user;
+	bool kernel = !event->attr.exclude_callchain_kernel;
+	bool user   = !event->attr.exclude_callchain_user;
+	/* Disallow cross-task user callchains. */
+	bool crosstask = event->ctx->task && event->ctx->task != current;
 
 	if (!kernel && !user)
 		return NULL;
 
+	return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+}
+
+struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+		   bool crosstask, bool add_mark)
+{
+	struct perf_callchain_entry *entry;
+	int rctx;
+
 	entry = get_callchain_entry(&rctx);
 	if (rctx == -1)
 		return NULL;
@@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!entry)
 		goto exit_put;
 
-	entry->nr = 0;
+	entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
-		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+		if (add_mark)
+			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 		perf_callchain_kernel(entry, regs);
 	}
 
@@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 		}
 
 		if (regs) {
-			/*
-			 * Disallow cross-task user callchains.
-			 */
-			if (event->ctx->task && event->ctx->task != current)
+			if (crosstask)
 				goto exit_put;
 
-			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			if (add_mark)
+				perf_callchain_store(entry, PERF_CONTEXT_USER);
 			perf_callchain_user(entry, regs);
 		}
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..4199b6d193f5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 /* Callchain handling */
 extern struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs);
-extern int get_callchain_buffers(void);
-extern void put_callchain_buffers(void);
 
 static inline int get_recursion_context(int *recursion)
 {
-- 
cgit v1.2.3


From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 17 Feb 2016 19:58:58 -0800
Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE

add new map type to store stack traces and corresponding helper
bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
@ctx: struct pt_regs*
@map: pointer to stack_trace map
@flags: bits 0-7 - number of stack frames to skip
        bit 8 - collect user stack instead of kernel
        bit 9 - compare stacks by hash only
        bit 10 - if two different stacks hash into the same stackid
                 discard old
        other bits - reserved
Return: >= 0 stackid on success or negative error

stackid is a 32-bit integer handle that can be further combined with
other data (including other stackid) and used as a key into maps.

Userspace will access stackmap using standard lookup/delete syscall commands to
retrieve full stack trace for given stackid.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |  21 +++++
 kernel/bpf/Makefile      |   3 +
 kernel/bpf/stackmap.c    | 237 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    |   6 +-
 kernel/trace/bpf_trace.c |   2 +
 6 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/stackmap.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 90ee6ab24bc5..0cadbb7456c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ee0fde1bf96..d3e77da8e9e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -83,6 +83,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
 };
 
 enum bpf_prog_type {
@@ -272,6 +273,20 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_perf_event_output,
 	BPF_FUNC_skb_load_bytes,
+
+	/**
+	 * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
+	 * @ctx: struct pt_regs*
+	 * @map: pointer to stack_trace map
+	 * @flags: bits 0-7 - number of stack frames to skip
+	 *         bit 8 - collect user stack instead of kernel
+	 *         bit 9 - compare stacks by hash only
+	 *         bit 10 - if two different stacks hash into the same stackid
+	 *                  discard old
+	 *         other bits - reserved
+	 * Return: >= 0 stackid on success or negative error
+	 */
+	BPF_FUNC_get_stackid,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -294,6 +309,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
+/* BPF_FUNC_get_stackid flags. */
+#define BPF_F_SKIP_FIELD_MASK		0xffULL
+#define BPF_F_USER_STACK		(1ULL << 8)
+#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
+#define BPF_F_REUSE_STACKID		(1ULL << 10)
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 13272582eee0..8a932d079c24 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,3 +2,6 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..8a60ee14a977
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+
+struct stack_map_bucket {
+	struct rcu_head rcu;
+	u32 hash;
+	u32 nr;
+	u64 ip[];
+};
+
+struct bpf_stack_map {
+	struct bpf_map map;
+	u32 n_buckets;
+	struct stack_map_bucket __rcu *buckets[];
+};
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+	u32 value_size = attr->value_size;
+	struct bpf_stack_map *smap;
+	u64 cost, n_buckets;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    value_size < 8 || value_size % 8 ||
+	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+		return ERR_PTR(-EINVAL);
+
+	/* hash table size must be power of 2 */
+	n_buckets = roundup_pow_of_two(attr->max_entries);
+
+	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
+	if (!smap) {
+		smap = vzalloc(cost);
+		if (!smap)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	err = -E2BIG;
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_smap;
+
+	smap->map.map_type = attr->map_type;
+	smap->map.key_size = attr->key_size;
+	smap->map.value_size = value_size;
+	smap->map.max_entries = attr->max_entries;
+	smap->n_buckets = n_buckets;
+	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = get_callchain_buffers();
+	if (err)
+		goto free_smap;
+
+	return &smap->map;
+
+free_smap:
+	kvfree(smap);
+	return ERR_PTR(err);
+}
+
+static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct pt_regs *regs = (struct pt_regs *) (long) r1;
+	struct bpf_map *map = (struct bpf_map *) (long) r2;
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct perf_callchain_entry *trace;
+	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+	u32 max_depth = map->value_size / 8;
+	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
+	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	u32 hash, id, trace_nr, trace_len;
+	bool user = flags & BPF_F_USER_STACK;
+	bool kernel = !user;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	/* get_perf_callchain() guarantees that trace->nr >= init_nr
+	 * and trace->nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 */
+	trace_nr = trace->nr - init_nr;
+
+	if (trace_nr <= skip)
+		/* skipping more than usable stack trace */
+		return -EFAULT;
+
+	trace_nr -= skip;
+	trace_len = trace_nr * sizeof(u64);
+	ips = trace->ip + skip + init_nr;
+	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+	id = hash & (smap->n_buckets - 1);
+	bucket = rcu_dereference(smap->buckets[id]);
+
+	if (bucket && bucket->hash == hash) {
+		if (flags & BPF_F_FAST_STACK_CMP)
+			return id;
+		if (bucket->nr == trace_nr &&
+		    memcmp(bucket->ip, ips, trace_len) == 0)
+			return id;
+	}
+
+	/* this call stack is not in the map, try to add it */
+	if (bucket && !(flags & BPF_F_REUSE_STACKID))
+		return -EEXIST;
+
+	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
+			     GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!new_bucket))
+		return -ENOMEM;
+
+	memcpy(new_bucket->ip, ips, trace_len);
+	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
+	new_bucket->hash = hash;
+	new_bucket->nr = trace_nr;
+
+	old_bucket = xchg(&smap->buckets[id], new_bucket);
+	if (old_bucket)
+		kfree_rcu(old_bucket, rcu);
+	return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+	.func		= bpf_get_stackid,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+/* Called from syscall or from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return NULL;
+	bucket = rcu_dereference(smap->buckets[id]);
+	return bucket ? bucket->ip : NULL;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *old_bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return -E2BIG;
+
+	old_bucket = xchg(&smap->buckets[id], NULL);
+	if (old_bucket) {
+		kfree_rcu(old_bucket, rcu);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	int i;
+
+	synchronize_rcu();
+
+	for (i = 0; i < smap->n_buckets; i++)
+		if (smap->buckets[i])
+			kfree_rcu(smap->buckets[i], rcu);
+	kvfree(smap);
+	put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+	.map_alloc = stack_map_alloc,
+	.map_free = stack_map_free,
+	.map_get_next_key = stack_map_get_next_key,
+	.map_lookup_elem = stack_map_lookup_elem,
+	.map_update_elem = stack_map_update_elem,
+	.map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+	.ops = &stack_map_ops,
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+	bpf_register_map_type(&stack_map_type);
+	return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1d3e8f57de9..42ba4ccc020b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -246,6 +246,7 @@ static const struct {
 	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
+	{BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		 * don't allow any other map type to be passed into
 		 * the special func;
 		 */
-		if (bool_func && bool_map != bool_func)
+		if (bool_func && bool_map != bool_func) {
+			verbose("cannot pass map_type %d into func %d\n",
+				map->map_type, func_id);
 			return -EINVAL;
+		}
 	}
 
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 326a75e884db..4b8caa392b86 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 745041e2aaf1d668f293aaab4b0f6ad7daa056a5 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Fri, 19 Feb 2016 09:43:16 +0000
Subject: lwtunnel: autoload of lwt modules

The lwt implementations using net devices can autoload using the
existing mechanism using IFLA_INFO_KIND. However, there's no mechanism
that lwt modules not using net devices can use.

Therefore, add the ability to autoload modules registering lwt
operations for lwt implementations not using a net device so that
users don't have to manually load the modules.

Only users with the CAP_NET_ADMIN capability can cause modules to be
loaded, which is ensured by rtnetlink_rcv_msg rejecting non-RTM_GETxxx
messages for users without this capability, and by
lwtunnel_build_state not being called in response to RTM_GETxxx
messages.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h |  4 +++-
 net/core/lwtunnel.c    | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 66350ce3e955..e9f116e29c22 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -170,6 +170,8 @@ static inline int lwtunnel_input(struct sk_buff *skb)
 	return -EOPNOTSUPP;
 }
 
-#endif
+#endif /* CONFIG_LWTUNNEL */
+
+#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))
 
 #endif /* __NET_LWTUNNEL_H */
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 299cfc24d888..669ecc9f884e 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -27,6 +27,31 @@
 #include <net/rtnetlink.h>
 #include <net/ip6_fib.h>
 
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+	/* Only lwt encaps implemented without using an interface for
+	 * the encap need to return a string here.
+	 */
+	switch (encap_type) {
+	case LWTUNNEL_ENCAP_MPLS:
+		return "MPLS";
+	case LWTUNNEL_ENCAP_ILA:
+		return "ILA";
+	case LWTUNNEL_ENCAP_IP6:
+	case LWTUNNEL_ENCAP_IP:
+	case LWTUNNEL_ENCAP_NONE:
+	case __LWTUNNEL_ENCAP_MAX:
+		/* should not have got here */
+		WARN_ON(1);
+		break;
+	}
+	return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
 struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
 {
 	struct lwtunnel_state *lws;
@@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	ret = -EOPNOTSUPP;
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+	if (!ops) {
+		const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+		if (encap_type_str) {
+			rcu_read_unlock();
+			request_module("rtnl-lwt-%s", encap_type_str);
+			rcu_read_lock();
+			ops = rcu_dereference(lwtun_encaps[encap_type]);
+		}
+	}
+#endif
 	if (likely(ops && ops->build_state))
 		ret = ops->build_state(dev, encap, family, cfg, lws);
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 6ceb31ca5f65acff299dbc3da5854d54e147b7d8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 19 Feb 2016 11:26:31 -0800
Subject: VXLAN: Support outer IPv4 Tx checksums by default

This change makes it so that if UDP CSUM is not specified we will default
to enabling it.  The main motivation behind this is the fact that with the
use of outer checksum we can greatly improve the performance for VXLAN
tunnels on devices that don't know how to parse tunnel headers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 19 +++++++++----------
 include/net/vxlan.h |  2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c963897e713d..2ddc642fb64f 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1957,13 +1957,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto drop;
 		sk = vxlan->vn4_sock->sock->sk;
 
-		if (info) {
-			if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
-				df = htons(IP_DF);
-		} else {
-			udp_sum = !!(flags & VXLAN_F_UDP_CSUM);
-		}
-
 		rt = vxlan_get_route(vxlan, skb,
 				     rdst ? rdst->remote_ifindex : 0, tos,
 				     dst->sin.sin_addr.s_addr, &saddr,
@@ -1997,6 +1990,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 
+		if (!info)
+			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
+		else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+			df = htons(IP_DF);
+
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 		err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
@@ -2898,8 +2896,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_PORT])
 		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
 
-	if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
-		conf.flags |= VXLAN_F_UDP_CSUM;
+	if (data[IFLA_VXLAN_UDP_CSUM] &&
+	    !nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
 
 	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
 	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
@@ -3043,7 +3042,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
 	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
 	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
-			!!(vxlan->flags & VXLAN_F_UDP_CSUM)) ||
+			!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
 			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 748083de367a..6eda4ed4d78b 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -197,7 +197,7 @@ struct vxlan_dev {
 #define VXLAN_F_L2MISS			0x08
 #define VXLAN_F_L3MISS			0x10
 #define VXLAN_F_IPV6			0x20
-#define VXLAN_F_UDP_CSUM		0x40
+#define VXLAN_F_UDP_ZERO_CSUM_TX	0x40
 #define VXLAN_F_UDP_ZERO_CSUM6_TX	0x80
 #define VXLAN_F_UDP_ZERO_CSUM6_RX	0x100
 #define VXLAN_F_REMCSUM_TX		0x200
-- 
cgit v1.2.3


From 8e2fe1d9f1a20924f98ea46931a1d7fb092aa876 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:22 +0100
Subject: bpf: add new arg_type that allows for 0 sized stack buffer

Currently, when we pass a buffer from the eBPF stack into a helper
function, the function proto indicates argument types as ARG_PTR_TO_STACK
and ARG_CONST_STACK_SIZE pair. If R<X> contains the former, then R<X+1>
must be of the latter type. Then, verifier checks whether the buffer
points into eBPF stack, is initialized, etc. The verifier also guarantees
that the constant value passed in R<X+1> is greater than 0, so helper
functions don't need to test for it and can always assume a non-NULL
initialized buffer as well as non-0 buffer size.

This patch adds a new argument type, ARG_CONST_STACK_SIZE_OR_ZERO, that
also allows passing NULL as R<X> and 0 as R<X+1> into the helper function.
Such helper functions, of course, need to be able to handle these cases
internally then. Verifier guarantees that either R<X> == NULL && R<X+1> == 0
or R<X> != NULL && R<X+1> != 0 (like the case of ARG_CONST_STACK_SIZE), any
other combinations are not possible to load.

I went through various options of extending the verifier, and introducing
the type ARG_CONST_STACK_SIZE_OR_ZERO seems to have most minimal changes
needed to the verifier.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c | 42 ++++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0cadbb7456c0..51e498e5470e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -65,6 +65,7 @@ enum bpf_arg_type {
 	 */
 	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
 	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
+	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 42ba4ccc020b..36dc497deaa3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -779,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
  * bytes from that pointer, make sure that it's within stack boundary
  * and all elements of stack are initialized
  */
-static int check_stack_boundary(struct verifier_env *env,
-				int regno, int access_size)
+static int check_stack_boundary(struct verifier_env *env, int regno,
+				int access_size, bool zero_size_allowed)
 {
 	struct verifier_state *state = &env->cur_state;
 	struct reg_state *regs = state->regs;
 	int off, i;
 
-	if (regs[regno].type != PTR_TO_STACK)
+	if (regs[regno].type != PTR_TO_STACK) {
+		if (zero_size_allowed && access_size == 0 &&
+		    regs[regno].type == CONST_IMM &&
+		    regs[regno].imm  == 0)
+			return 0;
+
+		verbose("R%d type=%s expected=%s\n", regno,
+			reg_type_str[regs[regno].type],
+			reg_type_str[PTR_TO_STACK]);
 		return -EACCES;
+	}
 
 	off = regs[regno].imm;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -830,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-	} else if (arg_type == ARG_CONST_STACK_SIZE) {
+	} else if (arg_type == ARG_CONST_STACK_SIZE ||
+		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		expected_type = CONST_IMM;
 	} else if (arg_type == ARG_CONST_MAP_PTR) {
 		expected_type = CONST_PTR_TO_MAP;
 	} else if (arg_type == ARG_PTR_TO_CTX) {
 		expected_type = PTR_TO_CTX;
+	} else if (arg_type == ARG_PTR_TO_STACK) {
+		expected_type = PTR_TO_STACK;
+		/* One exception here. In case function allows for NULL to be
+		 * passed in as argument, it's a CONST_IMM type. Final test
+		 * happens during stack boundary checking.
+		 */
+		if (reg->type == CONST_IMM && reg->imm == 0)
+			expected_type = CONST_IMM;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -868,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->key_size);
-
+		err = check_stack_boundary(env, regno, (*mapp)->key_size,
+					   false);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -879,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->value_size);
+		err = check_stack_boundary(env, regno, (*mapp)->value_size,
+					   false);
+	} else if (arg_type == ARG_CONST_STACK_SIZE ||
+		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
 
-	} else if (arg_type == ARG_CONST_STACK_SIZE) {
 		/* bpf_xxx(..., buf, len) call will access 'len' bytes
 		 * from stack pointer 'buf'. Check it
 		 * note: regno == len, regno - 1 == buf
@@ -891,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno - 1, reg->imm);
+		err = check_stack_boundary(env, regno - 1, reg->imm,
+					   zero_size_allowed);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:23 +0100
Subject: bpf: add generic bpf_csum_diff helper

For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's
currently limited to handle 2 and 4 byte changes in a header and feeds the
from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When
working with IPv6, for example, this makes it rather cumbersome to deal
with, similarly when editing larger parts of a header.

Instead, extend the API in a more generic way: For bpf_l4_csum_replace(),
add a case for header field mask of 0 to change the checksum at a given
offset through inet_proto_csum_replace_by_diff(), and provide a helper
bpf_csum_diff() that can generically calculate a from/to diff for arbitrary
amounts of data.

This can be used in multiple ways: for the bpf_l4_csum_replace() only
part, this even provides us with the option to insert precalculated diffs
from user space f.e. from a map, or from bpf_csum_diff() during runtime.

bpf_csum_diff() has an optional from/to stack buffer input, so we can
calculate a diff by using a scratchbuffer for scenarios where we're
inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers
don't need to be of equal size) data. Also, bpf_csum_diff() allows
feeding a previous csum into csum_partial(), so the function can also be
cascaded.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 ++++++++++
 net/core/filter.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d3e77da8e9e8..48d0a6c54609 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@ enum bpf_func_id {
 	 * Return: >= 0 stackid on success or negative error
 	 */
 	BPF_FUNC_get_stackid,
+
+	/**
+	 * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff
+	 * @from: raw from buffer
+	 * @from_size: length of from buffer
+	 * @to: raw to buffer
+	 * @to_size: length of to buffer
+	 * @seed: optional seed
+	 * Return: csum result
+	 */
+	BPF_FUNC_csum_diff,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 2a6e9562f1ab..bf504f8fbe15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1491,6 +1491,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+		break;
 	case 2:
 		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
 		break;
@@ -1519,6 +1525,51 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_csum_scratchpad {
+	__be32 diff[128];
+};
+
+static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
+
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+	struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+	u64 diff_size = from_size + to_size;
+	__be32 *from = (__be32 *) (long) r1;
+	__be32 *to   = (__be32 *) (long) r3;
+	int i, j = 0;
+
+	/* This is quite flexible, some examples:
+	 *
+	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
+	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
+	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
+	 *
+	 * Even for diffing, from_size and to_size don't need to be equal.
+	 */
+	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+		     diff_size > sizeof(sp->diff)))
+		return -EINVAL;
+
+	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = ~from[i];
+	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = to[i];
+
+	return csum_partial(sp->diff, diff_size, seed);
+}
+
+const struct bpf_func_proto bpf_csum_diff_proto = {
+	.func		= bpf_csum_diff,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1849,6 +1900,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
-- 
cgit v1.2.3


From 3697649ff29e0f647565eed04b27a7779c646a22 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:25 +0100
Subject: bpf: try harder on clones when writing into skb

When we're dealing with clones and the area is not writeable, try
harder and get a copy via pskb_expand_head(). Replace also other
occurrences in tc actions with the new skb_try_make_writable().

Reported-by: Ashhad Sheikh <ashhadsheikh394@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  7 +++++++
 net/core/filter.c      | 19 ++++++++++---------
 net/sched/act_csum.c   |  8 ++------
 net/sched/act_nat.c    | 18 +++++-------------
 4 files changed, 24 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89b536796e53..6a57757a86cf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2630,6 +2630,13 @@ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len
 	       skb_headroom(skb) + len <= skb->hdr_len;
 }
 
+static inline int skb_try_make_writable(struct sk_buff *skb,
+					unsigned int write_len)
+{
+	return skb_cloned(skb) && !skb_clone_writable(skb, write_len) &&
+	       pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
 static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
 			    int cloned)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index ea391e6be7fa..f031b82128f3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1364,9 +1364,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 */
 	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + len)))
+	if (unlikely(skb_try_make_writable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, sp->buff);
@@ -1439,9 +1437,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + sizeof(sum))))
+	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1488,9 +1484,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + sizeof(sum))))
+	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1734,6 +1728,13 @@ bool bpf_helper_changes_skb_data(void *func)
 		return true;
 	if (func == bpf_skb_vlan_pop)
 		return true;
+	if (func == bpf_skb_store_bytes)
+		return true;
+	if (func == bpf_l3_csum_replace)
+		return true;
+	if (func == bpf_l4_csum_replace)
+		return true;
+
 	return false;
 }
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b07c535ba8e7..eeb3eb3ea9eb 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -105,9 +105,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
 	int hl = ihl + jhl;
 
 	if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
-	    (skb_cloned(skb) &&
-	     !skb_clone_writable(skb, hl + ntkoff) &&
-	     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+	    skb_try_make_writable(skb, hl + ntkoff))
 		return NULL;
 	else
 		return (void *)(skb_network_header(skb) + ihl);
@@ -365,9 +363,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 	}
 
 	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff))
 			goto fail;
 
 		ip_send_check(ip_hdr(skb));
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index b7c4ead8b5a8..27607b863aba 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -126,9 +126,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		addr = iph->daddr;
 
 	if (!((old_addr ^ addr) & mask)) {
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, sizeof(*iph) + noff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, sizeof(*iph) + noff))
 			goto drop;
 
 		new_addr &= mask;
@@ -156,9 +154,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		struct tcphdr *tcph;
 
 		if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
-		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
-		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		    skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff))
 			goto drop;
 
 		tcph = (void *)(skb_network_header(skb) + ihl);
@@ -171,9 +167,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		struct udphdr *udph;
 
 		if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
-		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
-		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		    skb_try_make_writable(skb, ihl + sizeof(*udph) + noff))
 			goto drop;
 
 		udph = (void *)(skb_network_header(skb) + ihl);
@@ -213,10 +207,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		if ((old_addr ^ addr) & mask)
 			break;
 
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, ihl + sizeof(*icmph) +
-					     sizeof(*iph) + noff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, ihl + sizeof(*icmph) +
+					  sizeof(*iph) + noff))
 			goto drop;
 
 		icmph = (void *)(skb_network_header(skb) + ihl);
-- 
cgit v1.2.3


From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:26 +0100
Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp

When using this helper for updating UDP checksums, we need to extend
it in order to write CSUM_MANGLED_0 for csum computations that result
in a sum of 0. The reason we need this is that packets with a checksum
could otherwise become incorrectly marked as packets without a checksum.
Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should
not turn packets without a checksum into ones with a checksum.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 48d0a6c54609..6496f98d3d68 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -313,6 +313,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_l4_csum_replace flags. */
 #define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
 
 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
 #define BPF_F_INGRESS			(1ULL << 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f031b82128f3..8a0b8c3eb189 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1477,10 +1477,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
 	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+			       BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
@@ -1490,6 +1492,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
 	if (unlikely(!ptr))
 		return -EFAULT;
+	if (is_mmzero && !*ptr)
+		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
 	case 0:
@@ -1508,6 +1512,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	}
 
+	if (is_mmzero && !*ptr)
+		*ptr = CSUM_MANGLED_0;
 	if (ptr == &sum)
 		/* skb_store_bits guaranteed to not return -EFAULT here */
 		skb_store_bits(skb, offset, ptr, sizeof(sum));
-- 
cgit v1.2.3


From 944945986f125bdbbeaa78dac0c0eadb963eb34a Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Sun, 21 Feb 2016 11:40:10 +0200
Subject: qed: Introduce DMA_REGPAIR_LE

FW hsi contains regpairs, mostly for 64-bit address representations.
Since the same paradigm is applied each time a regpair is filled, this
patch introduces a new utility macro for setting such regpairs.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  9 +++------
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 12 ++++--------
 drivers/net/ethernet/qlogic/qed/qed_spq.c         |  9 +++------
 include/linux/qed/qed_chain.h                     |  4 ++++
 4 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 8d1bc7e7e996..bba59c51f72c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -557,12 +557,10 @@ qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
 	p_ramrod->complete_event_flg	= 1;
 
 	p_ramrod->bd_max_bytes	= cpu_to_le16(bd_max_bytes);
-	p_ramrod->bd_base.hi	= DMA_HI_LE(bd_chain_phys_addr);
-	p_ramrod->bd_base.lo	= DMA_LO_LE(bd_chain_phys_addr);
+	DMA_REGPAIR_LE(p_ramrod->bd_base, bd_chain_phys_addr);
 
 	p_ramrod->num_of_pbl_pages	= cpu_to_le16(cqe_pbl_size);
-	p_ramrod->cqe_pbl_addr.hi	= DMA_HI_LE(cqe_pbl_addr);
-	p_ramrod->cqe_pbl_addr.lo	= DMA_LO_LE(cqe_pbl_addr);
+	DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, cqe_pbl_addr);
 
 	rc = qed_spq_post(p_hwfn, p_ent, NULL);
 
@@ -721,8 +719,7 @@ qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
 	p_ramrod->stats_counter_id	= stats_id;
 
 	p_ramrod->pbl_size		= cpu_to_le16(pbl_size);
-	p_ramrod->pbl_base_addr.hi	= DMA_HI_LE(pbl_addr);
-	p_ramrod->pbl_base_addr.lo	= DMA_LO_LE(pbl_addr);
+	DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, pbl_addr);
 
 	pq_id			= qed_get_qm_pq(p_hwfn,
 						PROTOCOLID_ETH,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index e271ef95745c..1c06c37d4c3d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -136,16 +136,12 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->outer_tag = p_hwfn->hw_info.ovlan;
 
 	/* Place EQ address in RAMROD */
-	p_ramrod->event_ring_pbl_addr.hi =
-			DMA_HI_LE(p_hwfn->p_eq->chain.pbl.p_phys_table);
-	p_ramrod->event_ring_pbl_addr.lo =
-			DMA_LO_LE(p_hwfn->p_eq->chain.pbl.p_phys_table);
+	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
+		       p_hwfn->p_eq->chain.pbl.p_phys_table);
 	p_ramrod->event_ring_num_pages = (u8)p_hwfn->p_eq->chain.page_cnt;
 
-	p_ramrod->consolid_q_pbl_addr.hi =
-			DMA_HI_LE(p_hwfn->p_consq->chain.pbl.p_phys_table);
-	p_ramrod->consolid_q_pbl_addr.lo =
-			DMA_LO_LE(p_hwfn->p_consq->chain.pbl.p_phys_table);
+	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
+		       p_hwfn->p_consq->chain.pbl.p_phys_table);
 
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index f6c6c21601d7..89469d5aae25 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -183,10 +183,8 @@ static void qed_spq_hw_initialize(struct qed_hwfn *p_hwfn,
 	p_cxt->xstorm_st_context.spq_base_hi =
 		DMA_HI_LE(p_spq->chain.p_phys_addr);
 
-	p_cxt->xstorm_st_context.consolid_base_addr.lo =
-		DMA_LO_LE(p_hwfn->p_consq->chain.p_phys_addr);
-	p_cxt->xstorm_st_context.consolid_base_addr.hi =
-		DMA_HI_LE(p_hwfn->p_consq->chain.p_phys_addr);
+	DMA_REGPAIR_LE(p_cxt->xstorm_st_context.consolid_base_addr,
+		       p_hwfn->p_consq->chain.p_phys_addr);
 }
 
 static int qed_spq_hw_post(struct qed_hwfn *p_hwfn,
@@ -423,8 +421,7 @@ void qed_spq_setup(struct qed_hwfn *p_hwfn)
 	p_virt	= p_spq->p_virt;
 
 	for (i = 0; i < p_spq->chain.capacity; i++) {
-		p_virt->elem.data_ptr.hi = DMA_HI_LE(p_phys);
-		p_virt->elem.data_ptr.lo = DMA_LO_LE(p_phys);
+		DMA_REGPAIR_LE(p_virt->elem.data_ptr, p_phys);
 
 		list_add_tail(&p_virt->list, &p_spq->free_pool);
 
diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 41b9049b57e2..5f8fcaaa6504 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -19,6 +19,10 @@
 /* dma_addr_t manip */
 #define DMA_LO_LE(x)            cpu_to_le32(lower_32_bits(x))
 #define DMA_HI_LE(x)            cpu_to_le32(upper_32_bits(x))
+#define DMA_REGPAIR_LE(x, val)  do { \
+					(x).hi = DMA_HI_LE((val)); \
+					(x).lo = DMA_LO_LE((val)); \
+				} while (0)
 
 #define HILO_GEN(hi, lo, type)  ((((type)(hi)) << 32) + (lo))
 #define HILO_DMA(hi, lo)        HILO_GEN(hi, lo, dma_addr_t)
-- 
cgit v1.2.3


From 6d5d2ee63cee7025badda3b74ae2ef7ab097acfa Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Jan 2016 19:28:58 +0100
Subject: Bluetooth: add LED trigger for indicating HCI is powered up

Add support for LED triggers to the Bluetooth subsystem and add kernel
config symbol BT_LEDS for it.

For now one trigger for indicating "HCI is powered up" is supported.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  3 ++
 net/bluetooth/Kconfig            |  9 +++++
 net/bluetooth/Makefile           |  1 +
 net/bluetooth/hci_core.c         |  8 ++++
 net/bluetooth/leds.c             | 80 ++++++++++++++++++++++++++++++++++++++++
 net/bluetooth/leds.h             | 18 +++++++++
 6 files changed, 119 insertions(+)
 create mode 100644 net/bluetooth/leds.c
 create mode 100644 net/bluetooth/leds.h

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d4f82edb5cff..dc71473462ac 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -25,6 +25,7 @@
 #ifndef __HCI_CORE_H
 #define __HCI_CORE_H
 
+#include <linux/leds.h>
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_sock.h>
 
@@ -396,6 +397,8 @@ struct hci_dev {
 	struct delayed_work	rpa_expired;
 	bdaddr_t		rpa;
 
+	struct led_trigger	*power_led;
+
 	int (*open)(struct hci_dev *hdev);
 	int (*close)(struct hci_dev *hdev);
 	int (*flush)(struct hci_dev *hdev);
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index 95d1a66ba03a..06c31b9a68b0 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -69,6 +69,15 @@ config BT_6LOWPAN
 	help
 	  IPv6 compression over Bluetooth Low Energy.
 
+config BT_LEDS
+	bool "Enable LED triggers"
+	depends on BT
+	depends on LEDS_CLASS
+	select LEDS_TRIGGERS
+	help
+	  This option selects a few LED triggers for different
+	  Bluetooth events.
+
 config BT_SELFTEST
 	bool "Bluetooth self testing support"
 	depends on BT && DEBUG_KERNEL
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 2b15ae8c1def..b3ff12eb9b6d 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -17,6 +17,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
 
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
 bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
+bluetooth-$(CONFIG_BT_LEDS) += leds.o
 bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
 bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 883c821a9e78..88f1ef3589d8 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -40,6 +40,7 @@
 #include "hci_request.h"
 #include "hci_debugfs.h"
 #include "smp.h"
+#include "leds.h"
 
 static void hci_rx_work(struct work_struct *work);
 static void hci_cmd_work(struct work_struct *work);
@@ -1395,6 +1396,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
 		set_bit(HCI_UP, &hdev->flags);
 		hci_sock_dev_event(hdev, HCI_DEV_UP);
+		hci_leds_update_powered(hdev, true);
 		if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
 		    !hci_dev_test_flag(hdev, HCI_CONFIG) &&
 		    !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
@@ -1532,6 +1534,8 @@ int hci_dev_do_close(struct hci_dev *hdev)
 		return 0;
 	}
 
+	hci_leds_update_powered(hdev, false);
+
 	/* Flush RX and TX works */
 	flush_work(&hdev->tx_work);
 	flush_work(&hdev->rx_work);
@@ -3067,6 +3071,8 @@ int hci_register_dev(struct hci_dev *hdev)
 	if (error < 0)
 		goto err_wqueue;
 
+	hci_leds_init(hdev);
+
 	hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
 				    RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops,
 				    hdev);
@@ -3128,6 +3134,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
 
 	id = hdev->id;
 
+	hci_leds_exit(hdev);
+
 	write_lock(&hci_dev_list_lock);
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
new file mode 100644
index 000000000000..ded7c88eaccc
--- /dev/null
+++ b/net/bluetooth/leds.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "leds.h"
+
+struct hci_basic_led_trigger {
+	struct led_trigger	led_trigger;
+	struct hci_dev		*hdev;
+};
+
+#define to_hci_basic_led_trigger(arg) container_of(arg, \
+			struct hci_basic_led_trigger, led_trigger)
+
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
+{
+	if (hdev->power_led)
+		led_trigger_event(hdev->power_led,
+				  enabled ? LED_FULL : LED_OFF);
+}
+
+static void power_activate(struct led_classdev *led_cdev)
+{
+	struct hci_basic_led_trigger *htrig;
+	bool powered;
+
+	htrig = to_hci_basic_led_trigger(led_cdev->trigger);
+	powered = test_bit(HCI_UP, &htrig->hdev->flags);
+
+	led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+}
+
+static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
+			void (*activate)(struct led_classdev *led_cdev),
+			const char *name)
+{
+	struct hci_basic_led_trigger *htrig;
+
+	htrig =	devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL);
+	if (!htrig)
+		return NULL;
+
+	htrig->hdev = hdev;
+	htrig->led_trigger.activate = activate;
+	htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL,
+						 "%s-%s", hdev->name,
+						 name);
+	if (!htrig->led_trigger.name)
+		goto err_alloc;
+
+	if (led_trigger_register(&htrig->led_trigger))
+		goto err_register;
+
+	return &htrig->led_trigger;
+
+err_register:
+	devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name);
+err_alloc:
+	devm_kfree(&hdev->dev, htrig);
+	return NULL;
+}
+
+void hci_leds_init(struct hci_dev *hdev)
+{
+	/* initialize power_led */
+	hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
+}
+
+void hci_leds_exit(struct hci_dev *hdev)
+{
+	if (hdev->power_led)
+		led_trigger_unregister(hdev->power_led);
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
new file mode 100644
index 000000000000..068261a4e12c
--- /dev/null
+++ b/net/bluetooth/leds.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if IS_ENABLED(CONFIG_BT_LEDS)
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
+void hci_leds_init(struct hci_dev *hdev);
+void hci_leds_exit(struct hci_dev *hdev);
+#else
+static inline void hci_leds_update_powered(struct hci_dev *hdev,
+					   bool enabled) {}
+static inline void hci_leds_init(struct hci_dev *hdev) {}
+static inline void hci_leds_exit(struct hci_dev *hdev) {}
+#endif
-- 
cgit v1.2.3


From 07b0188adf7298bf80a9890d3e90f27e973623d3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Fri, 19 Feb 2016 09:59:11 +0100
Subject: mac802154: fix mac header length check

I got a report that the WARN_ON sometimes triggers there, which should
never happen. I came to the conclusion that the mac header is there but
inside the headroom of the skb. The skb->len information doesn't include
the headroom length, so skb->len is less than two.

We now check whether the skb_mac_header pointer is set and whether there
is enough room between the mac header pointer and the tail pointer.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index da574bbdc333..2e3cdd2048d2 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -247,8 +247,9 @@ struct ieee802154_ops {
  */
 static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 {
-	/* return some invalid fc on failure */
-	if (unlikely(skb->len < 2)) {
+	/* check if we can read the fc at skb_mac_header of the sk buffer */
+	if (unlikely(!skb_mac_header_was_set(skb) ||
+		     (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) {
 		WARN_ON(1);
 		return cpu_to_le16(0);
 	}
-- 
cgit v1.2.3


From 5609c185f24dffca5f6a9c127106869da150be03 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 22 Feb 2016 09:13:54 +0100
Subject: 6lowpan: iphc: add support for stateful compression

This patch introduces support for IPHC stateful address compression. It
exposes the context table via one debugfs entry.
This debugfs entry has a directory for each cid entry of the context table.
Inside each cid directory the following files exist:

 - "active": Whether the entry is added or deleted. The context table
   was originally a list implementation; this flag indicates whether
   the context is part of the list or not.
 - "prefix": The ipv6 prefix.
 - "prefix_len": The prefix length for the prefix.
 - "compression": The compression flag according to RFC6775.

This part should be moved into sysfs after some testing time.

Also the debugfs entry contains a "show" file which is a pretty-printout
for the current context table information.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/6lowpan.h |  32 ++++
 net/6lowpan/core.c    |  39 ++++-
 net/6lowpan/debugfs.c | 247 ++++++++++++++++++++++++++++++
 net/6lowpan/iphc.c    | 414 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 674 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index 2f6a3f2233ed..da3a77d25fcb 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -75,6 +75,8 @@
 #define LOWPAN_IPHC_MAX_HC_BUF_LEN	(sizeof(struct ipv6hdr) +	\
 					 LOWPAN_IPHC_MAX_HEADER_LEN +	\
 					 LOWPAN_NHC_MAX_HDR_LEN)
+/* SCI/DCI is 4 bit width, so we have maximum 16 entries */
+#define LOWPAN_IPHC_CTX_TABLE_SIZE	(1 << 4)
 
 #define LOWPAN_DISPATCH_IPV6		0x41 /* 01000001 = 65 */
 #define LOWPAN_DISPATCH_IPHC		0x60 /* 011xxxxx = ... */
@@ -98,9 +100,39 @@ enum lowpan_lltypes {
 	LOWPAN_LLTYPE_IEEE802154,
 };
 
+enum lowpan_iphc_ctx_flags {
+	LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+	LOWPAN_IPHC_CTX_FLAG_COMPRESSION,
+};
+
+struct lowpan_iphc_ctx {
+	u8 id;
+	struct in6_addr pfx;
+	u8 plen;
+	unsigned long flags;
+};
+
+struct lowpan_iphc_ctx_table {
+	spinlock_t lock;
+	const struct lowpan_iphc_ctx_ops *ops;
+	struct lowpan_iphc_ctx table[LOWPAN_IPHC_CTX_TABLE_SIZE];
+};
+
+static inline bool lowpan_iphc_ctx_is_active(const struct lowpan_iphc_ctx *ctx)
+{
+	return test_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+}
+
+static inline bool
+lowpan_iphc_ctx_is_compression(const struct lowpan_iphc_ctx *ctx)
+{
+	return test_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+}
+
 struct lowpan_priv {
 	enum lowpan_lltypes lltype;
 	struct dentry *iface_debugfs;
+	struct lowpan_iphc_ctx_table ctx;
 
 	/* must be last */
 	u8 priv[0] __aligned(sizeof(void *));
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index faf65baed617..34e44c0c0836 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -20,7 +20,7 @@
 int lowpan_register_netdevice(struct net_device *dev,
 			      enum lowpan_lltypes lltype)
 {
-	int ret;
+	int i, ret;
 
 	dev->addr_len = EUI64_ADDR_LEN;
 	dev->type = ARPHRD_6LOWPAN;
@@ -29,6 +29,10 @@ int lowpan_register_netdevice(struct net_device *dev,
 
 	lowpan_priv(dev)->lltype = lltype;
 
+	spin_lock_init(&lowpan_priv(dev)->ctx.lock);
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+		lowpan_priv(dev)->ctx.table[i].id = i;
+
 	ret = register_netdevice(dev);
 	if (ret < 0)
 		return ret;
@@ -68,6 +72,32 @@ void lowpan_unregister_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(lowpan_unregister_netdev);
 
+static int lowpan_event(struct notifier_block *unused,
+			unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	int i;
+
+	if (dev->type != ARPHRD_6LOWPAN)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_DOWN:
+		for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+			clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+				  &lowpan_priv(dev)->ctx.table[i].flags);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block lowpan_notifier = {
+	.notifier_call = lowpan_event,
+};
+
 static int __init lowpan_module_init(void)
 {
 	int ret;
@@ -76,6 +106,12 @@ static int __init lowpan_module_init(void)
 	if (ret < 0)
 		return ret;
 
+	ret = register_netdevice_notifier(&lowpan_notifier);
+	if (ret < 0) {
+		lowpan_debugfs_exit();
+		return ret;
+	}
+
 	request_module_nowait("ipv6");
 
 	request_module_nowait("nhc_dest");
@@ -92,6 +128,7 @@ static int __init lowpan_module_init(void)
 static void __exit lowpan_module_exit(void)
 {
 	lowpan_debugfs_exit();
+	unregister_netdevice_notifier(&lowpan_notifier);
 }
 
 module_init(lowpan_module_init);
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index 88eef84df0fc..aa49ff4ce6fd 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -16,19 +16,266 @@
 
 #include "6lowpan_i.h"
 
+#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS	8
+
 static struct dentry *lowpan_debugfs;
 
+static int lowpan_ctx_flag_active_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	if (val)
+		set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+	else
+		clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+
+	return 0;
+}
+
+static int lowpan_ctx_flag_active_get(void *data, u64 *val)
+{
+	*val = lowpan_iphc_ctx_is_active(data);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops,
+			lowpan_ctx_flag_active_get,
+			lowpan_ctx_flag_active_set, "%llu\n");
+
+static int lowpan_ctx_flag_c_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	if (val)
+		set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+	else
+		clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+
+	return 0;
+}
+
+static int lowpan_ctx_flag_c_get(void *data, u64 *val)
+{
+	*val = lowpan_iphc_ctx_is_compression(data);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
+			lowpan_ctx_flag_c_set, "%llu\n");
+
+static int lowpan_ctx_plen_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	if (val > 128)
+		return -EINVAL;
+
+	spin_lock_bh(&t->lock);
+	ctx->plen = val;
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_ctx_plen_get(void *data, u64 *val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	spin_lock_bh(&t->lock);
+	*val = ctx->plen;
+	spin_unlock_bh(&t->lock);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
+			lowpan_ctx_plen_set, "%llu\n");
+
+static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset)
+{
+	struct lowpan_iphc_ctx *ctx = file->private;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	spin_lock_bh(&t->lock);
+	seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		   be16_to_cpu(ctx->pfx.s6_addr16[0]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[1]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[2]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[3]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[4]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[5]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[6]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[7]));
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lowpan_ctx_pfx_show, inode->i_private);
+}
+
+static ssize_t lowpan_ctx_pfx_write(struct file *fp,
+				    const char __user *user_buf, size_t count,
+				    loff_t *ppos)
+{
+	char buf[128] = {};
+	struct seq_file *file = fp->private_data;
+	struct lowpan_iphc_ctx *ctx = file->private;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+	int status = count, n, i;
+	unsigned int addr[8];
+
+	if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1,
+						 count))) {
+		status = -EFAULT;
+		goto out;
+	}
+
+	n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		   &addr[0], &addr[1], &addr[2], &addr[3], &addr[4],
+		   &addr[5], &addr[6], &addr[7]);
+	if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) {
+		status = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_bh(&t->lock);
+	for (i = 0; i < 8; i++)
+		ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff);
+	spin_unlock_bh(&t->lock);
+
+out:
+	return status;
+}
+
+const struct file_operations lowpan_ctx_pfx_fops = {
+	.open		= lowpan_ctx_pfx_open,
+	.read		= seq_read,
+	.write		= lowpan_ctx_pfx_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+				       struct dentry *ctx, u8 id)
+{
+	struct lowpan_priv *lpriv = lowpan_priv(dev);
+	struct dentry *dentry, *root;
+	char buf[32];
+
+	/* id indexes ctx.table[]; reject anything past the last slot */
+	if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE))
+		return -EINVAL;
+	sprintf(buf, "%d", id);
+	root = debugfs_create_dir(buf, ctx);
+	if (!root)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("active", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_flag_active_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("compression", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_flag_c_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("prefix", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_pfx_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("prefix_len", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_plen_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int lowpan_context_show(struct seq_file *file, void *offset)
+{
+	struct lowpan_iphc_ctx_table *t = file->private;
+	int i;
+
+	seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C');
+	seq_puts(file, "-------------------------------------------------\n");
+
+	spin_lock_bh(&t->lock);
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		if (!lowpan_iphc_ctx_is_active(&t->table[i]))
+			continue;
+
+		seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id,
+			   &t->table[i].pfx, t->table[i].plen,
+			   lowpan_iphc_ctx_is_compression(&t->table[i]));
+	}
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_context_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lowpan_context_show, inode->i_private);
+}
+
+const struct file_operations lowpan_context_fops = {
+	.open		= lowpan_context_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 int lowpan_dev_debugfs_init(struct net_device *dev)
 {
 	struct lowpan_priv *lpriv = lowpan_priv(dev);
+	struct dentry *contexts, *dentry;
+	int ret, i;
 
 	/* creating the root */
 	lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
 	if (!lpriv->iface_debugfs)
 		goto fail;
 
+	contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs);
+	if (!contexts)
+		goto remove_root;
+
+	dentry = debugfs_create_file("show", 0644, contexts,
+				     &lowpan_priv(dev)->ctx,
+				     &lowpan_context_fops);
+	if (!dentry)
+		goto remove_root;
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i);
+		if (ret < 0)
+			goto remove_root;
+	}
+
 	return 0;
 
+remove_root:
+	lowpan_dev_debugfs_exit(dev);
 fail:
 	return -EINVAL;
 }
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 346b5c1a9185..d2a565cde4f4 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -56,6 +56,7 @@
 /* special link-layer handling */
 #include <net/mac802154.h>
 
+#include "6lowpan_i.h"
 #include "nhc.h"
 
 /* Values of fields within the IPHC encoding first byte */
@@ -147,6 +148,9 @@
 	 (((a)->s6_addr16[6]) == 0) &&		\
 	 (((a)->s6_addr[14]) == 0))
 
+#define LOWPAN_IPHC_CID_DCI(cid)	((cid) & 0x0f)	/* dst ctx id */
+#define LOWPAN_IPHC_CID_SCI(cid)	(((cid) & 0xf0) >> 4) /* src ctx id */
+
 static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr,
 						const void *lladdr)
 {
@@ -195,6 +199,98 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
 	}
 }
 
+/* Look up context slot @id; a slot that is not active yields NULL. */
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id)
+{
+	struct lowpan_iphc_ctx *entry;
+
+	entry = &lowpan_priv(dev)->ctx.table[id];
+
+	return lowpan_iphc_ctx_is_active(entry) ? entry : NULL;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_addr(const struct net_device *dev,
+			    const struct in6_addr *addr)
+{
+	struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+	struct lowpan_iphc_ctx *ret = NULL;
+	struct in6_addr addr_pfx;
+	u8 addr_plen;
+	int i;
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		/* Skip contexts that are not active or not enabled for
+		 * compression; they MUST NOT be used for compression.
+		 */
+		if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+		    !lowpan_iphc_ctx_is_compression(&table[i]))
+			continue;
+
+		ipv6_addr_prefix(&addr_pfx, addr, table[i].plen);
+
+		/* For plen < 64 compare 64 bits, the bits after the prefix
+		 * being zero. Otherwise compare table[i].plen bits.
+		 */
+		if (table[i].plen < 64)
+			addr_plen = 64;
+		else
+			addr_plen = table[i].plen;
+
+		if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) {
+			/* the first match becomes the initial candidate */
+			if (!ret) {
+				ret = &table[i];
+				continue;
+			}
+
+			/* prefer the context with the longest prefix length */
+			if (table[i].plen > ret->plen)
+				ret = &table[i];
+		}
+	}
+
+	return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
+				  const struct in6_addr *addr)
+{
+	struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+	struct lowpan_iphc_ctx *ret = NULL;
+	struct in6_addr addr_mcast, network_pfx = {};
+	int i;
+
+	/* start from a copy of the address being looked up */
+	memcpy(&addr_mcast, addr, sizeof(*addr));
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		/* Skip contexts that are not active or not enabled for
+		 * compression; they MUST NOT be used for compression.
+		 */
+		if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+		    !lowpan_iphc_ctx_is_compression(&table[i]))
+			continue;
+
+		/* overwrite byte 3 with this context's prefix length */
+		addr_mcast.s6_addr[3] = table[i].plen;
+		/* get network prefix to copy into multicast address */
+		ipv6_addr_prefix(&network_pfx, &table[i].pfx,
+				 table[i].plen);
+		/* overwrite bytes 4-11 with that network prefix */
+		memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8);
+
+		if (ipv6_addr_equal(addr, &addr_mcast)) {
+			ret = &table[i];
+			break;
+		}
+	}
+
+	return ret;
+}
+
 /* Uncompress address function for source and
  * destination address(non-multicast).
  *
@@ -259,30 +355,59 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
 /* Uncompress address function for source context
  * based address(non-multicast).
  */
-static int uncompress_context_based_src_addr(struct sk_buff *skb,
-					     struct in6_addr *ipaddr,
-					     u8 address_mode)
+static int uncompress_ctx_addr(struct sk_buff *skb,
+			       const struct net_device *dev,
+			       const struct lowpan_iphc_ctx *ctx,
+			       struct in6_addr *ipaddr, u8 address_mode,
+			       const void *lladdr)
 {
+	bool fail;
+
 	switch (address_mode) {
-	case LOWPAN_IPHC_SAM_00:
-		/* unspec address ::
+	/* SAM and DAM are the same here */
+	case LOWPAN_IPHC_DAM_00:
+		fail = false;
+		/* SAM_00 -> unspec address ::
 		 * Do nothing, address is already ::
+		 *
+		 * DAM 00 -> reserved should never occur.
 		 */
 		break;
 	case LOWPAN_IPHC_SAM_01:
-		/* TODO */
+	case LOWPAN_IPHC_DAM_01:
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	case LOWPAN_IPHC_SAM_10:
-		/* TODO */
+	case LOWPAN_IPHC_DAM_10:
+		ipaddr->s6_addr[11] = 0xFF;
+		ipaddr->s6_addr[12] = 0xFE;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	case LOWPAN_IPHC_SAM_11:
-		/* TODO */
-		netdev_warn(skb->dev, "SAM value 0x%x not supported\n",
-			    address_mode);
-		return -EINVAL;
+	case LOWPAN_IPHC_DAM_11:
+		fail = false;
+		switch (lowpan_priv(dev)->lltype) {
+		case LOWPAN_LLTYPE_IEEE802154:
+			iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+			break;
+		default:
+			iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+			break;
+		}
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	default:
 		pr_debug("Invalid sam value: 0x%x\n", address_mode);
 		return -EINVAL;
 	}
 
+	if (fail) {
+		pr_debug("Failed to fetch skb data\n");
+		return -EIO;
+	}
+
 	raw_dump_inline(NULL,
 			"Reconstructed context based ipv6 src addr is",
 			ipaddr->s6_addr, 16);
@@ -346,6 +471,30 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
 	return 0;
 }
 
+/* Rebuild a context-based multicast daddr; 0 on success, -EIO if a fetch fails */
+static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb,
+						 struct lowpan_iphc_ctx *ctx,
+						 struct in6_addr *ipaddr,
+						 u8 address_mode)
+{
+	struct in6_addr network_pfx = {};
+	bool fail;
+
+	ipaddr->s6_addr[0] = 0xFF;
+	fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2);
+	fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4);
+	/* fail is a bool: test it for truth, it can never be negative */
+	if (fail)
+		return -EIO;
+
+	/* prefix length and network prefix are taken from the context */
+	ipaddr->s6_addr[3] = ctx->plen;
+	ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen);
+	memcpy(&ipaddr->s6_addr[4], &network_pfx, 8);
+
+	return 0;
+}
+
 /* get the ecn values from iphc tf format and set it to ipv6hdr */
 static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf)
 {
@@ -459,7 +608,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 			     const void *daddr, const void *saddr)
 {
 	struct ipv6hdr hdr = {};
-	u8 iphc0, iphc1;
+	struct lowpan_iphc_ctx *ci;
+	u8 iphc0, iphc1, cid = 0;
 	int err;
 
 	raw_dump_table(__func__, "raw skb data dump uncompressed",
@@ -469,12 +619,14 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	    lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1)))
 		return -EINVAL;
 
-	/* another if the CID flag is set */
-	if (iphc1 & LOWPAN_IPHC_CID)
-		return -ENOTSUPP;
-
 	hdr.version = 6;
 
+	/* default CID = 0, another if the CID flag is set */
+	if (iphc1 & LOWPAN_IPHC_CID) {
+		if (lowpan_fetch_skb(skb, &cid, sizeof(cid)))
+			return -EINVAL;
+	}
+
 	err = lowpan_iphc_tf_decompress(skb, &hdr,
 					iphc0 & LOWPAN_IPHC_TF_MASK);
 	if (err < 0)
@@ -500,10 +652,17 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	}
 
 	if (iphc1 & LOWPAN_IPHC_SAC) {
-		/* Source address context based uncompression */
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
+		}
+
 		pr_debug("SAC bit is set. Handle context based source address.\n");
-		err = uncompress_context_based_src_addr(skb, &hdr.saddr,
-							iphc1 & LOWPAN_IPHC_SAM_MASK);
+		err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
+					  iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
 	} else {
 		/* Source address uncompression */
 		pr_debug("source address stateless compression\n");
@@ -515,27 +674,52 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	if (err)
 		return -EINVAL;
 
-	/* check for Multicast Compression */
-	if (iphc1 & LOWPAN_IPHC_M) {
-		if (iphc1 & LOWPAN_IPHC_DAC) {
-			pr_debug("dest: context-based mcast compression\n");
-			/* TODO: implement this */
-		} else {
-			err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
-								iphc1 & LOWPAN_IPHC_DAM_MASK);
+	switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) {
+	case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC:
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
+		}
 
-			if (err)
-				return -EINVAL;
+		/* multicast with context */
+		pr_debug("dest: context-based mcast compression\n");
+		err = lowpan_uncompress_multicast_ctx_daddr(skb, ci,
+							    &hdr.daddr,
+							    iphc1 & LOWPAN_IPHC_DAM_MASK);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+		break;
+	case LOWPAN_IPHC_M:
+		/* multicast */
+		err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
+							iphc1 & LOWPAN_IPHC_DAM_MASK);
+		break;
+	case LOWPAN_IPHC_DAC:
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
 		}
-	} else {
+
+		/* Destination address context based uncompression */
+		pr_debug("DAC bit is set. Handle context based destination address.\n");
+		err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
+					  iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+		break;
+	default:
 		err = uncompress_addr(skb, dev, &hdr.daddr,
 				      iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
 		pr_debug("dest: stateless compression mode %d dest %pI6c\n",
 			 iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
-		if (err)
-			return -EINVAL;
+		break;
 	}
 
+	if (err)
+		return -EINVAL;
+
 	/* Next header data uncompression */
 	if (iphc0 & LOWPAN_IPHC_NH) {
 		err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
@@ -585,6 +769,58 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = {
 	[LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
 };
 
+/* Compress a unicast address against @ctx, shortest encoding first: fully
+ * elided (11), 16 bits inline (10), 64 bits inline (01). Inline bytes are
+ * pushed to *hc_ptr; returns SAM bits if @sam is true, DAM bits otherwise.
+ */
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
+				   const struct lowpan_iphc_ctx *ctx,
+				   const unsigned char *lladdr, bool sam)
+{
+	struct in6_addr tmp = {};
+	u8 dam;
+
+	/* check for SAM/DAM = 11 */
+	memcpy(&tmp.s6_addr[8], lladdr, 8);
+	/* second bit-flip (Universe/Local) is done according RFC2464 */
+	tmp.s6_addr[8] ^= 0x02;
+	/* context information is always used */
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		dam = LOWPAN_IPHC_DAM_11;
+		goto out;
+	}
+
+	memset(&tmp, 0, sizeof(tmp));
+	/* check for SAM/DAM = 10 */
+	tmp.s6_addr[11] = 0xFF;
+	tmp.s6_addr[12] = 0xFE;
+	memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2);
+	/* context information is always used */
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2);
+		dam = LOWPAN_IPHC_DAM_10;
+		goto out;
+	}
+
+	memset(&tmp, 0, sizeof(tmp));
+	/* check for SAM/DAM = 01, should always match */
+	memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8);
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8);
+		dam = LOWPAN_IPHC_DAM_01;
+		goto out;
+	}
+
+	/* a u8 cannot carry -EINVAL: warn once and fall back to mode 00 */
+	WARN_ONCE(1, "context found but no address mode matched\n");
+	return LOWPAN_IPHC_DAM_00;
+out:
+	return sam ? lowpan_iphc_dam_to_sam_value[dam] : dam;
+}
+
 static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
 				  const unsigned char *lladdr, bool sam)
 {
@@ -708,6 +944,21 @@ static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr)
 	return val;
 }
 
+static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr,
+					      const struct lowpan_iphc_ctx *ctx,
+					      const struct in6_addr *ipaddr)
+{
+	u8 data[6];
+
+	/* flags/scope, reserved (RIID): address bytes 1-2 */
+	memcpy(data, &ipaddr->s6_addr[1], 2);
+	/* group ID: address bytes 12-15, as decompression expects */
+	memcpy(&data[2], &ipaddr->s6_addr[12], 4);
+	lowpan_push_hc_data(hc_ptr, data, 6);
+
+	return LOWPAN_IPHC_DAM_00;
+}
+
 static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
 					  const struct in6_addr *ipaddr)
 {
@@ -742,10 +993,11 @@ static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
 int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 			   const void *daddr, const void *saddr)
 {
-	u8 iphc0, iphc1, *hc_ptr;
+	u8 iphc0, iphc1, *hc_ptr, cid = 0;
 	struct ipv6hdr *hdr;
 	u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {};
-	int ret, addr_type;
+	struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry;
+	int ret, ipv6_daddr_type, ipv6_saddr_type;
 
 	if (skb->protocol != htons(ETH_P_IPV6))
 		return -EINVAL;
@@ -769,14 +1021,38 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 	iphc0 = LOWPAN_DISPATCH_IPHC;
 	iphc1 = 0;
 
-	/* TODO: context lookup */
-
 	raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
 	raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
 
 	raw_dump_table(__func__, "sending raw skb network uncompressed packet",
 		       skb->data, skb->len);
 
+	ipv6_daddr_type = ipv6_addr_type(&hdr->daddr);
+	spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+	if (ipv6_daddr_type & IPV6_ADDR_MULTICAST)
+		dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr);
+	else
+		dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr);
+	if (dci) {
+		memcpy(&dci_entry, dci, sizeof(*dci));
+		cid |= dci->id;
+	}
+	spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+	spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+	sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr);
+	if (sci) {
+		memcpy(&sci_entry, sci, sizeof(*sci));
+		cid |= (sci->id << 4);
+	}
+	spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+	/* if cid is zero it will be compressed */
+	if (cid) {
+		iphc1 |= LOWPAN_IPHC_CID;
+		lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid));
+	}
+
 	/* Traffic Class, Flow Label compression */
 	iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr);
 
@@ -813,39 +1089,63 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 				    sizeof(hdr->hop_limit));
 	}
 
-	addr_type = ipv6_addr_type(&hdr->saddr);
+	ipv6_saddr_type = ipv6_addr_type(&hdr->saddr);
 	/* source address compression */
-	if (addr_type == IPV6_ADDR_ANY) {
+	if (ipv6_saddr_type == IPV6_ADDR_ANY) {
 		pr_debug("source address is unspecified, setting SAC\n");
 		iphc1 |= LOWPAN_IPHC_SAC;
 	} else {
-		if (addr_type & IPV6_ADDR_LINKLOCAL) {
-			iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr,
-							 saddr, true);
-			pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
-				 &hdr->saddr, iphc1);
+		if (sci) {
+			iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr,
+							  &sci_entry, saddr,
+							  true);
+			iphc1 |= LOWPAN_IPHC_SAC;
 		} else {
-			pr_debug("send the full source address\n");
-			lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16);
+			if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) {
+				iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+								 &hdr->saddr,
+								 saddr, true);
+				pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
+					 &hdr->saddr, iphc1);
+			} else {
+				pr_debug("send the full source address\n");
+				lowpan_push_hc_data(&hc_ptr,
+						    hdr->saddr.s6_addr, 16);
+			}
 		}
 	}
 
-	addr_type = ipv6_addr_type(&hdr->daddr);
 	/* destination address compression */
-	if (addr_type & IPV6_ADDR_MULTICAST) {
+	if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) {
 		pr_debug("destination address is multicast: ");
-		iphc1 |= LOWPAN_IPHC_M;
-		iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr);
+		if (dci) {
+			iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr,
+								     &dci_entry,
+								     &hdr->daddr);
+		} else {
+			iphc1 |= LOWPAN_IPHC_M;
+			iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr,
+								 &hdr->daddr);
+		}
 	} else {
-		if (addr_type & IPV6_ADDR_LINKLOCAL) {
-			/* TODO: context lookup */
-			iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr,
-							 daddr, false);
-			pr_debug("dest address unicast link-local %pI6c "
-				 "iphc1 0x%02x\n", &hdr->daddr, iphc1);
+		if (dci) {
+			iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr,
+							  &dci_entry, daddr,
+							  false);
+			iphc1 |= LOWPAN_IPHC_DAC;
 		} else {
-			pr_debug("dest address unicast %pI6c\n", &hdr->daddr);
-			lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
+			if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) {
+				iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+								 &hdr->daddr,
+								 daddr, false);
+				pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
+					 &hdr->daddr, iphc1);
+			} else {
+				pr_debug("dest address unicast %pI6c\n",
+					 &hdr->daddr);
+				lowpan_push_hc_data(&hc_ptr,
+						    hdr->daddr.s6_addr, 16);
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From a6692754d61a6b3735803783f394880805675f99 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 12 Feb 2016 12:09:39 -0500
Subject: net: dsa: pass bridge down to drivers

Some DSA drivers may or may not support multiple software bridges on top
of a hardware switch.

It is more convenient for them to access the bridge's net_device for
finer configuration.

Removing the need to craft and access a bitmask also simplifies the
code.

This patch changes the signature of bridge related functions, updates DSA
drivers, and removes dsa_slave_br_port_mask.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt |  7 ++-----
 drivers/net/dsa/bcm_sf2.c            | 12 +++++++-----
 drivers/net/dsa/bcm_sf2.h            |  2 ++
 drivers/net/dsa/mv88e6xxx.c          | 13 +++++++++++--
 drivers/net/dsa/mv88e6xxx.h          |  6 ++++--
 include/net/dsa.h                    |  5 ++---
 net/dsa/slave.c                      | 31 ++-----------------------------
 7 files changed, 30 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index aa9c1f9313cd..ebf21530471f 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -524,17 +524,14 @@ Bridge layer
 - port_join_bridge: bridge layer function invoked when a given switch port is
   added to a bridge, this function should be doing the necessary at the switch
   level to permit the joining port from being added to the relevant logical
-  domain for it to ingress/egress traffic with other members of the bridge. DSA
-  does nothing but calculate a bitmask of switch ports currently members of the
-  specified bridge being requested the join
+  domain for it to ingress/egress traffic with other members of the bridge.
 
 - port_leave_bridge: bridge layer function invoked when a given switch port is
   removed from a bridge, this function should be doing the necessary at the
   switch level to deny the leaving port from ingress/egress traffic from the
   remaining bridge members. When the port leaves the bridge, it should be aged
   out at the switch hardware for the switch to (re) learn MAC addresses behind
-  this port. DSA calculates the bitmask of ports still members of the bridge
-  being left
+  this port.
 
 - port_stp_update: bridge layer function invoked when a given switch port STP
   state is computed by the bridge layer and should be propagated to switch
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 6f946fedbb77..3f627598f277 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -483,16 +483,17 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
 }
 
 static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
-			      u32 br_port_mask)
+			      struct net_device *bridge)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
 	unsigned int i;
 	u32 reg, p_ctl;
 
+	priv->port_sts[port].bridge_dev = bridge;
 	p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port));
 
 	for (i = 0; i < priv->hw_params.num_ports; i++) {
-		if (!((1 << i) & br_port_mask))
+		if (priv->port_sts[i].bridge_dev != bridge)
 			continue;
 
 		/* Add this local port to the remote port VLAN control
@@ -515,10 +516,10 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
 	return 0;
 }
 
-static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
-			       u32 br_port_mask)
+static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
+	struct net_device *bridge = priv->port_sts[port].bridge_dev;
 	unsigned int i;
 	u32 reg, p_ctl;
 
@@ -526,7 +527,7 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
 
 	for (i = 0; i < priv->hw_params.num_ports; i++) {
 		/* Don't touch the remaining ports */
-		if (!((1 << i) & br_port_mask))
+		if (priv->port_sts[i].bridge_dev != bridge)
 			continue;
 
 		reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
@@ -541,6 +542,7 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
 
 	core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(port));
 	priv->port_sts[port].vlan_ctl_mask = p_ctl;
+	priv->port_sts[port].bridge_dev = NULL;
 
 	return 0;
 }
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 6bba1c98d764..200b1f5fdb56 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -50,6 +50,8 @@ struct bcm_sf2_port_status {
 	struct ethtool_eee eee;
 
 	u32 vlan_ctl_mask;
+
+	struct net_device *bridge_dev;
 };
 
 struct bcm_sf2_arl_entry {
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index b0e00edb302e..2e515e8a95fe 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1889,13 +1889,22 @@ unlock:
 	return err;
 }
 
-int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members)
+int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
+			       struct net_device *bridge)
 {
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	ps->ports[port].bridge_dev = bridge;
+
 	return 0;
 }
 
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members)
+int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 {
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	ps->ports[port].bridge_dev = NULL;
+
 	return 0;
 }
 
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 63a6f587e9e8..260b4918e427 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -380,6 +380,7 @@ struct mv88e6xxx_vtu_stu_entry {
 };
 
 struct mv88e6xxx_priv_port {
+	struct net_device *bridge_dev;
 	u8 state;
 };
 
@@ -481,8 +482,9 @@ int mv88e6xxx_phy_write_indirect(struct dsa_switch *ds, int addr, int regnum,
 int mv88e6xxx_get_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
 int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
 		      struct phy_device *phydev, struct ethtool_eee *e);
-int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members);
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members);
+int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
+			       struct net_device *bridge);
+int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
 int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port,
 				const struct switchdev_obj_port_vlan *vlan,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 26a0e86e611e..1c845d7bf0b2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -297,9 +297,8 @@ struct dsa_switch_driver {
 	 * Bridge integration
 	 */
 	int	(*port_join_bridge)(struct dsa_switch *ds, int port,
-				    u32 br_port_mask);
-	int	(*port_leave_bridge)(struct dsa_switch *ds, int port,
-				     u32 br_port_mask);
+				    struct net_device *bridge);
+	int	(*port_leave_bridge)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index ab24521beb4d..ab515df5f493 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -385,31 +385,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return -EOPNOTSUPP;
 }
 
-/* Return a bitmask of all ports being currently bridged within a given bridge
- * device. Note that on leave, the mask will still return the bitmask of ports
- * currently bridged, prior to port removal, and this is exactly what we want.
- */
-static u32 dsa_slave_br_port_mask(struct dsa_switch *ds,
-				  struct net_device *bridge)
-{
-	struct dsa_slave_priv *p;
-	unsigned int port;
-	u32 mask = 0;
-
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
-		if (!dsa_is_port_initialized(ds, port))
-			continue;
-
-		p = netdev_priv(ds->ports[port]);
-
-		if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT &&
-		    p->bridge_dev == bridge)
-			mask |= 1 << port;
-	}
-
-	return mask;
-}
-
 static int dsa_slave_stp_update(struct net_device *dev, u8 state)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -533,8 +508,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	p->bridge_dev = br;
 
 	if (ds->drv->port_join_bridge)
-		ret = ds->drv->port_join_bridge(ds, p->port,
-						dsa_slave_br_port_mask(ds, br));
+		ret = ds->drv->port_join_bridge(ds, p->port, br);
 
 	return ret;
 }
@@ -547,8 +521,7 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 
 
 	if (ds->drv->port_leave_bridge)
-		ret = ds->drv->port_leave_bridge(ds, p->port,
-						 dsa_slave_br_port_mask(ds, p->bridge_dev));
+		ret = ds->drv->port_leave_bridge(ds, p->port);
 
 	p->bridge_dev = NULL;
 
-- 
cgit v1.2.3


From 412a6d800c7380c1b87c11080c7da905c27cfea8 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 19:09:05 +0200
Subject: mac80211: support hw managing reorder logic

Enable driver to manage the reordering logic itself.
This is needed for example for the iwlwifi driver that
will support hardware assisted reordering.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  |  6 ++++++
 net/mac80211/agg-rx.c   | 24 ++++++++++++++++++++++--
 net/mac80211/debugfs.c  |  1 +
 net/mac80211/sta_info.h | 21 ++++++++++++---------
 4 files changed, 41 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6c9c559394b0..ee6305a52251 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1929,6 +1929,11 @@ struct ieee80211_txq {
  *	by just its MAC address; this prevents, for example, the same station
  *	from connecting to two virtual AP interfaces at the same time.
  *
+ * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the
+ *	reordering buffer internally, guaranteeing mac80211 receives frames in
+ *	order and does not need to manage its own reorder buffer or BA session
+ *	timeout.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -1965,6 +1970,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU,
 	IEEE80211_HW_BEACON_TX_STATUS,
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
+	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index ec80db7c955c..2ab54791281d 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -76,10 +76,11 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 	tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
 					lockdep_is_held(&sta->ampdu_mlme.mtx));
 
-	if (!tid_rx)
+	if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid))
 		return;
 
 	RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL);
+	__clear_bit(tid, sta->ampdu_mlme.agg_session_valid);
 
 	ht_dbg(sta->sdata,
 	       "Rx BA session stop requested for %pM tid %u %s reason: %d\n",
@@ -97,6 +98,13 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 		ieee80211_send_delba(sta->sdata, sta->sta.addr,
 				     tid, WLAN_BACK_RECIPIENT, reason);
 
+	/*
+	 * return here in case tid_rx is not assigned - which will happen if
+	 * IEEE80211_HW_SUPPORTS_REORDERING_BUFFER is set.
+	 */
+	if (!tid_rx)
+		return;
+
 	del_timer_sync(&tid_rx->session_timer);
 
 	/* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
@@ -297,7 +305,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	/* examine state machine */
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
-	if (sta->ampdu_mlme.tid_rx[tid]) {
+	if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
 		ht_dbg_ratelimited(sta->sdata,
 				   "unexpected AddBA Req from %pM on tid %u\n",
 				   sta->sta.addr, tid);
@@ -308,6 +316,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 						false);
 	}
 
+	if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) {
+		ret = drv_ampdu_action(local, sta->sdata, &params);
+		ht_dbg(sta->sdata,
+		       "Rx A-MPDU request on %pM tid %d result %d\n",
+		       sta->sta.addr, tid, ret);
+		if (!ret)
+			status = WLAN_STATUS_SUCCESS;
+		goto end;
+	}
+
 	/* prepare A-MPDU MLME for Rx aggregation */
 	tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
 	if (!tid_agg_rx)
@@ -369,6 +387,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	}
 
 end:
+	if (status == WLAN_STATUS_SUCCESS)
+		__set_bit(tid, sta->ampdu_mlme.agg_session_valid);
 	mutex_unlock(&sta->ampdu_mlme.mtx);
 
 end_no_lock:
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index abbdff03ce92..e433d0c97e86 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -126,6 +126,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
 	FLAG(SUPPORTS_AMSDU_IN_AMPDU),
 	FLAG(BEACON_TX_STATUS),
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
+	FLAG(SUPPORTS_REORDERING_BUFFER),
 
 	/* keep last for the build bug below */
 	(void *)0x1
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d6051629ed15..f4d38994ecee 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2005, Devicescape Software, Inc.
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -212,20 +213,21 @@ struct tid_ampdu_rx {
 /**
  * struct sta_ampdu_mlme - STA aggregation information.
  *
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ *	to tid_tx[idx], which are protected by the sta spinlock)
+ *	tid_start_tx is also protected by sta->lock.
  * @tid_rx: aggregation info for Rx per TID -- RCU protected
- * @tid_tx: aggregation info for Tx per TID
- * @tid_start_tx: sessions where start was requested
- * @addba_req_num: number of times addBA request has been sent.
- * @last_addba_req_time: timestamp of the last addBA request.
- * @dialog_token_allocator: dialog token enumerator for each new session;
- * @work: work struct for starting/stopping aggregation
  * @tid_rx_timer_expired: bitmap indicating on which TIDs the
  *	RX timer expired until the work for it runs
  * @tid_rx_stop_requested:  bitmap indicating which BA sessions per TID the
  *	driver requested to close until the work for it runs
- * @mtx: mutex to protect all TX data (except non-NULL assignments
- *	to tid_tx[idx], which are protected by the sta spinlock)
- *	tid_start_tx is also protected by sta->lock.
+ * @agg_session_valid: bitmap indicating which TIDs have an RX BA session open
+ * @work: work struct for starting/stopping aggregation
+ * @tid_tx: aggregation info for Tx per TID
+ * @tid_start_tx: sessions where start was requested
+ * @last_addba_req_time: timestamp of the last addBA request.
+ * @addba_req_num: number of times addBA request has been sent.
+ * @dialog_token_allocator: dialog token enumerator for each new session;
  */
 struct sta_ampdu_mlme {
 	struct mutex mtx;
@@ -233,6 +235,7 @@ struct sta_ampdu_mlme {
 	struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
 	unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
 	unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+	unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
 	/* tx */
 	struct work_struct work;
 	struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
-- 
cgit v1.2.3


From 178830481eee5eea147a1c8fab67a96e09d80345 Mon Sep 17 00:00:00 2001
From: Grzegorz Bajorski <grzegorz.bajorski@tieto.com>
Date: Fri, 11 Dec 2015 14:39:46 +0100
Subject: mac80211: allow drivers to report (non-)monitor frames

Some drivers offload some frames internally (e.g.
AddBa). Reporting such frames to mac80211 would
only confuse MLME. However it would be useful to
be able to pass such frames to monitor interfaces
for sniffing purposes, e.g. when running AP +
monitor.

To do that allow drivers to tell mac80211 whether
a given frame should be:
 - processed but not delivered to any monitor vif
 - not processed but delivered to monitor vifs
   only

Signed-off-by: Grzegorz Bajorski <grzegorz.bajorski@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 11 +++++++++++
 net/mac80211/rx.c      |  5 +++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ee6305a52251..5910085af9e6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1031,6 +1031,14 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC
  *	is stored in the @ampdu_delimiter_crc field)
  * @RX_FLAG_LDPC: LDPC was used
+ * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without
+ *	processing it in any regular way.
+ *	This is useful if drivers offload some frames but still want to report
+ *	them for sniffing purposes.
+ * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except
+ *	monitor interfaces.
+ *	This is useful if drivers offload some frames but still want to report
+ *	them for sniffing purposes.
  * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3
  * @RX_FLAG_10MHZ: 10 MHz (half channel) was used
  * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used
@@ -1071,6 +1079,8 @@ enum mac80211_rx_flags {
 	RX_FLAG_MACTIME_END		= BIT(21),
 	RX_FLAG_VHT			= BIT(22),
 	RX_FLAG_LDPC			= BIT(23),
+	RX_FLAG_ONLY_MONITOR		= BIT(24),
+	RX_FLAG_SKIP_MONITOR		= BIT(25),
 	RX_FLAG_STBC_MASK		= BIT(26) | BIT(27),
 	RX_FLAG_10MHZ			= BIT(28),
 	RX_FLAG_5MHZ			= BIT(29),
@@ -1089,6 +1099,7 @@ enum mac80211_rx_flags {
  * @RX_VHT_FLAG_160MHZ: 160 MHz was used
  * @RX_VHT_FLAG_BF: packet was beamformed
  */
+
 enum mac80211_rx_vht_flags {
 	RX_VHT_FLAG_80MHZ		= BIT(0),
 	RX_VHT_FLAG_160MHZ		= BIT(1),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fe675d76f29c..ae993edfdecf 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -122,7 +122,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
 	hdr = (void *)(skb->data + rtap_vendor_space);
 
 	if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
-			    RX_FLAG_FAILED_PLCP_CRC))
+			    RX_FLAG_FAILED_PLCP_CRC |
+			    RX_FLAG_ONLY_MONITOR))
 		return true;
 
 	if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
@@ -507,7 +508,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 		return NULL;
 	}
 
-	if (!local->monitors) {
+	if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
 		if (should_drop_frame(origskb, present_fcs_len,
 				      rtap_vendor_space)) {
 			dev_kfree_skb(origskb);
-- 
cgit v1.2.3


From 506bcfa8abebdbcebdc17b03e96e38dc0b8ce765 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 13 Dec 2015 15:41:05 +0200
Subject: mac80211: limit the A-MSDU Tx based on peer's capabilities

In VHT, the specification allows limiting the number of
MSDUs in an A-MSDU in the Extended Capabilities IE. There
is also a limitation on the byte size in the VHT IE.
In HT, the only limitation is on the byte size.
Parse the capabilities from the peer and make them
available to the driver.

In HT, there is another limitation when a BA agreement
is active: the byte size can't be greater than 4095.
This is not enforced here.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 19 +++++++++++++++++++
 include/net/mac80211.h    | 14 ++++++++++++++
 net/mac80211/cfg.c        | 29 +++++++++++++++++++++++++++++
 net/mac80211/ht.c         |  5 +++++
 net/mac80211/vht.c        | 17 +++++++++++++++++
 5 files changed, 84 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d9ddb89533a7..3b1f6cef9513 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -163,6 +163,14 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
+/* Maximal size of an A-MSDU */
+#define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
+#define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
+
+#define IEEE80211_MAX_MPDU_LEN_VHT_3895		3895
+#define IEEE80211_MAX_MPDU_LEN_VHT_7991		7991
+#define IEEE80211_MAX_MPDU_LEN_VHT_11454	11454
+
 #define IEEE80211_MAX_SSID_LEN		32
 
 #define IEEE80211_MAX_MESH_ID_LEN	32
@@ -1505,6 +1513,7 @@ struct ieee80211_vht_operation {
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991			0x00000001
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454			0x00000002
+#define IEEE80211_VHT_CAP_MAX_MPDU_MASK				0x00000003
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ		0x00000004
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ	0x00000008
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK			0x0000000C
@@ -2086,6 +2095,16 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(5)
 #define WLAN_EXT_CAPA8_OPMODE_NOTIF	BIT(6)
 
+/* Defines the maximal number of MSDUs in an A-MSDU. */
+#define WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB	BIT(7)
+#define WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB	BIT(0)
+
+/*
+ * Fine Timing Measurement Initiator - bit 71 of @WLAN_EID_EXT_CAPABILITY
+ * information element
+ */
+#define WLAN_EXT_CAPA9_FTM_INITIATOR	BIT(7)
+
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5910085af9e6..df5698ed8052 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1714,6 +1714,18 @@ struct ieee80211_sta_rates {
  * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
  *	valid if the STA is a TDLS peer in the first place.
  * @mfp: indicates whether the STA uses management frame protection or not.
+ * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
+ *	A-MSDU. Taken from the Extended Capabilities element. 0 means
+ *	unlimited.
+ * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. This
+ *	field is always valid for packets with a VHT preamble. For packets
+ *	with a HT preamble, additional limits apply:
+ *		+ If the skb is transmitted as part of a BA agreement, the
+ *		  A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
+ *		+ If the skb is not part of a BA aggreement, the A-MSDU maximal
+ *		  size is min(max_amsdu_len, 7935) bytes.
+ *	Both additional HT limits must be enforced by the low level driver.
+ *	This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2).
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1732,6 +1744,8 @@ struct ieee80211_sta {
 	bool tdls;
 	bool tdls_initiator;
 	bool mfp;
+	u8 max_amsdu_subframes;
+	u16 max_amsdu_len;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 166a29fe6c35..66d22de93c8d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1131,6 +1131,34 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		sta->sta.max_sp = params->max_sp;
 	}
 
+	/* The sender might not have sent the last bit, consider it to be 0 */
+	if (params->ext_capab_len >= 8) {
+		u8 val = (params->ext_capab[7] &
+			  WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7;
+
+		/* we did get all the bits, take the MSB as well */
+		if (params->ext_capab_len >= 9) {
+			u8 val_msb = params->ext_capab[8] &
+				WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB;
+			val_msb <<= 1;
+			val |= val_msb;
+		}
+
+		switch (val) {
+		case 1:
+			sta->sta.max_amsdu_subframes = 32;
+			break;
+		case 2:
+			sta->sta.max_amsdu_subframes = 16;
+			break;
+		case 3:
+			sta->sta.max_amsdu_subframes = 8;
+			break;
+		default:
+			sta->sta.max_amsdu_subframes = 0;
+		}
+	}
+
 	/*
 	 * cfg80211 validates this (1-2007) and allows setting the AID
 	 * only when creating a new station entry
@@ -1160,6 +1188,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
 						  params->ht_capa, sta);
 
+	/* VHT can override some HT caps such as the A-MSDU max length */
 	if (params->vht_capa)
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
 						    params->vht_capa, sta);
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 7a76ce639d58..f4a528773563 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -230,6 +230,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 	/* set Rx highest rate */
 	ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest;
 
+	if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU)
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935;
+	else
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839;
+
  apply:
 	changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap));
 
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 050de08bf82e..204cf9ad3019 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -281,6 +281,23 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	}
 
 	sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+
+	/* If HT IE reported 3839 bytes only, stay with that size. */
+	if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839)
+		return;
+
+	switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
+		break;
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991;
+		break;
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
+	default:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895;
+		break;
+	}
 }
 
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
-- 
cgit v1.2.3


From 538dc9045251d3d6b5c0216a5c61c32bd9cedac9 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn@kryo.se>
Date: Thu, 24 Dec 2015 00:33:26 -0800
Subject: mac80211: Make addr const in SET_IEEE80211_PERM_ADDR()

Make the addr parameter const in SET_IEEE80211_PERM_ADDR() to save
clients from having to cast away a const qualifier.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index df5698ed8052..566df20dc957 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2217,7 +2217,7 @@ static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev
  * @hw: the &struct ieee80211_hw to set the MAC address for
  * @addr: the address to set
  */
-static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, u8 *addr)
+static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr)
 {
 	memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN);
 }
-- 
cgit v1.2.3


From dd21dfc645d5dce0657af78761b3fa11a3a95398 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Jan 2016 10:39:23 +0100
Subject: rfkill: disentangle polling pause and suspend

When suspended while polling is paused, polling will erroneously
resume at resume time. Fix this by tracking pause and suspend in
separate state variables and adding the necessary checks.

Clarify the documentation on this as well.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h |  3 ++-
 net/rfkill/core.c      | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index d9010789b4e8..7af625f6d226 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -104,7 +104,8 @@ int __must_check rfkill_register(struct rfkill *rfkill);
  *
  * Pause polling -- say transmitter is off for other reasons.
  * NOTE: not necessary for suspend/resume -- in that case the
- * core stops polling anyway
+ * core stops polling anyway (but will also correctly handle
+ * the case of polling having been paused before suspend.)
  */
 void rfkill_pause_polling(struct rfkill *rfkill);
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f53bf3b6558b..166439995f34 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -57,6 +57,8 @@ struct rfkill {
 
 	bool			registered;
 	bool			persistent;
+	bool			polling_paused;
+	bool			suspended;
 
 	const struct rfkill_ops	*ops;
 	void			*data;
@@ -786,6 +788,7 @@ void rfkill_pause_polling(struct rfkill *rfkill)
 	if (!rfkill->ops->poll)
 		return;
 
+	rfkill->polling_paused = true;
 	cancel_delayed_work_sync(&rfkill->poll_work);
 }
 EXPORT_SYMBOL(rfkill_pause_polling);
@@ -797,6 +800,11 @@ void rfkill_resume_polling(struct rfkill *rfkill)
 	if (!rfkill->ops->poll)
 		return;
 
+	rfkill->polling_paused = false;
+
+	if (rfkill->suspended)
+		return;
+
 	queue_delayed_work(system_power_efficient_wq,
 			   &rfkill->poll_work, 0);
 }
@@ -807,7 +815,8 @@ static int rfkill_suspend(struct device *dev)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
 
-	rfkill_pause_polling(rfkill);
+	rfkill->suspended = true;
+	cancel_delayed_work_sync(&rfkill->poll_work);
 
 	return 0;
 }
@@ -817,12 +826,16 @@ static int rfkill_resume(struct device *dev)
 	struct rfkill *rfkill = to_rfkill(dev);
 	bool cur;
 
+	rfkill->suspended = false;
+
 	if (!rfkill->persistent) {
 		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
 		rfkill_set_block(rfkill, cur);
 	}
 
-	rfkill_resume_polling(rfkill);
+	if (rfkill->ops->poll && !rfkill->polling_paused)
+		queue_delayed_work(system_power_efficient_wq,
+				   &rfkill->poll_work, 0);
 
 	return 0;
 }
-- 
cgit v1.2.3


From d4634e8dea13ccc969dd3f33dab3873cfdf3bc51 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:42 -0500
Subject: rfkill: Update userspace API documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a note to userspace on the effect of RFKILL_OP_CHANGE_ALL also
updating the default state for hotplugged devices.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
[reword a bit]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/rfkill.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 058757f7a733..2e00dcebebd0 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -59,6 +59,8 @@ enum rfkill_type {
  * @RFKILL_OP_DEL: a device was removed
  * @RFKILL_OP_CHANGE: a device's state changed -- userspace changes one device
  * @RFKILL_OP_CHANGE_ALL: userspace changes all devices (of a type, or all)
+ *	into a state, also updating the default state used for devices that
+ *	are hot-plugged later.
  */
 enum rfkill_operation {
 	RFKILL_OP_ADD = 0,
-- 
cgit v1.2.3


From f4a0f0c5264e72d9279fbf9cf48a061526e8f788 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 25 Jan 2016 15:46:34 +0200
Subject: mac80211: add RX_FLAG_MACTIME_PLCP_START

The timestamp given by iwlwifi is at the beginning of the
frame over the air, at (or during) the SYNC field. Allow
such timestamps to be given to mac80211, at least (for now)
for frames with non-HT/VHT preambles.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  3 +++
 net/mac80211/ieee80211_i.h |  8 +++++++-
 net/mac80211/util.c        | 14 +++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 566df20dc957..31337f81ec03 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1010,6 +1010,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime
  *	field) is valid and contains the time the last symbol of the MPDU
  *	(including FCS) was received.
+ * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime
+ *	field) is valid and contains the time the SYNC preamble was received.
  * @RX_FLAG_SHORTPRE: Short preamble was used for this frame
  * @RX_FLAG_HT: HT MCS was used and rate_idx is MCS index
  * @RX_FLAG_VHT: VHT MCS was used and rate_index is MCS index
@@ -1058,6 +1060,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
 	RX_FLAG_DECRYPTED		= BIT(1),
+	RX_FLAG_MACTIME_PLCP_START	= BIT(2),
 	RX_FLAG_MMIC_STRIPPED		= BIT(3),
 	RX_FLAG_IV_STRIPPED		= BIT(4),
 	RX_FLAG_FAILED_FCS_CRC		= BIT(5),
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9934447a8b9a..a29f61dc9c06 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1466,7 +1466,13 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
 {
 	WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START &&
 		     status->flag & RX_FLAG_MACTIME_END);
-	return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END);
+	if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END))
+		return true;
+	/* can't handle HT/VHT preamble yet */
+	if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
+	    !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT)))
+		return true;
+	return false;
 }
 
 u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 7d0479e31674..fb90d9c5df59 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -4,7 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015	Intel Deutschland GmbH
+ * Copyright (C) 2015-2016	Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2671,6 +2671,18 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 		sband = local->hw.wiphy->bands[status->band];
 		bitrate = sband->bitrates[status->rate_idx].bitrate;
 		ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
+
+		if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+			/* TODO: handle HT/VHT preambles */
+			if (status->band == IEEE80211_BAND_5GHZ) {
+				ts += 20 << shift;
+				mpdu_offset += 2;
+			} else if (status->flag & RX_FLAG_SHORTPRE) {
+				ts += 96;
+			} else {
+				ts += 192;
+			}
+		}
 	}
 
 	rate = cfg80211_calculate_bitrate(&ri);
-- 
cgit v1.2.3


From dfdfc2beb0dd7e3a067d2eeacb4623cb48e77658 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 26 Jan 2016 17:11:13 +0100
Subject: mac80211: Parse legacy and HT rate in injected frames

Drivers/devices without their own rate control algorithm can get the
information about which rates they should use from either the radiotap header of
injected frames or from the rate control algorithm. But the parsing of the
legacy rate information from the radiotap header was removed in commit
e6a9854b05c1 ("mac80211/drivers: rewrite the rate control API").

The removal of this feature heavily reduced the usefulness of frame
injection when wanting to simulate specific transmission behavior. Having
rate parsing together with MCS rates and retry support allows a fine
grained selection of the tx behavior of injected frames for these kinds of
tests.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/networking/mac80211-injection.txt | 17 ++++++
 include/net/mac80211.h                          |  2 +
 net/mac80211/tx.c                               | 72 ++++++++++++++++++++++++-
 3 files changed, 89 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
index 3a930072b161..ec8f934c2eb2 100644
--- a/Documentation/networking/mac80211-injection.txt
+++ b/Documentation/networking/mac80211-injection.txt
@@ -28,6 +28,23 @@ radiotap headers and used to control injection:
    IEEE80211_RADIOTAP_F_TX_NOACK: frame should be sent without waiting for
 				  an ACK even if it is a unicast frame
 
+ * IEEE80211_RADIOTAP_RATE
+
+   legacy rate for the transmission (only for devices without own rate control)
+
+ * IEEE80211_RADIOTAP_MCS
+
+   HT rate for the transmission (only for devices without own rate control).
+   Also some flags are parsed
+
+   IEEE80211_TX_RC_SHORT_GI: use short guard interval
+   IEEE80211_TX_RC_40_MHZ_WIDTH: send in HT40 mode
+
+ * IEEE80211_RADIOTAP_DATA_RETRIES
+
+   number of retries when either IEEE80211_RADIOTAP_RATE or
+   IEEE80211_RADIOTAP_MCS was used
+
 The injection code can also skip all other currently defined radiotap fields
 facilitating replay of captured radiotap headers directly.
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 31337f81ec03..dbcd69a6bfda 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -708,12 +708,14 @@ enum mac80211_tx_info_flags {
  *	protocol frame (e.g. EAP)
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
+ * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  *
  * These flags are used in tx_info->control.flags.
  */
 enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
+	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3311ce0f3d6c..723cd7aa8953 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -710,6 +710,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 
 	info->control.short_preamble = txrc.short_preamble;
 
+	/* don't ask rate control when rate already injected via radiotap */
+	if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT)
+		return TX_CONTINUE;
+
 	if (tx->sta)
 		assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);
 
@@ -1665,15 +1669,24 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 	ieee80211_tx(sdata, sta, skb, false);
 }
 
-static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
+static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
+					struct sk_buff *skb)
 {
 	struct ieee80211_radiotap_iterator iterator;
 	struct ieee80211_radiotap_header *rthdr =
 		(struct ieee80211_radiotap_header *) skb->data;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_supported_band *sband =
+		local->hw.wiphy->bands[info->band];
 	int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len,
 						   NULL);
 	u16 txflags;
+	u16 rate = 0;
+	bool rate_found = false;
+	u8 rate_retries = 0;
+	u16 rate_flags = 0;
+	u8 mcs_known, mcs_flags;
+	int i;
 
 	info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
 		       IEEE80211_TX_CTL_DONTFRAG;
@@ -1724,6 +1737,35 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
 				info->flags |= IEEE80211_TX_CTL_NO_ACK;
 			break;
 
+		case IEEE80211_RADIOTAP_RATE:
+			rate = *iterator.this_arg;
+			rate_flags = 0;
+			rate_found = true;
+			break;
+
+		case IEEE80211_RADIOTAP_DATA_RETRIES:
+			rate_retries = *iterator.this_arg;
+			break;
+
+		case IEEE80211_RADIOTAP_MCS:
+			mcs_known = iterator.this_arg[0];
+			mcs_flags = iterator.this_arg[1];
+			if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS))
+				break;
+
+			rate_found = true;
+			rate = iterator.this_arg[2];
+			rate_flags = IEEE80211_TX_RC_MCS;
+
+			if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI &&
+			    mcs_flags & IEEE80211_RADIOTAP_MCS_SGI)
+				rate_flags |= IEEE80211_TX_RC_SHORT_GI;
+
+			if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW &&
+			    mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40)
+				rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+			break;
+
 		/*
 		 * Please update the file
 		 * Documentation/networking/mac80211-injection.txt
@@ -1738,6 +1780,32 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
 	if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */
 		return false;
 
+	if (rate_found) {
+		info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT;
+
+		for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+			info->control.rates[i].idx = -1;
+			info->control.rates[i].flags = 0;
+			info->control.rates[i].count = 0;
+		}
+
+		if (rate_flags & IEEE80211_TX_RC_MCS) {
+			info->control.rates[0].idx = rate;
+		} else {
+			for (i = 0; i < sband->n_bitrates; i++) {
+				if (rate * 5 != sband->bitrates[i].bitrate)
+					continue;
+
+				info->control.rates[0].idx = i;
+				break;
+			}
+		}
+
+		info->control.rates[0].flags = rate_flags;
+		info->control.rates[0].count = min_t(u8, rate_retries + 1,
+						     local->hw.max_rate_tries);
+	}
+
 	/*
 	 * remove the radiotap header
 	 * iterator->_max_length was sanity-checked against
@@ -1819,7 +1887,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 		      IEEE80211_TX_CTL_INJECTED;
 
 	/* process and remove the injection radiotap header */
-	if (!ieee80211_parse_tx_radiotap(skb))
+	if (!ieee80211_parse_tx_radiotap(local, skb))
 		goto fail;
 
 	rcu_read_lock();
-- 
cgit v1.2.3


From f2ac7e301ae6397669ff3f79e691942a9b5d2f39 Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Wed, 27 Jan 2016 15:26:12 +0100
Subject: mac80211: expose txq queue depth and size to drivers

This will allow drivers to make more educated
decisions whether to defer transmission or not.

Relying on wake_tx_queue() call count implicitly
was not possible because it could be called
without queued frame count actually changing on
software tx aggregation start/stop code paths.

It was also not possible to know the queue's
length in bytes without dequeueing.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 15 +++++++++++++++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/iface.c       |  1 +
 net/mac80211/sta_info.c    |  1 +
 net/mac80211/tx.c          |  8 +++++++-
 net/mac80211/util.c        | 14 ++++++++++++++
 6 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index dbcd69a6bfda..fd35fc4d7127 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5596,4 +5596,19 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);
  */
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 				     struct ieee80211_txq *txq);
+
+/**
+ * ieee80211_txq_get_depth - get pending frame/byte count of given txq
+ *
+ * The values are not guaranteed to be coherent with regard to each other, i.e.
+ * txq state can change half-way of this function and the caller may end up
+ * with "new" frame_cnt and "old" byte_cnt or vice-versa.
+ *
+ * @txq: pointer obtained from station or virtual interface
+ * @frame_cnt: pointer to store frame count
+ * @byte_cnt: pointer to store byte count
+ */
+void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
+			     unsigned long *frame_cnt,
+			     unsigned long *byte_cnt);
 #endif /* MAC80211_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a29f61dc9c06..a96f8c0461f6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -804,6 +804,7 @@ enum txq_info_flags {
 struct txq_info {
 	struct sk_buff_head queue;
 	unsigned long flags;
+	unsigned long byte_cnt;
 
 	/* keep last! */
 	struct ieee80211_txq txq;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 0451f120746e..453b4e741780 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -979,6 +979,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 
 		spin_lock_bh(&txqi->queue.lock);
 		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+		txqi->byte_cnt = 0;
 		spin_unlock_bh(&txqi->queue.lock);
 
 		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index b28e7a220d56..5894c0a1c01f 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -116,6 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta)
 
 			ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
 			atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
+			txqi->byte_cnt = 0;
 		}
 	}
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 723cd7aa8953..a5aa275d0434 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1270,7 +1270,11 @@ static void ieee80211_drv_tx(struct ieee80211_local *local,
 	if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
 		netif_stop_subqueue(sdata->dev, ac);
 
-	skb_queue_tail(&txqi->queue, skb);
+	spin_lock_bh(&txqi->queue.lock);
+	txqi->byte_cnt += skb->len;
+	__skb_queue_tail(&txqi->queue, skb);
+	spin_unlock_bh(&txqi->queue.lock);
+
 	drv_wake_tx_queue(local, txqi);
 
 	return;
@@ -1298,6 +1302,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	if (!skb)
 		goto out;
 
+	txqi->byte_cnt -= skb->len;
+
 	atomic_dec(&sdata->txqs_len[ac]);
 	if (__netif_subqueue_stopped(sdata->dev, ac))
 		ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index fb90d9c5df59..091f3dd62ad1 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3368,3 +3368,17 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
 		txqi->txq.ac = IEEE80211_AC_BE;
 	}
 }
+
+void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
+			     unsigned long *frame_cnt,
+			     unsigned long *byte_cnt)
+{
+	struct txq_info *txqi = to_txq_info(txq);
+
+	if (frame_cnt)
+		*frame_cnt = txqi->queue.qlen;
+
+	if (byte_cnt)
+		*byte_cnt = txqi->byte_cnt;
+}
+EXPORT_SYMBOL(ieee80211_txq_get_depth);
-- 
cgit v1.2.3


From 06470f7468c8b6c95e72ebda803a61a99f4ee446 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 28 Jan 2016 16:19:25 +0200
Subject: mac80211: add API to allow filtering frames in BA sessions

If any frames are dropped that are part of a BA session, the reorder
buffer will "indefinitely" (until the timeout) wait for them to come
in (or a BAR moving the window) and won't release frames after them.
This means it isn't possible to filter frames within a BA session in
firmware.

Introduce an API function that allows such filtering. Calling this
function will move the BA window forward to the new SSN, and allows
marking frames after the SSN as having been filtered, so any future
reordering activity will release frames while skipping the holes.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  | 20 +++++++++++-
 net/mac80211/agg-rx.c   |  1 +
 net/mac80211/rx.c       | 84 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/sta_info.h |  3 ++
 4 files changed, 107 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fd35fc4d7127..57147749ae42 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -5193,6 +5193,24 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw);
 void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
 				  const u8 *addr);
 
+/**
+ * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered
+ * @pubsta: station struct
+ * @tid: the session's TID
+ * @ssn: starting sequence number of the bitmap, all frames before this are
+ *	assumed to be out of the window after the call
+ * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc.
+ * @received_mpdus: number of received mpdus in firmware
+ *
+ * Releases all frames before @ssn and moves the BA window there, then marks
+ * the frames set in @filtered as filtered and releases any frames that were
+ * only waiting for them. If @received_mpdus is IEEE80211_SN_MODULO / 2 or
+ * more, the whole reorder buffer is flushed and the window restarts at @ssn.
+ */
+void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
+					  u16 ssn, u64 filtered,
+					  u16 received_mpdus);
+
 /**
  * ieee80211_send_bar - send a BlockAckReq frame
  *
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 2ab54791281d..1b8a5caa221e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -376,6 +376,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	tid_agg_rx->timeout = timeout;
 	tid_agg_rx->stored_mpdu_num = 0;
 	tid_agg_rx->auto_seq = auto_seq;
+	tid_agg_rx->reorder_buf_filtered = 0;
 	status = WLAN_STATUS_SUCCESS;
 
 	/* activate it for RX */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1153871b570f..9fb7074f0280 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -19,6 +19,7 @@
 #include <linux/etherdevice.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <linux/bitops.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>
 #include <asm/unaligned.h>
@@ -806,6 +807,9 @@ static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
 	struct sk_buff *tail = skb_peek_tail(frames);
 	struct ieee80211_rx_status *status;
 
+	if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
+		return true;
+
 	if (!tail)
 		return false;
 
@@ -844,6 +848,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
 	}
 
 no_frame:
+	tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
 	tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
 }
 
@@ -3300,6 +3305,85 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
 	ieee80211_rx_handlers(&rx, &frames);
 }
 
+void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
+					  u16 ssn, u64 filtered,
+					  u16 received_mpdus)
+{
+	struct sta_info *sta;
+	struct tid_ampdu_rx *tid_agg_rx;
+	struct sk_buff_head frames;
+	struct ieee80211_rx_data rx = {
+		/* This is OK -- must be QoS data frame */
+		.security_idx = tid,
+		.seqno_idx = tid,
+	};
+	int i, diff;
+
+	if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS))
+		return;
+
+	__skb_queue_head_init(&frames);
+
+	sta = container_of(pubsta, struct sta_info, sta);
+
+	rx.sta = sta;
+	rx.sdata = sta->sdata;
+	rx.local = sta->local;
+
+	rcu_read_lock();
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		goto out;
+
+	spin_lock_bh(&tid_agg_rx->reorder_lock);
+
+	if (received_mpdus >= IEEE80211_SN_MODULO >> 1) {
+		int release;
+
+		/* release all frames in the reorder buffer */
+		release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) %
+			   IEEE80211_SN_MODULO;
+		ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx,
+						 release, &frames);
+		/* restart the window at the reported ssn */
+		tid_agg_rx->head_seq_num = ssn;
+	} else {
+		ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn,
+						 &frames);
+	}
+
+	/* handle the case that received ssn is behind the mac ssn.
+	 * it can be tid_agg_rx->buf_size behind and still be valid */
+	diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK;
+	if (diff >= tid_agg_rx->buf_size) {
+		tid_agg_rx->reorder_buf_filtered = 0;
+		goto release;
+	}
+	filtered = filtered >> diff;
+	ssn += diff;
+
+	/* update bitmap; wrap the SN at 4096 before reducing to a slot */
+	for (i = 0; i < tid_agg_rx->buf_size; i++) {
+		int index = ieee80211_sn_add(ssn, i) % tid_agg_rx->buf_size;
+
+		tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
+		if (filtered & BIT_ULL(i))
+			tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index);
+	}
+
+	/* now process also frames that the filter marking released */
+	ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
+
+release:
+	spin_unlock_bh(&tid_agg_rx->reorder_lock);
+
+	ieee80211_rx_handlers(&rx, &frames);
+
+ out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames);
+
 /* main receive path */
 
 static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index f4d38994ecee..053f5c4fa495 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -168,6 +168,8 @@ struct tid_ampdu_tx {
  *
  * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
  *	A-MSDU with individually reported subframes.
+ * @reorder_buf_filtered: bitmap indicating where there are filtered frames in
+ *	the reorder buffer that should be ignored when releasing frames
  * @reorder_time: jiffies when skb was added
  * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
  * @reorder_timer: releases expired frames from the reorder buffer.
@@ -195,6 +197,7 @@ struct tid_ampdu_tx {
 struct tid_ampdu_rx {
 	struct rcu_head rcu_head;
 	spinlock_t reorder_lock;
+	u64 reorder_buf_filtered;
 	struct sk_buff_head *reorder_buf;
 	unsigned long *reorder_time;
 	struct timer_list session_timer;
-- 
cgit v1.2.3


From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Thu, 28 Jan 2016 10:58:25 +0200
Subject: cfg80211: basic support for PBSS network type

PBSS (Personal Basic Service Set) is a new BSS type for DMG
networks. It is similar to infrastructure BSS, having an AP-like
entity called PCP (PBSS Control Point), but it has a few differences.
PBSS support is mandatory for 11ad devices.

Add support for PBSS by introducing a new PBSS flag attribute.
The PBSS flag is used in the START_AP command to request starting
a PCP instead of an AP, and in the CONNECT command to request
connecting to a PCP instead of an AP.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 10 ++++++++++
 include/net/cfg80211.h                      |  8 ++++++++
 include/uapi/linux/nl80211.h                |  6 ++++++
 net/wireless/nl80211.c                      | 11 +++++++++++
 net/wireless/sme.c                          |  9 ++++++---
 5 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 20d07ef679e8..1f231cd08138 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -422,6 +422,11 @@ static int wil_cfg80211_connect(struct wiphy *wiphy,
 	if (sme->privacy && !rsn_eid)
 		wil_info(wil, "WSC connection\n");
 
+	if (sme->pbss) {
+		wil_err(wil, "connect - PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	bss = cfg80211_get_bss(wiphy, sme->channel, sme->bssid,
 			       sme->ssid, sme->ssid_len,
 			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
@@ -870,6 +875,11 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
+	if (info->pbss) {
+		wil_err(wil, "AP: PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	switch (info->hidden_ssid) {
 	case NL80211_HIDDEN_SSID_NOT_IN_USE:
 		hidden_ssid = WMI_HIDDEN_SSID_DISABLED;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9bcaaf7cd15a..9e1b24c29f0c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -712,6 +712,8 @@ struct cfg80211_acl_data {
  * @p2p_opp_ps: P2P opportunistic PS
  * @acl: ACL configuration used by the drivers which has support for
  *	MAC address based access control
+ * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
+ *	networks.
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -730,6 +732,7 @@ struct cfg80211_ap_settings {
 	u8 p2p_ctwindow;
 	bool p2p_opp_ps;
 	const struct cfg80211_acl_data *acl;
+	bool pbss;
 };
 
 /**
@@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa:  VHT Capability overrides
  * @vht_capa_mask: The bits of vht_capa which are to be used.
+ * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
+ *	networks.
  */
 struct cfg80211_connect_params {
 	struct ieee80211_channel *channel;
@@ -1910,6 +1915,7 @@ struct cfg80211_connect_params {
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa;
 	struct ieee80211_vht_cap vht_capa_mask;
+	bool pbss;
 };
 
 /**
@@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys;
  *	registered for unexpected class 3 frames (AP mode)
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
+ * @conn_bss_type: connecting/connected BSS type
  * @ibss_fixed: (private) IBSS is using fixed BSSID
  * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
@@ -3519,6 +3526,7 @@ struct wireless_dev {
 	u8 ssid_len, mesh_id_len, mesh_id_up_len;
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
+	enum ieee80211_bss_type conn_bss_type;
 
 	struct list_head event_list;
 	spinlock_t event_lock;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5b7b5ebe7ca8..7758969a2a8e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1789,6 +1789,10 @@ enum nl80211_commands {
  *	thus it must not specify the number of iterations, only the interval
  *	between scans. The scan plans are executed sequentially.
  *	Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan.
+ * @NL80211_ATTR_PBSS: flag attribute. If set it means operate
+ *	in a PBSS. Specified in %NL80211_CMD_CONNECT to request
+ *	connecting to a PCP, and in %NL80211_CMD_START_AP to start
+ *	a PCP instead of AP. Relevant for DMG networks only.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2164,6 +2168,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
 	NL80211_ATTR_SCHED_SCAN_PLANS,
 
+	NL80211_ATTR_PBSS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d4786f2802aa..268cb493f6a5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
 	[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
 	[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+	[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
+		return -EOPNOTSUPP;
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -7980,6 +7985,12 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		connect.flags |= ASSOC_REQ_USE_RRM;
 	}
 
+	connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+		kzfree(connkeys);
+		return -EOPNOTSUPP;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
 	wdev_unlock(dev->ieee80211_ptr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 8020b5b094d4..79bd3a171caa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 			       wdev->conn->params.bssid,
 			       wdev->conn->params.ssid,
 			       wdev->conn->params.ssid_len,
-			       IEEE80211_BSS_TYPE_ESS,
+			       wdev->conn_bss_type,
 			       IEEE80211_PRIVACY(wdev->conn->params.privacy));
 	if (!bss)
 		return NULL;
@@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
 		bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
 				       wdev->ssid, wdev->ssid_len,
-				       IEEE80211_BSS_TYPE_ESS,
+				       wdev->conn_bss_type,
 				       IEEE80211_PRIVACY_ANY);
 		if (bss)
 			cfg80211_hold_bss(bss_from_pub(bss));
@@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev,
 
 	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
 			       wdev->ssid_len,
-			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+			       wdev->conn_bss_type, IEEE80211_PRIVACY_ANY);
 	if (WARN_ON(!bss))
 		return;
 
@@ -1017,6 +1017,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
 	wdev->ssid_len = connect->ssid_len;
 
+	wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
+					      IEEE80211_BSS_TYPE_ESS;
+
 	if (!rdev->ops->connect)
 		err = cfg80211_sme_connect(wdev, connect, prev_bssid);
 	else
-- 
cgit v1.2.3


From f8079d43cf0f1f0171606e75fcef6fe17bb183f2 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Sun, 14 Feb 2016 13:56:35 +0200
Subject: mac80211: move TKIP TX IVs to public part of key struct

Some drivers/devices might want to set the IVs by
themselves (and still let mac80211 generate MMIC).

Specifically, this is needed when the device does
offloading at certain times, and the driver has
to make sure that the IVs of new tx frames (from
the host) are synchronized with IVs that were
potentially used during the offloading.

Similarly to CCMP, move the TX IVs of TKIP keys to the
public part of the key struct, and export a function
to add the IV right into the crypto header.

The public tx_pn field is defined as atomic64, so define
TKIP_PN_TO_IV16/32 helper macros to convert it to iv16/32
when needed.

Since the iv32 used for the p1k cache is taken
directly from the frame, we can safely remove
iv16/32 from being protected by tkip.txlock.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 23 ++++++++++++++++++++---
 net/mac80211/cfg.c         |  5 +++--
 net/mac80211/debugfs_key.c |  5 +++--
 net/mac80211/key.c         |  9 +++++----
 net/mac80211/key.h         | 10 +++++++---
 net/mac80211/tkip.c        | 36 ++++++++++++++++++------------------
 net/mac80211/tkip.h        |  2 --
 net/mac80211/wpa.c         | 11 ++++-------
 8 files changed, 60 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 57147749ae42..15879b49baad 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1521,9 +1521,8 @@ enum ieee80211_key_flags {
  *	wants to be given when a frame is transmitted and needs to be
  *	encrypted in hardware.
  * @cipher: The key's cipher suite selector.
- * @tx_pn: PN used for TX on non-TKIP keys, may be used by the driver
- *	as well if it needs to do software PN assignment by itself
- *	(e.g. due to TSO)
+ * @tx_pn: PN used for TX keys, may be used by the driver as well if it
+ *	needs to do software PN assignment by itself (e.g. due to TSO)
  * @flags: key flags, see &enum ieee80211_key_flags.
  * @keyidx: the key index (0-3)
  * @keylen: key material length
@@ -1549,6 +1548,9 @@ struct ieee80211_key_conf {
 
 #define IEEE80211_MAX_PN_LEN	16
 
+#define TKIP_PN_TO_IV16(pn) ((u16)((pn) & 0xffff)) /* bits 0-15 */
+#define TKIP_PN_TO_IV32(pn) ((u32)(((pn) >> 16) & 0xffffffff)) /* bits 16-47 */
+
 /**
  * struct ieee80211_key_seq - key sequence counter
  *
@@ -4446,6 +4448,21 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
 			    struct sk_buff *skb, u8 *p2k);
 
+/**
+ * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos
+ *
+ * @pos: start of crypto header
+ * @keyconf: the parameter passed with the set key; supplies the key index
+ * @pn: PN to add; bits 0-15 are written as IV16, bits 16-47 as IV32
+ *
+ * Returns: pointer to the octet following IVs (i.e. beginning of
+ * the packet payload)
+ *
+ * This function writes the TKIP IV and Ext. IV for @pn to @pos (which
+ * should point to the crypto header); it does not update the key's tx_pn.
+ */
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);
+
 /**
  * ieee80211_get_key_tx_seq - get key TX sequence counter
  *
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 66d22de93c8d..fe1704c4e8fb 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -339,8 +339,9 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		iv32 = key->u.tkip.tx.iv32;
-		iv16 = key->u.tkip.tx.iv16;
+		pn64 = atomic64_read(&key->conf.tx_pn);
+		iv32 = TKIP_PN_TO_IV32(pn64);
+		iv16 = TKIP_PN_TO_IV16(pn64);
 
 		if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
 		    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 7961e7d0b61e..a2ef95f16f11 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -132,9 +132,10 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
 		len = scnprintf(buf, sizeof(buf), "\n");
 		break;
 	case WLAN_CIPHER_SUITE_TKIP:
+		pn = atomic64_read(&key->conf.tx_pn);
 		len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
-				key->u.tkip.tx.iv32,
-				key->u.tkip.tx.iv16);
+				TKIP_PN_TO_IV32(pn),
+				TKIP_PN_TO_IV16(pn));
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 5e5bc599da4c..f9c4cb9c6e06 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -945,8 +945,9 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		seq->tkip.iv32 = key->u.tkip.tx.iv32;
-		seq->tkip.iv16 = key->u.tkip.tx.iv16;
+		pn64 = atomic64_read(&key->conf.tx_pn);
+		seq->tkip.iv32 = TKIP_PN_TO_IV32(pn64);
+		seq->tkip.iv16 = TKIP_PN_TO_IV16(pn64);
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
@@ -1039,8 +1040,8 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		key->u.tkip.tx.iv32 = seq->tkip.iv32;
-		key->u.tkip.tx.iv16 = seq->tkip.iv16;
+		pn64 = (u64)seq->tkip.iv16 | ((u64)seq->tkip.iv32 << 16);
+		atomic64_set(&key->conf.tx_pn, pn64);
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 9951ef06323e..4aa20cef0859 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -44,13 +44,17 @@ enum ieee80211_internal_tkip_state {
 };
 
 struct tkip_ctx {
-	u32 iv32;	/* current iv32 */
-	u16 iv16;	/* current iv16 */
 	u16 p1k[5];	/* p1k cache */
 	u32 p1k_iv32;	/* iv32 for which p1k computed */
 	enum ieee80211_internal_tkip_state state;
 };
 
+struct tkip_ctx_rx {
+	struct tkip_ctx ctx;	/* p1k cache and its state */
+	u32 iv32;	/* current iv32, compared for replay detection */
+	u16 iv16;	/* current iv16, compared for replay detection */
+};
+
 struct ieee80211_key {
 	struct ieee80211_local *local;
 	struct ieee80211_sub_if_data *sdata;
@@ -71,7 +75,7 @@ struct ieee80211_key {
 			struct tkip_ctx tx;
 
 			/* last received RSC */
-			struct tkip_ctx rx[IEEE80211_NUM_TIDS];
+			struct tkip_ctx_rx rx[IEEE80211_NUM_TIDS];
 
 			/* number of mic failures */
 			u32 mic_failures;
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 0ae207771a58..b3622823bad2 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
+ * Copyright (C) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -142,15 +143,14 @@ static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx,
 /* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets
  * of the IV. Returns pointer to the octet following IVs (i.e., beginning of
  * the packet payload). */
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key)
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn)
 {
-	lockdep_assert_held(&key->u.tkip.txlock);
-
-	pos = write_tkip_iv(pos, key->u.tkip.tx.iv16);
-	*pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */;
-	put_unaligned_le32(key->u.tkip.tx.iv32, pos);
+	pos = write_tkip_iv(pos, TKIP_PN_TO_IV16(pn));
+	*pos++ = (keyconf->keyidx << 6) | (1 << 5) /* Ext IV */;
+	put_unaligned_le32(TKIP_PN_TO_IV32(pn), pos);	/* Ext. IV = IV32, LE */
 	return pos + 4;
 }
+EXPORT_SYMBOL_GPL(ieee80211_tkip_add_iv);
 
 static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32)
 {
@@ -250,6 +250,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
 	u8 rc4key[16], keyid, *pos = payload;
 	int res;
 	const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+	struct tkip_ctx_rx *rx_ctx = &key->u.tkip.rx[queue];
 
 	if (payload_len < 12)
 		return -1;
@@ -265,37 +266,36 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
 	if ((keyid >> 6) != key->conf.keyidx)
 		return TKIP_DECRYPT_INVALID_KEYIDX;
 
-	if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT &&
-	    (iv32 < key->u.tkip.rx[queue].iv32 ||
-	     (iv32 == key->u.tkip.rx[queue].iv32 &&
-	      iv16 <= key->u.tkip.rx[queue].iv16)))
+	if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT &&
+	    (iv32 < rx_ctx->iv32 ||
+	     (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16)))
 		return TKIP_DECRYPT_REPLAY;
 
 	if (only_iv) {
 		res = TKIP_DECRYPT_OK;
-		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+		rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
 		goto done;
 	}
 
-	if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT ||
-	    key->u.tkip.rx[queue].iv32 != iv32) {
+	if (rx_ctx->ctx.state == TKIP_STATE_NOT_INIT ||
+	    rx_ctx->iv32 != iv32) {
 		/* IV16 wrapped around - perform TKIP phase 1 */
-		tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32);
+		tkip_mixing_phase1(tk, &rx_ctx->ctx, ta, iv32);
 	}
 	if (key->local->ops->update_tkip_key &&
 	    key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
-	    key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) {
+	    rx_ctx->ctx.state != TKIP_STATE_PHASE1_HW_UPLOADED) {
 		struct ieee80211_sub_if_data *sdata = key->sdata;
 
 		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			sdata = container_of(key->sdata->bss,
 					struct ieee80211_sub_if_data, u.ap);
 		drv_update_tkip_key(key->local, sdata, &key->conf, key->sta,
-				iv32, key->u.tkip.rx[queue].p1k);
-		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+				iv32, rx_ctx->ctx.p1k);
+		rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
 	}
 
-	tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key);
+	tkip_mixing_phase2(tk, &rx_ctx->ctx, iv16, rc4key);
 
 	res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12);
  done:
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
index e3ecb659b90a..a1bcbfbefe7c 100644
--- a/net/mac80211/tkip.h
+++ b/net/mac80211/tkip.h
@@ -13,8 +13,6 @@
 #include <linux/crypto.h>
 #include "key.h"
 
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key);
-
 int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
 				struct ieee80211_key *key,
 				struct sk_buff *skb,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index d824c38971ed..18848258adde 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2008, Jouni Malinen <j@w1.fi>
+ * Copyright (C) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -183,7 +184,6 @@ mic_fail_no_key:
 	return RX_DROP_UNUSABLE;
 }
 
-
 static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -191,6 +191,7 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	unsigned int hdrlen;
 	int len, tail;
+	u64 pn;
 	u8 *pos;
 
 	if (info->control.hw_key &&
@@ -222,12 +223,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 		return 0;
 
 	/* Increase IV for the frame */
-	spin_lock(&key->u.tkip.txlock);
-	key->u.tkip.tx.iv16++;
-	if (key->u.tkip.tx.iv16 == 0)
-		key->u.tkip.tx.iv32++;
-	pos = ieee80211_tkip_add_iv(pos, key);
-	spin_unlock(&key->u.tkip.txlock);
+	pn = atomic64_inc_return(&key->conf.tx_pn);
+	pos = ieee80211_tkip_add_iv(pos, &key->conf, pn);
 
 	/* hwaccel - with software IV */
 	if (info->control.hw_key)
-- 
cgit v1.2.3


From ca48ebbc7ea7e82e3ae4b55aacead0cdb54ff008 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Mon, 15 Feb 2016 12:34:10 +0200
Subject: mac80211: remove ieee80211_get_key_tx_seq/ieee80211_set_key_tx_seq

Since the PNs of all the tx keys are now tracked in the public
part of the key struct (with atomic counter), we no longer
need these functions.

dvm and vt665{5,6} are currently the only users of these functions,
so update them accordingly.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/dvm/lib.c | 20 +++----
 drivers/staging/vt6655/rxtx.c                | 12 ++--
 drivers/staging/vt6656/rxtx.c                | 12 ++--
 include/net/mac80211.h                       | 34 -----------
 net/mac80211/key.c                           | 87 ----------------------------
 5 files changed, 24 insertions(+), 141 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
index 4841be2aa499..1799469268ea 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
@@ -943,14 +943,16 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw,
 	switch (key->cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
 		if (sta) {
+			u64 pn64;
+
 			tkip_sc = data->rsc_tsc->all_tsc_rsc.tkip.unicast_rsc;
 			tkip_tx_sc = &data->rsc_tsc->all_tsc_rsc.tkip.tsc;
 
 			rx_p1ks = data->tkip->rx_uni;
 
-			ieee80211_get_key_tx_seq(key, &seq);
-			tkip_tx_sc->iv16 = cpu_to_le16(seq.tkip.iv16);
-			tkip_tx_sc->iv32 = cpu_to_le32(seq.tkip.iv32);
+			pn64 = atomic64_read(&key->tx_pn);
+			tkip_tx_sc->iv16 = cpu_to_le16(TKIP_PN_TO_IV16(pn64));
+			tkip_tx_sc->iv32 = cpu_to_le32(TKIP_PN_TO_IV32(pn64));
 
 			ieee80211_get_tkip_p1k_iv(key, seq.tkip.iv32, p1k);
 			iwlagn_convert_p1k(p1k, data->tkip->tx.p1k);
@@ -996,19 +998,13 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw,
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 		if (sta) {
-			u8 *pn = seq.ccmp.pn;
+			u64 pn64;
 
 			aes_sc = data->rsc_tsc->all_tsc_rsc.aes.unicast_rsc;
 			aes_tx_sc = &data->rsc_tsc->all_tsc_rsc.aes.tsc;
 
-			ieee80211_get_key_tx_seq(key, &seq);
-			aes_tx_sc->pn = cpu_to_le64(
-					(u64)pn[5] |
-					((u64)pn[4] << 8) |
-					((u64)pn[3] << 16) |
-					((u64)pn[2] << 24) |
-					((u64)pn[1] << 32) |
-					((u64)pn[0] << 40));
+			pn64 = atomic64_read(&key->tx_pn);
+			aes_tx_sc->pn = cpu_to_le64(pn64);
 		} else
 			aes_sc = data->rsc_tsc->all_tsc_rsc.aes.multicast_rsc;
 
diff --git a/drivers/staging/vt6655/rxtx.c b/drivers/staging/vt6655/rxtx.c
index b668db6a45fb..1a2dda09b69d 100644
--- a/drivers/staging/vt6655/rxtx.c
+++ b/drivers/staging/vt6655/rxtx.c
@@ -1210,7 +1210,7 @@ static void vnt_fill_txkey(struct ieee80211_hdr *hdr, u8 *key_buffer,
 			   struct sk_buff *skb,	u16 payload_len,
 			   struct vnt_mic_hdr *mic_hdr)
 {
-	struct ieee80211_key_seq seq;
+	u64 pn64;
 	u8 *iv = ((u8 *)hdr + ieee80211_get_hdrlen_from_skb(skb));
 
 	/* strip header and icv len from payload */
@@ -1243,9 +1243,13 @@ static void vnt_fill_txkey(struct ieee80211_hdr *hdr, u8 *key_buffer,
 		mic_hdr->payload_len = cpu_to_be16(payload_len);
 		ether_addr_copy(mic_hdr->mic_addr2, hdr->addr2);
 
-		ieee80211_get_key_tx_seq(tx_key, &seq);
-
-		memcpy(mic_hdr->ccmp_pn, seq.ccmp.pn, IEEE80211_CCMP_PN_LEN);
+		pn64 = atomic64_read(&tx_key->tx_pn);
+		mic_hdr->ccmp_pn[5] = pn64;
+		mic_hdr->ccmp_pn[4] = pn64 >> 8;
+		mic_hdr->ccmp_pn[3] = pn64 >> 16;
+		mic_hdr->ccmp_pn[2] = pn64 >> 24;
+		mic_hdr->ccmp_pn[1] = pn64 >> 32;
+		mic_hdr->ccmp_pn[0] = pn64 >> 40;
 
 		if (ieee80211_has_a4(hdr->frame_control))
 			mic_hdr->hlen = cpu_to_be16(28);
diff --git a/drivers/staging/vt6656/rxtx.c b/drivers/staging/vt6656/rxtx.c
index efb54f53b4f9..76378d225b46 100644
--- a/drivers/staging/vt6656/rxtx.c
+++ b/drivers/staging/vt6656/rxtx.c
@@ -719,7 +719,7 @@ static void vnt_fill_txkey(struct vnt_usb_send_context *tx_context,
 	u16 payload_len, struct vnt_mic_hdr *mic_hdr)
 {
 	struct ieee80211_hdr *hdr = tx_context->hdr;
-	struct ieee80211_key_seq seq;
+	u64 pn64;
 	u8 *iv = ((u8 *)hdr + ieee80211_get_hdrlen_from_skb(skb));
 
 	/* strip header and icv len from payload */
@@ -752,9 +752,13 @@ static void vnt_fill_txkey(struct vnt_usb_send_context *tx_context,
 		mic_hdr->payload_len = cpu_to_be16(payload_len);
 		ether_addr_copy(mic_hdr->mic_addr2, hdr->addr2);
 
-		ieee80211_get_key_tx_seq(tx_key, &seq);
-
-		memcpy(mic_hdr->ccmp_pn, seq.ccmp.pn, IEEE80211_CCMP_PN_LEN);
+		pn64 = atomic64_read(&tx_key->tx_pn);
+		mic_hdr->ccmp_pn[5] = pn64;
+		mic_hdr->ccmp_pn[4] = pn64 >> 8;
+		mic_hdr->ccmp_pn[3] = pn64 >> 16;
+		mic_hdr->ccmp_pn[2] = pn64 >> 24;
+		mic_hdr->ccmp_pn[1] = pn64 >> 32;
+		mic_hdr->ccmp_pn[0] = pn64 >> 40;
 
 		if (ieee80211_has_a4(hdr->frame_control))
 			mic_hdr->hlen = cpu_to_be16(28);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 15879b49baad..66155d3ad7e6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4463,23 +4463,6 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
  */
 u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);
 
-/**
- * ieee80211_get_key_tx_seq - get key TX sequence counter
- *
- * @keyconf: the parameter passed with the set key
- * @seq: buffer to receive the sequence data
- *
- * This function allows a driver to retrieve the current TX IV/PN
- * for the given key. It must not be called if IV generation is
- * offloaded to the device.
- *
- * Note that this function may only be called when no TX processing
- * can be done concurrently, for example when queues are stopped
- * and the stop has been synchronized.
- */
-void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq);
-
 /**
  * ieee80211_get_key_rx_seq - get key RX sequence counter
  *
@@ -4499,23 +4482,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq);
 
-/**
- * ieee80211_set_key_tx_seq - set key TX sequence counter
- *
- * @keyconf: the parameter passed with the set key
- * @seq: new sequence data
- *
- * This function allows a driver to set the current TX IV/PNs for the
- * given key. This is useful when resuming from WoWLAN sleep and the
- * device may have transmitted frames using the PTK, e.g. replies to
- * ARP requests.
- *
- * Note that this function may only be called when no TX processing
- * can be done concurrently.
- */
-void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq);
-
 /**
  * ieee80211_set_key_rx_seq - set key RX sequence counter
  *
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index f9c4cb9c6e06..3df7b0392d30 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -932,51 +932,6 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
 }
 EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify);
 
-void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq)
-{
-	struct ieee80211_key *key;
-	u64 pn64;
-
-	if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV)))
-		return;
-
-	key = container_of(keyconf, struct ieee80211_key, conf);
-
-	switch (key->conf.cipher) {
-	case WLAN_CIPHER_SUITE_TKIP:
-		pn64 = atomic64_read(&key->conf.tx_pn);
-		seq->tkip.iv32 = TKIP_PN_TO_IV32(pn64);
-		seq->tkip.iv16 = TKIP_PN_TO_IV16(pn64);
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-	case WLAN_CIPHER_SUITE_CCMP_256:
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_cmac));
-	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_gmac));
-	case WLAN_CIPHER_SUITE_GCMP:
-	case WLAN_CIPHER_SUITE_GCMP_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), gcmp));
-		pn64 = atomic64_read(&key->conf.tx_pn);
-		seq->ccmp.pn[5] = pn64;
-		seq->ccmp.pn[4] = pn64 >> 8;
-		seq->ccmp.pn[3] = pn64 >> 16;
-		seq->ccmp.pn[2] = pn64 >> 24;
-		seq->ccmp.pn[1] = pn64 >> 32;
-		seq->ccmp.pn[0] = pn64 >> 40;
-		break;
-	default:
-		WARN_ON(1);
-	}
-}
-EXPORT_SYMBOL(ieee80211_get_key_tx_seq);
-
 void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq)
 {
@@ -1030,48 +985,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 }
 EXPORT_SYMBOL(ieee80211_get_key_rx_seq);
 
-void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq)
-{
-	struct ieee80211_key *key;
-	u64 pn64;
-
-	key = container_of(keyconf, struct ieee80211_key, conf);
-
-	switch (key->conf.cipher) {
-	case WLAN_CIPHER_SUITE_TKIP:
-		pn64 = (u64)seq->tkip.iv16 | ((u64)seq->tkip.iv32 << 16);
-		atomic64_set(&key->conf.tx_pn, pn64);
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-	case WLAN_CIPHER_SUITE_CCMP_256:
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_cmac));
-	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_gmac));
-	case WLAN_CIPHER_SUITE_GCMP:
-	case WLAN_CIPHER_SUITE_GCMP_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), gcmp));
-		pn64 = (u64)seq->ccmp.pn[5] |
-		       ((u64)seq->ccmp.pn[4] << 8) |
-		       ((u64)seq->ccmp.pn[3] << 16) |
-		       ((u64)seq->ccmp.pn[2] << 24) |
-		       ((u64)seq->ccmp.pn[1] << 32) |
-		       ((u64)seq->ccmp.pn[0] << 40);
-		atomic64_set(&key->conf.tx_pn, pn64);
-		break;
-	default:
-		WARN_ON(1);
-		break;
-	}
-}
-EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq);
-
 void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq)
 {
-- 
cgit v1.2.3


From 65554d07adfc22bb9e14f6df8c609a646f869a74 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 16 Feb 2016 12:48:17 +0200
Subject: mac80211: provide interface to driver to set VHT MU-MIMO data

Provide an interface to the lower level driver to set the VHT
MU-MIMO data. This is needed for example when there is an update
of the group data during low power state, where the management
frame will not be passed to the host at all.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 15 +++++++++++++++
 net/mac80211/vht.c     | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 66155d3ad7e6..23f2a5ecf669 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5445,6 +5445,21 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
 	return ieee80211_iftype_p2p(vif->type, vif->p2p);
 }
 
+/**
+ * ieee80211_update_mu_groups - set the VHT MU-MIMO group data
+ *
+ * @vif: the specified virtual interface
+ * @membership: 64 bits array - a bit is set if station is member of the group
+ * @position: 2 bits per group id indicating the position in the group
+ *
+ * Note: This function assumes that the given vif is valid and the position and
+ * membership data is of the correct size and are in the same byte order as the
+ * matching GroupId management frame.
+ * Calls to this function need to be serialized with RX path.
+ */
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+				const u8 *membership, const u8 *position);
+
 void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif,
 				   int rssi_min_thold,
 				   int rssi_max_thold);
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 341d192cea52..f8f161179b5d 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -2,7 +2,7 @@
  * VHT handling
  *
  * Portions of this file
- * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -469,6 +469,20 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
 }
 
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+				const u8 *membership, const u8 *position)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+	if (WARN_ON_ONCE(!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)))
+		return;
+
+	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
+	memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
+}
+EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
+
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				 struct sta_info *sta, u8 opmode,
 				 enum ieee80211_band band)
-- 
cgit v1.2.3


From b5a33d52595f0cb153f09bf45a5dcd66a7418dbb Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 16 Feb 2016 12:48:18 +0200
Subject: mac80211: move MU_MIMO_OWNER flag to ieee80211_vif

Drivers may need to track which vif is using VHT MU-MIMO.
Move the flag indicating the ownership of MU_MIMO to
ieee80211_vif.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  2 ++
 net/mac80211/ieee80211_i.h |  2 --
 net/mac80211/mlme.c        | 11 ++++++-----
 net/mac80211/util.c        |  2 +-
 net/mac80211/vht.c         |  7 +++----
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 23f2a5ecf669..0c09da34b67a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1382,6 +1382,7 @@ enum ieee80211_vif_flags {
  * @csa_active: marks whether a channel switch is going on. Internally it is
  *	write-protected by sdata_lock and local->mtx so holding either is fine
  *	for read access.
+ * @mu_mimo_owner: indicates interface owns MU-MIMO capability
  * @driver_flags: flags/capabilities the driver has for this interface,
  *	these need to be set (or cleared) when the interface is added
  *	or, if supported by the driver, the interface type is changed
@@ -1408,6 +1409,7 @@ struct ieee80211_vif {
 	u8 addr[ETH_ALEN];
 	bool p2p;
 	bool csa_active;
+	bool mu_mimo_owner;
 
 	u8 cab_queue;
 	u8 hw_queue[IEEE80211_NUM_ACS];
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a49c10361f1c..1630975c89f1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -716,7 +716,6 @@ struct ieee80211_if_mesh {
  *	back to wireless media and to the local net stack.
  * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
  * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
- * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability
  */
 enum ieee80211_sub_if_data_flags {
 	IEEE80211_SDATA_ALLMULTI		= BIT(0),
@@ -724,7 +723,6 @@ enum ieee80211_sub_if_data_flags {
 	IEEE80211_SDATA_DONT_BRIDGE_PACKETS	= BIT(3),
 	IEEE80211_SDATA_DISCONNECT_RESUME	= BIT(4),
 	IEEE80211_SDATA_IN_DRIVER		= BIT(5),
-	IEEE80211_SDATA_MU_MIMO_OWNER		= BIT(6),
 };
 
 /**
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 857089de475f..f41625bcd879 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6,7 +6,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -559,7 +559,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_sub_if_data *other;
 
 		list_for_each_entry_rcu(other, &local->interfaces, list) {
-			if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) {
+			if (other->vif.mu_mimo_owner) {
 				disable_mu_mimo = true;
 				break;
 			}
@@ -567,7 +567,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		if (disable_mu_mimo)
 			cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
 		else
-			sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER;
+			sdata->vif.mu_mimo_owner = true;
 	}
 
 	mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
@@ -2052,7 +2052,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(sdata->vif.bss_conf.mu_group.position, 0,
 	       sizeof(sdata->vif.bss_conf.mu_group.position));
 	changed |= BSS_CHANGED_MU_GROUPS;
-	sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+	sdata->vif.mu_mimo_owner = false;
 
 	sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
 
@@ -2509,7 +2509,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		eth_zero_addr(sdata->u.mgd.bssid);
 		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
-		sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+		sdata->vif.mu_mimo_owner = false;
+
 		mutex_lock(&sdata->local->mtx);
 		ieee80211_vif_release_channel(sdata);
 		mutex_unlock(&sdata->local->mtx);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f1e5b76eda70..89f71799df84 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1928,7 +1928,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_IDLE |
 			  BSS_CHANGED_TXPOWER;
 
-		if (sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)
+		if (sdata->vif.mu_mimo_owner)
 			changed |= BSS_CHANGED_MU_GROUPS;
 
 		switch (sdata->vif.type) {
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index f8f161179b5d..89e04d55aa18 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -450,7 +450,7 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
 
-	if (!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER))
+	if (!sdata->vif.mu_mimo_owner)
 		return;
 
 	if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
@@ -472,10 +472,9 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
 				const u8 *membership, const u8 *position)
 {
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 
-	if (WARN_ON_ONCE(!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)))
+	if (WARN_ON_ONCE(!vif->mu_mimo_owner))
 		return;
 
 	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
-- 
cgit v1.2.3


From 0c9ca11b1ae8eb16c1b6bbae91991392d2321372 Mon Sep 17 00:00:00 2001
From: Beni Lev <beni.lev@intel.com>
Date: Wed, 17 Feb 2016 20:30:00 +0200
Subject: cfg80211: Add global RRM capability

Today, the supplicant will add the RRM capabilities
Information Element in the association request only if
Quiet period is supported (NL80211_FEATURE_QUIET).

Quiet is one of many RRM features, and there are other RRM
features that are not related to Quiet (e.g. neighbor
report). Therefore, requiring Quiet to enable RRM is too
restrictive.
Some of the features, like neighbor report, can be
supported by user space without any help from the kernel.
Hence adding the RRM capabilities IE to association request
should be the sole user space's decision.
Removing the RRM dependency on Quiet in the driver solves
this problem, but using an old driver with a user space
tool that would not require Quiet feature would be
problematic: the user space would add NL80211_ATTR_USE_RRM
in the association request even if the kernel doesn't
advertize NL80211_FEATURE_QUIET and the association would
be denied by the kernel.

This solution adds a global RRM capability, which tells user
space that it can request publication of the RRM capabilities IE
without any specific feature support in the kernel.

Signed-off-by: Beni Lev <beni.lev@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  8 ++++++++
 net/wireless/nl80211.c       | 18 +++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7758969a2a8e..5a30a7563633 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1727,6 +1727,8 @@ enum nl80211_commands {
  *	underlying device supports these minimal RRM features:
  *		%NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES,
  *		%NL80211_FEATURE_QUIET,
+ *	Or, if global RRM is supported, see:
+ *		%NL80211_EXT_FEATURE_RRM
  *	If this flag is used, driver must add the Power Capabilities IE to the
  *	association request. In addition, it must also set the RRM capability
  *	flag in the association request's Capability Info field.
@@ -4402,12 +4404,18 @@ enum nl80211_feature_flags {
 /**
  * enum nl80211_ext_feature_index - bit index of extended features.
  * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates.
+ * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can
+ *	request to use RRM (see %NL80211_ATTR_USE_RRM) with
+ *	%NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set
+ *	the ASSOC_REQ_USE_RRM flag in the association request even if
+ *	NL80211_FEATURE_QUIET is not advertized.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
 enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
+	NL80211_EXT_FEATURE_RRM,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 268cb493f6a5..90890f183c0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2016	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -7286,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM))
 			return -EINVAL;
 		req.flags |= ASSOC_REQ_USE_RRM;
 	}
@@ -7976,9 +7978,11 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM)) {
 			kzfree(connkeys);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 648b50dd6abf8e6e5b589bb8e6873a4596389dbe Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 25 Jan 2016 12:03:46 +0300
Subject: net: rfkill: add rfkill_find_type function

Helper for finding the type based on name. Useful if the
type needs to be determined based on device property.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
[modify rfkill_types array and BUILD_BUG_ON to not cause errors]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h | 15 +++++++++++++
 net/rfkill/core.c      | 58 ++++++++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 7af625f6d226..e6a0031d1b1f 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -213,6 +213,15 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw);
  * @rfkill: rfkill struct to query
  */
 bool rfkill_blocked(struct rfkill *rfkill);
+
+/**
+ * rfkill_find_type - Helper for finding rfkill type by name
+ * @name: the name of the type
+ *
+ * Returns enum rfkill_type that corresponds to the name.
+ */
+enum rfkill_type rfkill_find_type(const char *name);
+
 #else /* !RFKILL */
 static inline struct rfkill * __must_check
 rfkill_alloc(const char *name,
@@ -269,6 +278,12 @@ static inline bool rfkill_blocked(struct rfkill *rfkill)
 {
 	return false;
 }
+
+static inline enum rfkill_type rfkill_find_type(const char *name)
+{
+	return RFKILL_TYPE_ALL;
+}
+
 #endif /* RFKILL || RFKILL_MODULE */
 
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a805831d5d9b..2a23479a49f2 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -572,6 +572,34 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 }
 EXPORT_SYMBOL(rfkill_set_states);
 
+static const char * const rfkill_types[] = {
+	NULL, /* RFKILL_TYPE_ALL */
+	"wlan",
+	"bluetooth",
+	"ultrawideband",
+	"wimax",
+	"wwan",
+	"gps",
+	"fm",
+	"nfc",
+};
+
+enum rfkill_type rfkill_find_type(const char *name)
+{
+	int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES);
+
+	if (!name)
+		return RFKILL_TYPE_ALL;
+
+	for (i = 1; i < NUM_RFKILL_TYPES; i++)
+		if (!strcmp(name, rfkill_types[i]))
+			return i;
+	return RFKILL_TYPE_ALL;
+}
+EXPORT_SYMBOL(rfkill_find_type);
+
 static ssize_t name_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -581,38 +609,12 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(name);
 
-static const char *rfkill_get_type_str(enum rfkill_type type)
-{
-	BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1);
-
-	switch (type) {
-	case RFKILL_TYPE_WLAN:
-		return "wlan";
-	case RFKILL_TYPE_BLUETOOTH:
-		return "bluetooth";
-	case RFKILL_TYPE_UWB:
-		return "ultrawideband";
-	case RFKILL_TYPE_WIMAX:
-		return "wimax";
-	case RFKILL_TYPE_WWAN:
-		return "wwan";
-	case RFKILL_TYPE_GPS:
-		return "gps";
-	case RFKILL_TYPE_FM:
-		return "fm";
-	case RFKILL_TYPE_NFC:
-		return "nfc";
-	default:
-		BUG();
-	}
-}
-
 static ssize_t type_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
 
-	return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type));
+	return sprintf(buf, "%s\n", rfkill_types[rfkill->type]);
 }
 static DEVICE_ATTR_RO(type);
 
@@ -750,7 +752,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
 	if (error)
 		return error;
 	error = add_uevent_var(env, "RFKILL_TYPE=%s",
-			       rfkill_get_type_str(rfkill->type));
+			       rfkill_types[rfkill->type]);
 	if (error)
 		return error;
 	spin_lock_irqsave(&rfkill->lock, flags);
-- 
cgit v1.2.3


From fb2e6b7b7b02ab35a9d5355a69097a6f60c69d38 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 25 Jan 2016 12:03:49 +0300
Subject: net: rfkill: gpio: remove rfkill_gpio_platform_data

No more users for it.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill-gpio.h | 37 -------------------------------------
 net/rfkill/Kconfig          |  3 +--
 net/rfkill/rfkill-gpio.c    |  8 --------
 3 files changed, 1 insertion(+), 47 deletions(-)
 delete mode 100644 include/linux/rfkill-gpio.h

(limited to 'include')

diff --git a/include/linux/rfkill-gpio.h b/include/linux/rfkill-gpio.h
deleted file mode 100644
index 20bcb55498cd..000000000000
--- a/include/linux/rfkill-gpio.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2011, NVIDIA Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-
-#ifndef __RFKILL_GPIO_H
-#define __RFKILL_GPIO_H
-
-#include <linux/types.h>
-#include <linux/rfkill.h>
-
-/**
- * struct rfkill_gpio_platform_data - platform data for rfkill gpio device.
- * for unused gpio's, the expected value is -1.
- * @name:		name for the gpio rf kill instance
- */
-
-struct rfkill_gpio_platform_data {
-	char			*name;
-	enum rfkill_type	type;
-};
-
-#endif /* __RFKILL_GPIO_H */
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 598d374f6a35..868f1ad0415a 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -41,5 +41,4 @@ config RFKILL_GPIO
 	default n
 	help
 	  If you say yes here you get support of a generic gpio RFKILL
-	  driver. The platform should fill in the appropriate fields in the
-	  rfkill_gpio_platform_data structure and pass that to the driver.
+	  driver.
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 1a9c0316aad1..76c01cbd56e3 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -27,8 +27,6 @@
 #include <linux/acpi.h>
 #include <linux/gpio/consumer.h>
 
-#include <linux/rfkill-gpio.h>
-
 struct rfkill_gpio_data {
 	const char		*name;
 	enum rfkill_type	type;
@@ -89,7 +87,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
 
 static int rfkill_gpio_probe(struct platform_device *pdev)
 {
-	struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
 	struct rfkill_gpio_data *rfkill;
 	struct gpio_desc *gpio;
 	const char *type_name;
@@ -111,11 +108,6 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
 		ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill);
 		if (ret)
 			return ret;
-	} else if (pdata) {
-		rfkill->name = pdata->name;
-		rfkill->type = pdata->type;
-	} else {
-		return -ENODEV;
 	}
 
 	rfkill->clk = devm_clk_get(&pdev->dev, NULL);
-- 
cgit v1.2.3


From ada68c31ba9c02d7aabdd87db979fe670b499d54 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Mon, 22 Feb 2016 18:17:23 +0200
Subject: net/mlx5: Introduce a new header file for physical port functions

All the device physical port access functions are implemented in the
port.c file.
We just extract the exposure of these functions from driver.h into a
dedicated header file called port.h.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c              |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c |  1 +
 include/linux/mlx5/driver.h                    | 31 ------------
 include/linux/mlx5/port.h                      | 69 ++++++++++++++++++++++++++
 5 files changed, 72 insertions(+), 31 deletions(-)
 create mode 100644 include/linux/mlx5/port.h

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 03c418ccbc98..e1cea4415704 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -42,6 +42,7 @@
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index aac071a7e830..15f6cdb842d5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -38,6 +38,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/qp.h>
 #include <linux/mlx5/cq.h>
+#include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/transobj.h>
 #include "wq.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index a87e773e93f3..1e863216ac4a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -32,6 +32,7 @@
 
 #include <linux/module.h>
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/port.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1e3006dcf35d..02adc67720ce 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -794,37 +794,6 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
 			 int size_in, void *data_out, int size_out,
 			 u16 reg_num, int arg, int write);
 
-int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
-int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
-			 int ptys_size, int proto_mask, u8 local_port);
-int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
-			      u32 *proto_cap, int proto_mask);
-int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
-				u32 *proto_admin, int proto_mask);
-int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
-				    u8 *link_width_oper, u8 local_port);
-int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
-			       u8 *proto_oper, int proto_mask,
-			       u8 local_port);
-int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
-			int proto_mask);
-int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
-			       enum mlx5_port_status status);
-int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
-				 enum mlx5_port_status *status);
-
-int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
-void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
-			      u8 port);
-
-int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
-			      u8 *vl_hw_cap, u8 local_port);
-
-int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
-int mlx5_query_port_pause(struct mlx5_core_dev *dev,
-			  u32 *rx_pause, u32 *tx_pause);
-
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
new file mode 100644
index 000000000000..7accd4a65da5
--- /dev/null
+++ b/include/linux/mlx5/port.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_PORT_H__
+#define __MLX5_PORT_H__
+
+#include <linux/mlx5/driver.h>
+
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
+int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
+			 int ptys_size, int proto_mask, u8 local_port);
+int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
+			      u32 *proto_cap, int proto_mask);
+int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
+				u32 *proto_admin, int proto_mask);
+int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
+				    u8 *link_width_oper, u8 local_port);
+int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
+			       u8 *proto_oper, int proto_mask,
+			       u8 local_port);
+int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
+			int proto_mask);
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status);
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status);
+
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+			      u8 port);
+
+int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
+			      u8 *vl_hw_cap, u8 local_port);
+
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause);
+
+#endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From ad909eb064219a64fd10e9c7d9f39a3042760025 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Mon, 22 Feb 2016 18:17:24 +0200
Subject: net/mlx5: Introduce physical port PFC access functions

Add access functions to set and query a physical port's PFC
(Priority Flow Control) parameters.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 41 ++++++++++++++++++++++++++
 include/linux/mlx5/port.h                      |  4 +++
 2 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 1e863216ac4a..dae70500b6a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -364,3 +364,44 @@ int mlx5_query_port_pause(struct mlx5_core_dev *dev,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_pause);
+
+int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1); /* port is hard-coded to 1 */
+	MLX5_SET(pfcc_reg, in, pfctx, pfc_en_tx);
+	MLX5_SET(pfcc_reg, in, pfcrx, pfc_en_rx);
+	MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_tx); /* mask set to all ones */
+	MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_rx);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PFCC, 0, 1); /* write */
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_pfc);
+
+int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1); /* port is hard-coded to 1 */
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PFCC, 0, 0); /* read */
+	if (err)
+		return err;
+
+	if (pfc_en_tx) /* either output pointer may be NULL */
+		*pfc_en_tx = MLX5_GET(pfcc_reg, out, pfctx);
+
+	if (pfc_en_rx)
+		*pfc_en_rx = MLX5_GET(pfcc_reg, out, pfcrx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_pfc);
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 7accd4a65da5..4b3644caa936 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -66,4 +66,8 @@ int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
 int mlx5_query_port_pause(struct mlx5_core_dev *dev,
 			  u32 *rx_pause, u32 *tx_pause);
 
+int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx);
+int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx,
+			u8 *pfc_en_rx);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 4f3961eeafe0aca8f6b0933899ef0d91f561352d Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 22 Feb 2016 18:17:25 +0200
Subject: net/mlx5: Introduce physical port TC/prio access functions

Add access functions to set and query a physical port TC groups
and prio parameters.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 76 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  2 +
 include/linux/mlx5/mlx5_ifc.h                  | 49 ++++++++++++++++-
 include/linux/mlx5/port.h                      |  6 ++
 4 files changed, 132 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index dae70500b6a9..569100d3f57b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -405,3 +405,79 @@ int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_pfc);
+
+int mlx5_max_tc(struct mlx5_core_dev *mdev)
+{
+	u8 num_tc = MLX5_CAP_GEN(mdev, max_tc) ? : 8; /* cap of 0 => use 8 */
+
+	return num_tc - 1; /* highest valid TC index, not a count */
+}
+
+int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc)
+{
+	u32 in[MLX5_ST_SZ_DW(qtct_reg)];
+	u32 out[MLX5_ST_SZ_DW(qtct_reg)];
+	int err;
+	int i;
+
+	memset(in, 0, sizeof(in));
+	for (i = 0; i < 8; i++) { /* prio_tc[] is read for indices 0..7 */
+		if (prio_tc[i] > mlx5_max_tc(mdev))
+			return -EINVAL; /* earlier prios were already written */
+
+		MLX5_SET(qtct_reg, in, prio, i);
+		MLX5_SET(qtct_reg, in, tclass, prio_tc[i]);
+
+		err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+					   sizeof(out), MLX5_REG_QTCT, 0, 1);
+		if (err)
+			return err; /* one register write per priority */
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_prio_tc);
+
+static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
+				   int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)]; /* QETCR uses the qetc_reg layout */
+
+	if (!MLX5_CAP_GEN(mdev, ets)) /* bail out unless the ets cap is set */
+		return -ENOTSUPP;
+
+	return mlx5_core_access_reg(mdev, in, inlen, out, sizeof(out),
+				    MLX5_REG_QETCR, 0, 1);
+}
+
+int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
+	int i;
+
+	memset(in, 0, sizeof(in));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		MLX5_SET(qetc_reg, in, tc_configuration[i].g, 1);
+		MLX5_SET(qetc_reg, in, tc_configuration[i].group, tc_group[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group);
+
+int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
+	int i;
+
+	memset(in, 0, sizeof(in));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		MLX5_SET(qetc_reg, in, tc_configuration[i].b, 1);
+		MLX5_SET(qetc_reg, in, tc_configuration[i].bw_allocation, tc_bw[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 02adc67720ce..a815da92d4eb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -99,6 +99,8 @@ enum {
 };
 
 enum {
+	MLX5_REG_QETCR		 = 0x4005,
+	MLX5_REG_QTCT		 = 0x400a,
 	MLX5_REG_PCAP		 = 0x5001,
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 51f1e540fc2b..ec957e059de8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -729,7 +729,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_1bf[0x3];
 	u8         log_max_msg[0x5];
-	u8         reserved_at_1c7[0x18];
+	u8         reserved_at_1c7[0x4];
+	u8         max_tc[0x4];
+	u8         reserved_at_1cf[0x10];
 
 	u8         stat_rate_support[0x10];
 	u8         reserved_at_1ef[0xc];
@@ -7061,4 +7063,49 @@ struct mlx5_ifc_modify_flow_table_in_bits {
 	u8         reserved_at_100[0x100];
 };
 
+struct mlx5_ifc_ets_tcn_config_reg_bits {
+	u8         g[0x1];
+	u8         b[0x1];
+	u8         r[0x1];
+	u8         reserved_at_3[0x9];
+	u8         group[0x4];
+	u8         reserved_at_10[0x9];
+	u8         bw_allocation[0x7];
+
+	u8         reserved_at_20[0xc];
+	u8         max_bw_units[0x4];
+	u8         reserved_at_30[0x8];
+	u8         max_bw_value[0x8];
+};
+
+struct mlx5_ifc_ets_global_config_reg_bits {
+	u8         reserved_at_0[0x2];
+	u8         r[0x1];
+	u8         reserved_at_3[0x1d];
+
+	u8         reserved_at_20[0xc];
+	u8         max_bw_units[0x4];
+	u8         reserved_at_30[0x8];
+	u8         max_bw_value[0x8];
+};
+
+struct mlx5_ifc_qetc_reg_bits {
+	u8                                         reserved_at_0[0x8];
+	u8                                         port_number[0x8];
+	u8                                         reserved_at_10[0x30];
+
+	struct mlx5_ifc_ets_tcn_config_reg_bits    tc_configuration[0x8];
+	struct mlx5_ifc_ets_global_config_reg_bits global_configuration;
+};
+
+struct mlx5_ifc_qtct_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         port_number[0x8];
+	u8         reserved_at_10[0xd];
+	u8         prio[0x3];
+
+	u8         reserved_at_20[0x1d];
+	u8         tclass[0x3];
+};
+
 #endif /* MLX5_IFC_H */
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 4b3644caa936..0c67e699d017 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -70,4 +70,10 @@ int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx);
 int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx,
 			u8 *pfc_en_rx);
 
+int mlx5_max_tc(struct mlx5_core_dev *mdev);
+
+int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc);
+int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group);
+int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From d8880795dabf2381ed1e98348f6d9c7ea6fab950 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 22 Feb 2016 18:17:28 +0200
Subject: net/mlx5e: Implement DCBNL IEEE max rate

Add support for DCBNL IEEE get/set max rate.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 73 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 65 +++++++++++++++++++
 include/linux/mlx5/device.h                        |  6 ++
 include/linux/mlx5/port.h                          |  6 ++
 4 files changed, 150 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 39d8069ba9e3..3036f279a8fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -35,6 +35,9 @@
 
 #define MLX5E_MAX_PRIORITY 8
 
+#define MLX5E_100MB (100000)
+#define MLX5E_1GB   (1000000)
+
 static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev,
 				   struct ieee_ets *ets)
 {
@@ -219,9 +222,79 @@ static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode)
 	return 0;
 }
 
+static int mlx5e_dcbnl_ieee_getmaxrate(struct net_device *netdev,
+				       struct ieee_maxrate *maxrate)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 max_bw_value[IEEE_8021QAZ_MAX_TCS];
+	u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS];
+	int err;
+	int i;
+
+	err = mlx5_query_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
+	if (err)
+		return err;
+
+	memset(maxrate->tc_maxrate, 0, sizeof(maxrate->tc_maxrate));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		switch (max_bw_unit[i]) {
+		case MLX5_100_MBPS_UNIT:
+			maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB;
+			break;
+		case MLX5_GBPS_UNIT:
+			maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB;
+			break;
+		case MLX5_BW_NO_LIMIT:
+			break;
+		default:
+			WARN(true, "non-supported BW unit");
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev,
+				       struct ieee_maxrate *maxrate)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 max_bw_value[IEEE_8021QAZ_MAX_TCS];
+	u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS];
+	__u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB);
+	int i;
+
+	memset(max_bw_value, 0, sizeof(max_bw_value));
+	memset(max_bw_unit, 0, sizeof(max_bw_unit));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		if (!maxrate->tc_maxrate[i]) {
+			max_bw_unit[i]  = MLX5_BW_NO_LIMIT;
+			continue;
+		}
+		if (maxrate->tc_maxrate[i] < upper_limit_mbps) {
+			max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
+						  MLX5E_100MB);
+			max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1;
+			max_bw_unit[i]  = MLX5_100_MBPS_UNIT;
+		} else {
+			max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
+						  MLX5E_1GB);
+			max_bw_unit[i]  = MLX5_GBPS_UNIT;
+		}
+	}
+
+	return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
+}
+
 const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
 	.ieee_getets	= mlx5e_dcbnl_ieee_getets,
 	.ieee_setets	= mlx5e_dcbnl_ieee_setets,
+	.ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate,
+	.ieee_setmaxrate = mlx5e_dcbnl_ieee_setmaxrate,
 	.ieee_getpfc	= mlx5e_dcbnl_ieee_getpfc,
 	.ieee_setpfc	= mlx5e_dcbnl_ieee_setpfc,
 	.getdcbx	= mlx5e_dcbnl_getdcbx,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 569100d3f57b..d97605ef3efd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -450,6 +450,19 @@ static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
 				    MLX5_REG_QETCR, 0, 1);
 }
 
+static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out,
+				     int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)]; /* QETCR uses the qetc_reg layout */
+
+	if (!MLX5_CAP_GEN(mdev, ets)) /* bail out unless the ets cap is set */
+		return -ENOTSUPP;
+
+	memset(in, 0, sizeof(in));
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen,
+				    MLX5_REG_QETCR, 0, 0); /* read access */
+}
+
 int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group)
 {
 	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
@@ -481,3 +494,55 @@ int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw)
 	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc);
+
+int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				    u8 *max_bw_value,
+				    u8 *max_bw_units)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int i;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(qetc_reg, in, port_number, 1);
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, in, tc_configuration[i]);
+
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, r, 1);
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_units,
+			 max_bw_units[i]);
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_value,
+			 max_bw_value[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_port_ets_rate_limit);
+
+int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				   u8 *max_bw_value,
+				   u8 *max_bw_units)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int err;
+	int i;
+
+	err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out));
+	if (err)
+		return err;
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, tc_configuration[i]);
+
+		max_bw_value[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+					   max_bw_value);
+		max_bw_units[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+					   max_bw_units);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 987764afa65c..bfc1ab0552d3 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -350,6 +350,12 @@ enum {
 	MLX5_SET_PORT_PKEY_TABLE	= 20,
 };
 
+enum {
+	MLX5_BW_NO_LIMIT   = 0,
+	MLX5_100_MBPS_UNIT = 3,
+	MLX5_GBPS_UNIT	   = 4,
+};
+
 enum {
 	MLX5_MAX_PAGE_SHIFT		= 31
 };
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 0c67e699d017..595c7b2d9bfa 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -75,5 +75,11 @@ int mlx5_max_tc(struct mlx5_core_dev *mdev);
 int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc);
 int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group);
 int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw);
+int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				    u8 *max_bw_value,
+				    u8 *max_bw_unit);
+int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				   u8 *max_bw_value,
+				   u8 *max_bw_unit);
 
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 928cfe8745a62e60c1e8e06676a74724e7786024 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 22 Feb 2016 18:17:29 +0200
Subject: net/mlx5e: Wake On LAN support

Implement set/get WOL via ethtool and add the needed
device commands and structures to mlx5_ifc.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   6 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 125 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     |  38 +++++++
 include/linux/mlx5/device.h                        |  11 ++
 include/linux/mlx5/mlx5_ifc.h                      |  62 +++++++++-
 include/linux/mlx5/port.h                          |   2 +
 6 files changed, 243 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 037fc4cdf5af..9ce87c624450 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -560,6 +560,12 @@ const char *mlx5_command_str(int command)
 	case MLX5_CMD_OP_ACCESS_REG:
 		return "MLX5_CMD_OP_ACCESS_REG";
 
+	case MLX5_CMD_OP_SET_WOL_ROL:
+		return "SET_WOL_ROL";
+
+	case MLX5_CMD_OP_QUERY_WOL_ROL:
+		return "QUERY_WOL_ROL";
+
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 65624ac65b4c..e9760f895744 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -884,6 +884,129 @@ static int mlx5e_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
+static __u32 mlx5e_get_wol_supported(struct mlx5_core_dev *mdev)
+{
+	__u32 ret = 0;
+
+	if (MLX5_CAP_GEN(mdev, wol_g))
+		ret |= WAKE_MAGIC;
+
+	if (MLX5_CAP_GEN(mdev, wol_s))
+		ret |= WAKE_MAGICSECURE;
+
+	if (MLX5_CAP_GEN(mdev, wol_a))
+		ret |= WAKE_ARP;
+
+	if (MLX5_CAP_GEN(mdev, wol_b))
+		ret |= WAKE_BCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_m))
+		ret |= WAKE_MCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_u))
+		ret |= WAKE_UCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_p))
+		ret |= WAKE_PHY;
+
+	return ret;
+}
+
+static __u32 mlx5e_refomrat_wol_mode_mlx5_to_linux(u8 mode)
+{
+	__u32 ret = 0;
+
+	if (mode & MLX5_WOL_MAGIC)
+		ret |= WAKE_MAGIC;
+
+	if (mode & MLX5_WOL_SECURED_MAGIC)
+		ret |= WAKE_MAGICSECURE;
+
+	if (mode & MLX5_WOL_ARP)
+		ret |= WAKE_ARP;
+
+	if (mode & MLX5_WOL_BROADCAST)
+		ret |= WAKE_BCAST;
+
+	if (mode & MLX5_WOL_MULTICAST)
+		ret |= WAKE_MCAST;
+
+	if (mode & MLX5_WOL_UNICAST)
+		ret |= WAKE_UCAST;
+
+	if (mode & MLX5_WOL_PHY_ACTIVITY)
+		ret |= WAKE_PHY;
+
+	return ret;
+}
+
+static u8 mlx5e_refomrat_wol_mode_linux_to_mlx5(__u32 mode)
+{
+	u8 ret = 0;
+
+	if (mode & WAKE_MAGIC)
+		ret |= MLX5_WOL_MAGIC;
+
+	if (mode & WAKE_MAGICSECURE)
+		ret |= MLX5_WOL_SECURED_MAGIC;
+
+	if (mode & WAKE_ARP)
+		ret |= MLX5_WOL_ARP;
+
+	if (mode & WAKE_BCAST)
+		ret |= MLX5_WOL_BROADCAST;
+
+	if (mode & WAKE_MCAST)
+		ret |= MLX5_WOL_MULTICAST;
+
+	if (mode & WAKE_UCAST)
+		ret |= MLX5_WOL_UNICAST;
+
+	if (mode & WAKE_PHY)
+		ret |= MLX5_WOL_PHY_ACTIVITY;
+
+	return ret;
+}
+
+static void mlx5e_get_wol(struct net_device *netdev,
+			  struct ethtool_wolinfo *wol)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 mlx5_wol_mode;
+	int err;
+
+	memset(wol, 0, sizeof(*wol));
+
+	wol->supported = mlx5e_get_wol_supported(mdev);
+	if (!wol->supported)
+		return; /* nothing supported: wolinfo stays all zero */
+
+	err = mlx5_query_port_wol(mdev, &mlx5_wol_mode);
+	if (err)
+		return; /* query failed: wolopts stays 0, error is dropped */
+
+	wol->wolopts = mlx5e_refomrat_wol_mode_mlx5_to_linux(mlx5_wol_mode);
+}
+
+static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	__u32 wol_supported = mlx5e_get_wol_supported(mdev);
+	u8 mlx5_wol_mode; /* same width as the helper result and setter arg */
+
+	if (!wol_supported)
+		return -EOPNOTSUPP; /* ENOTSUPP is not a userspace errno */
+
+	if (wol->wolopts & ~wol_supported)
+		return -EINVAL; /* request includes an unsupported wake mode */
+
+	mlx5_wol_mode = mlx5e_refomrat_wol_mode_linux_to_mlx5(wol->wolopts);
+
+	return mlx5_set_port_wol(mdev, mlx5_wol_mode);
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -908,4 +1031,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_pauseparam    = mlx5e_get_pauseparam,
 	.set_pauseparam    = mlx5e_set_pauseparam,
 	.get_ts_info       = mlx5e_get_ts_info,
+	.get_wol	   = mlx5e_get_wol,
+	.set_wol	   = mlx5e_set_wol,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index d97605ef3efd..e1f2e1059cfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -546,3 +546,41 @@ int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit);
+
+int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)];
+	u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL);
+	MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1);
+	MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode);
+
+	return mlx5_cmd_exec_check_status(mdev, in, sizeof(in),
+					  out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_wol);
+
+int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)];
+	u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL);
+
+	err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in),
+					 out, sizeof(out));
+
+	if (!err) /* *wol_mode is left untouched on failure */
+		*wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_wol);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index bfc1ab0552d3..68a56bc37df2 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1183,6 +1183,17 @@ enum {
 	MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM    = 0x1,
 };
 
+enum mlx5_wol_mode {
+	MLX5_WOL_DISABLE        = 0,
+	MLX5_WOL_SECURED_MAGIC  = 1 << 1,
+	MLX5_WOL_MAGIC          = 1 << 2,
+	MLX5_WOL_ARP            = 1 << 3,
+	MLX5_WOL_BROADCAST      = 1 << 4,
+	MLX5_WOL_MULTICAST      = 1 << 5,
+	MLX5_WOL_UNICAST        = 1 << 6,
+	MLX5_WOL_PHY_ACTIVITY   = 1 << 7,
+};
+
 /* MLX5 DEV CAPs */
 
 /* TODO: EAT.ME */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ec957e059de8..03ffe9530365 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -166,6 +166,8 @@ enum {
 	MLX5_CMD_OP_SET_L2_TABLE_ENTRY            = 0x829,
 	MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY          = 0x82a,
 	MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY         = 0x82b,
+	MLX5_CMD_OP_SET_WOL_ROL                   = 0x830,
+	MLX5_CMD_OP_QUERY_WOL_ROL                 = 0x831,
 	MLX5_CMD_OP_CREATE_TIR                    = 0x900,
 	MLX5_CMD_OP_MODIFY_TIR                    = 0x901,
 	MLX5_CMD_OP_DESTROY_TIR                   = 0x902,
@@ -731,7 +733,17 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_msg[0x5];
 	u8         reserved_at_1c7[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1cf[0x10];
+	u8         reserved_at_1cf[0x6];
+	u8         rol_s[0x1];
+	u8         rol_g[0x1];
+	u8         reserved_at_1d7[0x1];
+	u8         wol_s[0x1];
+	u8         wol_g[0x1];
+	u8         wol_a[0x1];
+	u8         wol_b[0x1];
+	u8         wol_m[0x1];
+	u8         wol_u[0x1];
+	u8         wol_p[0x1];
 
 	u8         stat_rate_support[0x10];
 	u8         reserved_at_1ef[0xc];
@@ -6873,6 +6885,54 @@ struct mlx5_ifc_mtt_bits {
 	u8         rd_en[0x1];
 };
 
+struct mlx5_ifc_query_wol_rol_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x10];
+	u8         rol_mode[0x8];
+	u8         wol_mode[0x8];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_wol_rol_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_set_wol_rol_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_set_wol_rol_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         rol_mode_valid[0x1];
+	u8         wol_mode_valid[0x1];
+	u8         reserved_at_42[0xe];
+	u8         rol_mode[0x8];
+	u8         wol_mode[0x8];
+
+	u8         reserved_at_60[0x20];
+};
+
 enum {
 	MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER  = 0x0,
 	MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED     = 0x1,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 595c7b2d9bfa..a1d145abd4eb 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -81,5 +81,7 @@ int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 				   u8 *max_bw_value,
 				   u8 *max_bw_unit);
+int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode);
+int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode);
 
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 1d4150c02c5709fdfd80f10368a31867de35e72e Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 22 Feb 2016 15:57:52 -0800
Subject: net_sched: prepare tcf_hashinfo_destroy() for netns support

We only release the memory of the hashtable itself, not its
entries inside. This is not a problem yet since we only call
it in module release path, and module is refcount'ed by
actions. This will become a problem once a later patch moves the
per-module hinfo into per-netns storage.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h |  5 -----
 net/sched/act_api.c   | 32 +++++++++++++++++++++++++++++---
 2 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9d446f136607..8c4e3ff723fb 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -65,11 +65,6 @@ static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask)
 	return 0;
 }
 
-static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf)
-{
-	kfree(hf->htab);
-}
-
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 06e7c4a37245..acafaf7434fc 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -69,7 +69,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 			if (a->ops->cleanup)
 				a->ops->cleanup(a, bind);
 			tcf_hash_destroy(a);
-			ret = 1;
+			ret = ACT_P_DELETED;
 		}
 	}
 
@@ -302,6 +302,32 @@ void tcf_hash_insert(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_insert);
 
+static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
+{
+	struct tcf_hashinfo *hinfo = ops->hinfo;
+	struct tc_action a = { /* scratch action reused for every entry */
+		.ops = ops,
+	};
+	int i;
+
+	for (i = 0; i < hinfo->hmask + 1; i++) { /* walk every bucket */
+		struct tcf_common *p;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
+			int ret;
+
+			a.priv = p;
+			ret = __tcf_hash_release(&a, false, true);
+			if (ret == ACT_P_DELETED)
+				module_put(ops->owner);
+			else if (ret < 0)
+				return; /* NOTE(review): htab is not freed here */
+		}
+	}
+	kfree(hinfo->htab);
+}
+
 static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
 
@@ -333,7 +359,7 @@ int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
-			tcf_hashinfo_destroy(act->hinfo);
+			tcf_hashinfo_destroy(act);
 			kfree(act->hinfo);
 			return -EEXIST;
 		}
@@ -353,7 +379,7 @@ int tcf_unregister_action(struct tc_action_ops *act)
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
 			list_del(&act->head);
-			tcf_hashinfo_destroy(act->hinfo);
+			tcf_hashinfo_destroy(act);
 			kfree(act->hinfo);
 			err = 0;
 			break;
-- 
cgit v1.2.3


From ddf97ccdd7cb7e00daba465a5c947b8d941dc2a4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 22 Feb 2016 15:57:53 -0800
Subject: net_sched: add network namespace support for tc actions

Currently tc actions are stored in a per-module hashtable
and are therefore visible to all network namespaces. This is
probably the last part of the tc subsystem which is not
aware of netns now. This patch makes them per-netns;
several tc action APIs need to be adjusted for this.

The tc action API code is ugly due to historical reasons;
we need to refactor that code in the future.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h    |  58 ++++++++++++++++++----
 net/sched/act_api.c      | 113 +++++++++++++++++++++--------------------
 net/sched/act_bpf.c      |  52 +++++++++++++++++--
 net/sched/act_connmark.c |  54 +++++++++++++++++---
 net/sched/act_csum.c     |  59 +++++++++++++++++++---
 net/sched/act_gact.c     |  55 +++++++++++++++++---
 net/sched/act_ipt.c      | 127 ++++++++++++++++++++++++++++++++++++++++++-----
 net/sched/act_mirred.c   |  54 +++++++++++++++++---
 net/sched/act_nat.c      |  54 +++++++++++++++++---
 net/sched/act_pedit.c    |  54 +++++++++++++++++---
 net/sched/act_police.c   |  52 +++++++++++++++----
 net/sched/act_simple.c   |  55 +++++++++++++++++---
 net/sched/act_skbedit.c  |  54 +++++++++++++++++---
 net/sched/act_vlan.c     |  54 +++++++++++++++++---
 14 files changed, 746 insertions(+), 149 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8c4e3ff723fb..342be6c5ab5c 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -7,6 +7,8 @@
 
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 struct tcf_common {
 	struct hlist_node		tcfc_head;
@@ -87,31 +89,65 @@ struct tc_action {
 	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
 	__u32			order;
 	struct list_head	list;
+	struct tcf_hashinfo	*hinfo;
 };
 
 struct tc_action_ops {
 	struct list_head head;
-	struct tcf_hashinfo *hinfo;
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *);
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	void	(*cleanup)(struct tc_action *, int bind);
-	int     (*lookup)(struct tc_action *, u32);
+	int     (*lookup)(struct net *, struct tc_action *, u32);
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action *act, int ovr,
 			int bind);
-	int     (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *);
+	int     (*walk)(struct net *, struct sk_buff *,
+			struct netlink_callback *, int, struct tc_action *);
+};
+
+struct tc_action_net {
+	struct tcf_hashinfo *hinfo;
+	const struct tc_action_ops *ops;
 };
 
-int tcf_hash_search(struct tc_action *a, u32 index);
-u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
-int tcf_hash_check(u32 index, struct tc_action *a, int bind);
-int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind, bool cpustats);
+static inline
+int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops,
+		       unsigned int mask)
+{
+	int err = 0;
+
+	tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL);
+	if (!tn->hinfo)
+		return -ENOMEM;
+	tn->ops = ops;
+	err = tcf_hashinfo_init(tn->hinfo, mask);
+	if (err)
+		kfree(tn->hinfo);
+	return err;
+}
+
+void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
+			  struct tcf_hashinfo *hinfo);
+
+static inline void tc_action_net_exit(struct tc_action_net *tn)
+{
+	tcf_hashinfo_destroy(tn->ops, tn->hinfo);
+}
+
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+		       struct netlink_callback *cb, int type,
+		       struct tc_action *a);
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index);
+u32 tcf_hash_new_index(struct tc_action_net *tn);
+int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		   int bind);
+int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+		    struct tc_action *a, int size, int bind, bool cpustats);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
-void tcf_hash_insert(struct tc_action *a);
+void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a);
 
 int __tcf_hash_release(struct tc_action *a, bool bind, bool strict);
 
@@ -120,8 +156,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind)
 	return __tcf_hash_release(a, bind, false);
 }
 
-int tcf_register_action(struct tc_action_ops *a, unsigned int mask);
-int tcf_unregister_action(struct tc_action_ops *a);
+int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
+int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_action_destroy(struct list_head *actions, int bind);
 int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
 		    struct tcf_result *res);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index acafaf7434fc..96066665e376 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head)
 	kfree(p);
 }
 
-static void tcf_hash_destroy(struct tc_action *a)
+static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfc_head);
@@ -68,7 +67,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
 			if (a->ops->cleanup)
 				a->ops->cleanup(a, bind);
-			tcf_hash_destroy(a);
+			tcf_hash_destroy(a->hinfo, a);
 			ret = ACT_P_DELETED;
 		}
 	}
@@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 }
 EXPORT_SYMBOL(__tcf_hash_release);
 
-static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			   struct tc_action *a)
+static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+			   struct netlink_callback *cb, struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -126,9 +124,9 @@ nla_put_failure:
 	goto done;
 }
 
-static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
+static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+			  struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct hlist_head *head;
 	struct hlist_node *n;
 	struct tcf_common *p;
@@ -163,18 +161,24 @@ nla_put_failure:
 	return ret;
 }
 
-static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			      int type, struct tc_action *a)
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+		       struct netlink_callback *cb, int type,
+		       struct tc_action *a)
 {
+	struct tcf_hashinfo *hinfo = tn->hinfo;
+
+	a->hinfo = hinfo;
+
 	if (type == RTM_DELACTION) {
-		return tcf_del_walker(skb, a);
+		return tcf_del_walker(hinfo, skb, a);
 	} else if (type == RTM_GETACTION) {
-		return tcf_dump_walker(skb, cb, a);
+		return tcf_dump_walker(hinfo, skb, cb, a);
 	} else {
 		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
 		return -EINVAL;
 	}
 }
+EXPORT_SYMBOL(tcf_generic_walker);
 
 static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 {
@@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 	return p;
 }
 
-u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
+u32 tcf_hash_new_index(struct tc_action_net *tn)
 {
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	u32 val = hinfo->index;
 
 	do {
@@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
 }
 EXPORT_SYMBOL(tcf_hash_new_index);
 
-int tcf_hash_search(struct tc_action *a, u32 index)
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = tcf_hash_lookup(index, hinfo);
 
 	if (p) {
 		a->priv = p;
+		a->hinfo = hinfo;
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_search);
 
-int tcf_hash_check(u32 index, struct tc_action *a, int bind)
+int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		   int bind)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = NULL;
 	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
 		if (bind)
 			p->tcfc_bindcnt++;
 		p->tcfc_refcnt++;
 		a->priv = p;
+		a->hinfo = hinfo;
 		return 1;
 	}
 	return 0;
@@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
 
-int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind, bool cpustats)
+int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+		    struct tc_action *a, int size, int bind, bool cpustats)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int err = -ENOMEM;
 
 	if (unlikely(!p))
@@ -272,7 +280,7 @@ err2:
 	}
 	spin_lock_init(&p->tcfc_lock);
 	INIT_HLIST_NODE(&p->tcfc_head);
-	p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
+	p->tcfc_index = index ? index : tcf_hash_new_index(tn);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
 	if (est) {
@@ -286,14 +294,15 @@ err2:
 	}
 
 	a->priv = (void *) p;
+	a->hinfo = hinfo;
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_create);
 
-void tcf_hash_insert(struct tc_action *a)
+void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
 
 	spin_lock_bh(&hinfo->lock);
@@ -302,11 +311,12 @@ void tcf_hash_insert(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_insert);
 
-static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
+void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
+			  struct tcf_hashinfo *hinfo)
 {
-	struct tcf_hashinfo *hinfo = ops->hinfo;
 	struct tc_action a = {
 		.ops = ops,
+		.hinfo = hinfo,
 	};
 	int i;
 
@@ -327,60 +337,52 @@ static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
 	}
 	kfree(hinfo->htab);
 }
+EXPORT_SYMBOL(tcf_hashinfo_destroy);
 
 static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
 
-int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
+int tcf_register_action(struct tc_action_ops *act,
+			struct pernet_operations *ops)
 {
 	struct tc_action_ops *a;
-	int err;
+	int ret;
 
-	/* Must supply act, dump and init */
-	if (!act->act || !act->dump || !act->init)
+	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
 		return -EINVAL;
 
-	/* Supply defaults */
-	if (!act->lookup)
-		act->lookup = tcf_hash_search;
-	if (!act->walk)
-		act->walk = tcf_generic_walker;
-
-	act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
-	if (!act->hinfo)
-		return -ENOMEM;
-	err = tcf_hashinfo_init(act->hinfo, mask);
-	if (err) {
-		kfree(act->hinfo);
-		return err;
-	}
-
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
-			tcf_hashinfo_destroy(act);
-			kfree(act->hinfo);
 			return -EEXIST;
 		}
 	}
 	list_add_tail(&act->head, &act_base);
 	write_unlock(&act_mod_lock);
+
+	ret = register_pernet_subsys(ops);
+	if (ret) {
+		tcf_unregister_action(act, ops);
+		return ret;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(tcf_register_action);
 
-int tcf_unregister_action(struct tc_action_ops *act)
+int tcf_unregister_action(struct tc_action_ops *act,
+			  struct pernet_operations *ops)
 {
 	struct tc_action_ops *a;
 	int err = -ENOENT;
 
+	unregister_pernet_subsys(ops);
+
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
 			list_del(&act->head);
-			tcf_hashinfo_destroy(act);
-			kfree(act->hinfo);
 			err = 0;
 			break;
 		}
@@ -747,8 +749,8 @@ static struct tc_action *create_a(int i)
 	return act;
 }
 
-static struct tc_action *
-tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
+static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
+					  struct nlmsghdr *n, u32 portid)
 {
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct tc_action *a;
@@ -775,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
 	if (a->ops == NULL) /* could happen in batch of actions */
 		goto err_free;
 	err = -ENOENT;
-	if (a->ops->lookup(a, index) == 0)
+	if (a->ops->lookup(net, a, index) == 0)
 		goto err_mod;
 
 	module_put(a->ops->owner);
@@ -845,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	if (nest == NULL)
 		goto out_module_put;
 
-	err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
+	err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a);
 	if (err < 0)
 		goto out_module_put;
 	if (err == 0)
@@ -923,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_get_1(tb[i], n, portid);
+		act = tcf_action_get_1(net, tb[i], n, portid);
 		if (IS_ERR(act)) {
 			ret = PTR_ERR(act);
 			goto err;
@@ -1070,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n)
 static int
 tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
@@ -1104,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nest == NULL)
 		goto out_module_put;
 
-	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
+	ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a);
 	if (ret < 0)
 		goto out_module_put;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0bc6f912f870..8c9f1f0459ab 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -33,6 +33,8 @@ struct tcf_bpf_cfg {
 	bool is_ebpf;
 };
 
+static int bpf_net_id;
+
 static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		   struct tcf_result *res)
 {
@@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action *act,
 			int replace, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
 	struct tcf_bpf_cfg cfg, old;
 	struct tc_act_bpf *parm;
@@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
 
-	if (!tcf_hash_check(parm->index, act, bind)) {
-		ret = tcf_hash_create(parm->index, est, act,
+	if (!tcf_hash_check(tn, parm->index, act, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, act,
 				      sizeof(*prog), bind, true);
 		if (ret < 0)
 			return ret;
@@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	rcu_assign_pointer(prog->filter, cfg.filter);
 
 	if (res == ACT_P_CREATED) {
-		tcf_hash_insert(act);
+		tcf_hash_insert(tn, act);
 	} else {
 		/* make sure the program being replaced is no longer executing */
 		synchronize_rcu();
@@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
 	tcf_bpf_cfg_cleanup(&tmp);
 }
 
+static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.kind		=	"bpf",
 	.type		=	TCA_ACT_BPF,
@@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.dump		=	tcf_bpf_dump,
 	.cleanup	=	tcf_bpf_cleanup,
 	.init		=	tcf_bpf_init,
+	.walk		=	tcf_bpf_walker,
+	.lookup		=	tcf_bpf_search,
+};
+
+static __net_init int bpf_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK);
+}
+
+static void __net_exit bpf_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations bpf_net_ops = {
+	.init = bpf_init_net,
+	.exit = bpf_exit_net,
+	.id   = &bpf_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init bpf_init_module(void)
 {
-	return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK);
+	return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
 }
 
 static void __exit bpf_cleanup_module(void)
 {
-	tcf_unregister_action(&act_bpf_ops);
+	tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
 }
 
 module_init(bpf_init_module);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index bb41699c6c49..c0ed93ce2391 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,6 +30,8 @@
 
 #define CONNMARK_TAB_MASK     3
 
+static int connmark_net_id;
+
 static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 			struct tcf_result *res)
 {
@@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action *a,
 			     int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 	struct nlattr *tb[TCA_CONNMARK_MAX + 1];
 	struct tcf_connmark_info *ci;
 	struct tc_connmark *parm;
@@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_CONNMARK_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*ci), bind, false);
 		if (ret)
 			return ret;
 
@@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 		ci->net = net;
 		ci->zone = parm->zone;
 
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 		ret = ACT_P_CREATED;
 	} else {
 		ci = to_connmark(a);
@@ -169,6 +172,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
+			       struct netlink_callback *cb, int type,
+			       struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_connmark_ops = {
 	.kind		=	"connmark",
 	.type		=	TCA_ACT_CONNMARK,
@@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = {
 	.act		=	tcf_connmark,
 	.dump		=	tcf_connmark_dump,
 	.init		=	tcf_connmark_init,
+	.walk		=	tcf_connmark_walker,
+	.lookup		=	tcf_connmark_search,
+};
+
+static __net_init int connmark_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK);
+}
+
+static void __net_exit connmark_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations connmark_net_ops = {
+	.init = connmark_init_net,
+	.exit = connmark_exit_net,
+	.id   = &connmark_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init connmark_init_module(void)
 {
-	return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK);
+	return tcf_register_action(&act_connmark_ops, &connmark_net_ops);
 }
 
 static void __exit connmark_cleanup_module(void)
 {
-	tcf_unregister_action(&act_connmark_ops);
+	tcf_unregister_action(&act_connmark_ops, &connmark_net_ops);
 }
 
 module_init(connmark_init_module);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index eeb3eb3ea9eb..d22426cdebc0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
 	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
 };
 
-static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
-			 struct tc_action *a, int ovr, int bind)
+static int csum_net_id;
+
+static int tcf_csum_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action *a, int ovr,
+			 int bind)
 {
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
 	struct nlattr *tb[TCA_CSUM_MAX + 1];
 	struct tc_csum *parm;
 	struct tcf_csum *p;
@@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_CSUM_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 
 	return ret;
 }
@@ -555,6 +559,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_csum_ops = {
 	.kind		= "csum",
 	.type		= TCA_ACT_CSUM,
@@ -562,6 +582,29 @@ static struct tc_action_ops act_csum_ops = {
 	.act		= tcf_csum,
 	.dump		= tcf_csum_dump,
 	.init		= tcf_csum_init,
+	.walk		= tcf_csum_walker,
+	.lookup		= tcf_csum_search,
+};
+
+static __net_init int csum_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK);
+}
+
+static void __net_exit csum_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations csum_net_ops = {
+	.init = csum_init_net,
+	.exit = csum_exit_net,
+	.id   = &csum_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_DESCRIPTION("Checksum updating actions");
@@ -569,12 +612,12 @@ MODULE_LICENSE("GPL");
 
 static int __init csum_init_module(void)
 {
-	return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
+	return tcf_register_action(&act_csum_ops, &csum_net_ops);
 }
 
 static void __exit csum_cleanup_module(void)
 {
-	tcf_unregister_action(&act_csum_ops);
+	tcf_unregister_action(&act_csum_ops, &csum_net_ops);
 }
 
 module_init(csum_init_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 5c1b05170736..887fc1f209ff 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,6 +25,8 @@
 
 #define GACT_TAB_MASK	15
 
+static int gact_net_id;
+
 #ifdef CONFIG_GACT_PROB
 static int gact_net_rand(struct tcf_gact *gact)
 {
@@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
 	struct nlattr *tb[TCA_GACT_MAX + 1];
 	struct tc_gact *parm;
 	struct tcf_gact *gact;
@@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
-				      bind, true);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*gact), bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -183,6 +186,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
 	.type		=	TCA_ACT_GACT,
@@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = {
 	.act		=	tcf_gact,
 	.dump		=	tcf_gact_dump,
 	.init		=	tcf_gact_init,
+	.walk		=	tcf_gact_walker,
+	.lookup		=	tcf_gact_search,
+};
+
+static __net_init int gact_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK);
+}
+
+static void __net_exit gact_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations gact_net_ops = {
+	.init = gact_init_net,
+	.exit = gact_exit_net,
+	.id   = &gact_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -203,12 +245,13 @@ static int __init gact_init_module(void)
 #else
 	pr_info("GACT probability NOT on\n");
 #endif
-	return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
+
+	return tcf_register_action(&act_gact_ops, &gact_net_ops);
 }
 
 static void __exit gact_cleanup_module(void)
 {
-	tcf_unregister_action(&act_gact_ops);
+	tcf_unregister_action(&act_gact_ops, &gact_net_ops);
 }
 
 module_init(gact_init_module);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d05869646515..89c41a1f3589 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,6 +30,10 @@
 
 #define IPT_TAB_MASK     15
 
+static int ipt_net_id;
+
+static int xt_net_id;
+
 static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
 	struct xt_tgchk_param par;
@@ -83,8 +87,9 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
 };
 
-static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
-			struct tc_action *a, int ovr, int bind)
+static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
+			  struct nlattr *est, struct tc_action *a, int ovr,
+			  int bind)
 {
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
@@ -113,8 +118,9 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_IPT_INDEX] != NULL)
 		index = nla_get_u32(tb[TCA_IPT_INDEX]);
 
-	if (!tcf_hash_check(index, a, bind) ) {
-		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
+	if (!tcf_hash_check(tn, index, a, bind)) {
+		ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
+				      false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -157,7 +163,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	ipt->tcfi_hook  = hook;
 	spin_unlock_bh(&ipt->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 
 err3:
@@ -170,6 +176,24 @@ err1:
 	return err;
 }
 
+static int tcf_ipt_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a, int ovr,
+			int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
+static int tcf_xt_init(struct net *net, struct nlattr *nla,
+		       struct nlattr *est, struct tc_action *a, int ovr,
+		       int bind)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
 static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
@@ -260,6 +284,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_ipt_ops = {
 	.kind		=	"ipt",
 	.type		=	TCA_ACT_IPT,
@@ -268,8 +308,47 @@ static struct tc_action_ops act_ipt_ops = {
 	.dump		=	tcf_ipt_dump,
 	.cleanup	=	tcf_ipt_release,
 	.init		=	tcf_ipt_init,
+	.walk		=	tcf_ipt_walker,
+	.lookup		=	tcf_ipt_search,
+};
+
+static __net_init int ipt_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit ipt_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ipt_net_ops = {
+	.init = ipt_init_net,
+	.exit = ipt_exit_net,
+	.id   = &ipt_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
+static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_xt_ops = {
 	.kind		=	"xt",
 	.type		=	TCA_ACT_XT,
@@ -277,7 +356,30 @@ static struct tc_action_ops act_xt_ops = {
 	.act		=	tcf_ipt,
 	.dump		=	tcf_ipt_dump,
 	.cleanup	=	tcf_ipt_release,
-	.init		=	tcf_ipt_init,
+	.init		=	tcf_xt_init,
+	.walk		=	tcf_xt_walker,
+	.lookup		=	tcf_xt_search,
+};
+
+static __net_init int xt_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit xt_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations xt_net_ops = {
+	.init = xt_init_net,
+	.exit = xt_exit_net,
+	.id   = &xt_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
@@ -289,12 +391,13 @@ static int __init ipt_init_module(void)
 {
 	int ret1, ret2;
 
-	ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+	ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops);
 	if (ret1 < 0)
-		printk("Failed to load xt action\n");
-	ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+		pr_err("Failed to load xt action\n");
+
+	ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops);
 	if (ret2 < 0)
-		printk("Failed to load ipt action\n");
+		pr_err("Failed to load ipt action\n");
 
 	if (ret1 < 0 && ret2 < 0) {
 		return ret1;
@@ -304,8 +407,8 @@ static int __init ipt_init_module(void)
 
 static void __exit ipt_cleanup_module(void)
 {
-	tcf_unregister_action(&act_xt_ops);
-	tcf_unregister_action(&act_ipt_ops);
+	tcf_unregister_action(&act_ipt_ops, &ipt_net_ops);
+	tcf_unregister_action(&act_xt_ops, &xt_net_ops);
 }
 
 module_init(ipt_init_module);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 32fcdecdb9e2..6b284d991e0b 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -50,10 +50,13 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
 	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },
 };
 
+static int mirred_net_id;
+
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action *a, int ovr,
 			   int bind)
 {
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
 	struct tc_mirred *parm;
 	struct tcf_mirred *m;
@@ -96,11 +99,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		dev = NULL;
 	}
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (dev == NULL)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
-				      bind, true);
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*m), bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -130,7 +133,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		spin_lock_bh(&mirred_list_lock);
 		list_add(&m->tcfm_list, &mirred_list);
 		spin_unlock_bh(&mirred_list_lock);
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	}
 
 	return ret;
@@ -221,6 +224,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static int mirred_device_event(struct notifier_block *unused,
 			       unsigned long event, void *ptr)
 {
@@ -257,6 +276,29 @@ static struct tc_action_ops act_mirred_ops = {
 	.dump		=	tcf_mirred_dump,
 	.cleanup	=	tcf_mirred_release,
 	.init		=	tcf_mirred_init,
+	.walk		=	tcf_mirred_walker,
+	.lookup		=	tcf_mirred_search,
+};
+
+static __net_init int mirred_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK);
+}
+
+static void __net_exit mirred_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations mirred_net_ops = {
+	.init = mirred_init_net,
+	.exit = mirred_exit_net,
+	.id   = &mirred_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002)");
@@ -270,12 +312,12 @@ static int __init mirred_init_module(void)
 		return err;
 
 	pr_info("Mirror/redirect action on\n");
-	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
+	return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
 }
 
 static void __exit mirred_cleanup_module(void)
 {
-	tcf_unregister_action(&act_mirred_ops);
+	tcf_unregister_action(&act_mirred_ops, &mirred_net_ops);
 	unregister_netdevice_notifier(&mirred_device_notifier);
 }
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 27607b863aba..0f65cdfbfb1d 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,6 +31,8 @@
 
 #define NAT_TAB_MASK	15
 
+static int nat_net_id;
+
 static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },
 };
@@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action *a, int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
 	struct tc_nat *parm;
 	int ret = 0, err;
@@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_NAT_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 
 	return ret;
 }
@@ -274,6 +277,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_nat_ops = {
 	.kind		=	"nat",
 	.type		=	TCA_ACT_NAT,
@@ -281,6 +300,29 @@ static struct tc_action_ops act_nat_ops = {
 	.act		=	tcf_nat,
 	.dump		=	tcf_nat_dump,
 	.init		=	tcf_nat_init,
+	.walk		=	tcf_nat_walker,
+	.lookup		=	tcf_nat_search,
+};
+
+static __net_init int nat_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK);
+}
+
+static void __net_exit nat_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations nat_net_ops = {
+	.init = nat_init_net,
+	.exit = nat_exit_net,
+	.id   = &nat_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_DESCRIPTION("Stateless NAT actions");
@@ -288,12 +330,12 @@ MODULE_LICENSE("GPL");
 
 static int __init nat_init_module(void)
 {
-	return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
+	return tcf_register_action(&act_nat_ops, &nat_net_ops);
 }
 
 static void __exit nat_cleanup_module(void)
 {
-	tcf_unregister_action(&act_nat_ops);
+	tcf_unregister_action(&act_nat_ops, &nat_net_ops);
 }
 
 module_init(nat_init_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index e38a7701f154..429c3ab65142 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -25,6 +25,8 @@
 
 #define PEDIT_TAB_MASK	15
 
+static int pedit_net_id;
+
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
 };
@@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action *a,
 			  int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
 	struct tc_pedit *parm;
 	int ret = 0, err;
@@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		p = to_pedit(a);
@@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	memcpy(p->tcfp_keys, parm->keys, ksize);
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -211,6 +214,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
+			    struct netlink_callback *cb, int type,
+			    struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_pedit_ops = {
 	.kind		=	"pedit",
 	.type		=	TCA_ACT_PEDIT,
@@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = {
 	.dump		=	tcf_pedit_dump,
 	.cleanup	=	tcf_pedit_cleanup,
 	.init		=	tcf_pedit_init,
+	.walk		=	tcf_pedit_walker,
+	.lookup		=	tcf_pedit_search,
+};
+
+static __net_init int pedit_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK);
+}
+
+static void __net_exit pedit_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations pedit_net_ops = {
+	.init = pedit_init_net,
+	.exit = pedit_exit_net,
+	.id   = &pedit_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -227,12 +269,12 @@ MODULE_LICENSE("GPL");
 
 static int __init pedit_init_module(void)
 {
-	return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
+	return tcf_register_action(&act_pedit_ops, &pedit_net_ops);
 }
 
 static void __exit pedit_cleanup_module(void)
 {
-	tcf_unregister_action(&act_pedit_ops);
+	tcf_unregister_action(&act_pedit_ops, &pedit_net_ops);
 }
 
 module_init(pedit_init_module);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 9a1c42a43f92..330f14e302e8 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,10 +55,14 @@ struct tc_police_compat {
 
 /* Each policer is serialized by its individual spinlock */
 
-static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			      int type, struct tc_action *a)
+static int police_net_id;
+
+static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
+				 struct netlink_callback *cb, int type,
+				 struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 	struct tc_police *parm;
 	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int size;
 
 	if (nla == NULL)
@@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_POLICE_TBF]);
 
 	if (parm->index) {
-		if (tcf_hash_search(a, parm->index)) {
+		if (tcf_hash_search(tn, a, parm->index)) {
 			police = to_police(a->priv);
 			if (bind) {
 				police->tcf_bindcnt += 1;
@@ -233,7 +238,7 @@ override:
 
 	police->tcfp_t_c = ktime_get_ns();
 	police->tcf_index = parm->index ? parm->index :
-		tcf_hash_new_index(hinfo);
+		tcf_hash_new_index(tn);
 	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	spin_lock_bh(&hinfo->lock);
 	hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
@@ -342,6 +347,13 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_police_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 MODULE_AUTHOR("Alexey Kuznetsov");
 MODULE_DESCRIPTION("Policing actions");
 MODULE_LICENSE("GPL");
@@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = {
 	.act		=	tcf_act_police,
 	.dump		=	tcf_act_police_dump,
 	.init		=	tcf_act_police_locate,
-	.walk		=	tcf_act_police_walker
+	.walk		=	tcf_act_police_walker,
+	.lookup		=	tcf_police_search,
+};
+
+static __net_init int police_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK);
+}
+
+static void __net_exit police_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations police_net_ops = {
+	.init = police_init_net,
+	.exit = police_exit_net,
+	.id   = &police_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init
 police_init_module(void)
 {
-	return tcf_register_action(&act_police_ops, POL_TAB_MASK);
+	return tcf_register_action(&act_police_ops, &police_net_ops);
 }
 
 static void __exit
 police_cleanup_module(void)
 {
-	tcf_unregister_action(&act_police_ops);
+	tcf_unregister_action(&act_police_ops, &police_net_ops);
 }
 
 module_init(police_init_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index d6b708d6afdf..75b2be13fbcc 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,6 +26,8 @@
 
 #define SIMP_TAB_MASK     7
 
+static int simp_net_id;
+
 #define SIMP_MAX_DATA	32
 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
@@ -80,6 +82,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
 	struct nlattr *tb[TCA_DEF_MAX + 1];
 	struct tc_defact *parm;
 	struct tcf_defact *d;
@@ -102,9 +105,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_DEF_PARMS]);
 	defdata = nla_data(tb[TCA_DEF_DATA]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*d), bind, false);
 		if (ret)
 			return ret;
 
@@ -129,7 +132,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -161,6 +164,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_simp_ops = {
 	.kind		=	"simple",
 	.type		=	TCA_ACT_SIMP,
@@ -169,6 +188,29 @@ static struct tc_action_ops act_simp_ops = {
 	.dump		=	tcf_simp_dump,
 	.cleanup	=	tcf_simp_release,
 	.init		=	tcf_simp_init,
+	.walk		=	tcf_simp_walker,
+	.lookup		=	tcf_simp_search,
+};
+
+static __net_init int simp_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK);
+}
+
+static void __net_exit simp_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations simp_net_ops = {
+	.init = simp_init_net,
+	.exit = simp_exit_net,
+	.id   = &simp_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
@@ -177,8 +219,7 @@ MODULE_LICENSE("GPL");
 
 static int __init simp_init_module(void)
 {
-	int ret;
-	ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
+	int ret = tcf_register_action(&act_simp_ops, &simp_net_ops);
 	if (!ret)
 		pr_info("Simple TC action Loaded\n");
 	return ret;
@@ -186,7 +227,7 @@ static int __init simp_init_module(void)
 
 static void __exit simp_cleanup_module(void)
 {
-	tcf_unregister_action(&act_simp_ops);
+	tcf_unregister_action(&act_simp_ops, &simp_net_ops);
 }
 
 module_init(simp_init_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6751b5f8c046..cfcdbdc00c9b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,6 +29,8 @@
 
 #define SKBEDIT_TAB_MASK     15
 
+static int skbedit_net_id;
+
 static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 		       struct tcf_result *res)
 {
@@ -61,6 +63,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action *a,
 			    int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
 	struct tc_skbedit *parm;
 	struct tcf_skbedit *d;
@@ -98,9 +101,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*d), bind, false);
 		if (ret)
 			return ret;
 
@@ -130,7 +133,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&d->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -173,6 +176,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
+			      struct netlink_callback *cb, int type,
+			      struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_skbedit_ops = {
 	.kind		=	"skbedit",
 	.type		=	TCA_ACT_SKBEDIT,
@@ -180,6 +199,29 @@ static struct tc_action_ops act_skbedit_ops = {
 	.act		=	tcf_skbedit,
 	.dump		=	tcf_skbedit_dump,
 	.init		=	tcf_skbedit_init,
+	.walk		=	tcf_skbedit_walker,
+	.lookup		=	tcf_skbedit_search,
+};
+
+static __net_init int skbedit_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK);
+}
+
+static void __net_exit skbedit_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbedit_net_ops = {
+	.init = skbedit_init_net,
+	.exit = skbedit_exit_net,
+	.id   = &skbedit_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
@@ -188,12 +230,12 @@ MODULE_LICENSE("GPL");
 
 static int __init skbedit_init_module(void)
 {
-	return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
+	return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops);
 }
 
 static void __exit skbedit_cleanup_module(void)
 {
-	tcf_unregister_action(&act_skbedit_ops);
+	tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops);
 }
 
 module_init(skbedit_init_module);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 796785e0bf96..bab8ae0cefc0 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,6 +21,8 @@
 
 #define VLAN_TAB_MASK     15
 
+static int vlan_net_id;
+
 static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
@@ -68,6 +70,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
 	struct tc_vlan *parm;
 	struct tcf_vlan *v;
@@ -115,9 +118,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	}
 	action = parm->v_action;
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*v), bind, false);
 		if (ret)
 			return ret;
 
@@ -143,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&v->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -181,6 +184,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_vlan_ops = {
 	.kind		=	"vlan",
 	.type		=	TCA_ACT_VLAN,
@@ -188,16 +207,39 @@ static struct tc_action_ops act_vlan_ops = {
 	.act		=	tcf_vlan,
 	.dump		=	tcf_vlan_dump,
 	.init		=	tcf_vlan_init,
+	.walk		=	tcf_vlan_walker,
+	.lookup		=	tcf_vlan_search,
+};
+
+static __net_init int vlan_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK);
+}
+
+static void __net_exit vlan_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations vlan_net_ops = {
+	.init = vlan_init_net,
+	.exit = vlan_exit_net,
+	.id   = &vlan_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init vlan_init_module(void)
 {
-	return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK);
+	return tcf_register_action(&act_vlan_ops, &vlan_net_ops);
 }
 
 static void __exit vlan_cleanup_module(void)
 {
-	tcf_unregister_action(&act_vlan_ops);
+	tcf_unregister_action(&act_vlan_ops, &vlan_net_ops);
 }
 
 module_init(vlan_init_module);
-- 
cgit v1.2.3


From 65aebfc002abc1827ac7c8644a2bba0459ce3ce2 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 23 Feb 2016 12:13:54 -0500
Subject: net: dsa: add port_vlan_dump routine

Similar to port_fdb_dump, add a port_vlan_dump function to DSA drivers
which gets passed the switchdev VLAN object and callback.

This function, if implemented, takes precedence over the soon-to-be legacy
vlan_getnext/port_pvid_get approach.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt | 4 ++++
 include/net/dsa.h                    | 3 +++
 net/dsa/slave.c                      | 3 +++
 3 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index ebf21530471f..350a502e031f 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -554,6 +554,10 @@ Bridge VLAN filtering
 - port_vlan_del: bridge layer function invoked when a VLAN is removed from the
   given switch port
 
+- port_vlan_dump: bridge layer function invoked with a switchdev callback
+  function that the driver has to call for each VLAN the given port is a member
+  of. A switchdev object is used to carry the VID and bridge flags.
+
 - vlan_getnext: bridge layer function invoked to query the next configured VLAN
   in the switch, i.e. returns the bitmaps of members and untagged ports
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1c845d7bf0b2..ebc0d9ea96a1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -313,6 +313,9 @@ struct dsa_switch_driver {
 				 struct switchdev_trans *trans);
 	int	(*port_vlan_del)(struct dsa_switch *ds, int port,
 				 const struct switchdev_obj_port_vlan *vlan);
+	int	(*port_vlan_dump)(struct dsa_switch *ds, int port,
+				  struct switchdev_obj_port_vlan *vlan,
+				  int (*cb)(struct switchdev_obj *obj));
 	int	(*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid);
 	int	(*vlan_getnext)(struct dsa_switch *ds, u16 *vid,
 				unsigned long *ports, unsigned long *untagged);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 14ca9784ec0c..a9cbb72fb155 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -248,6 +248,9 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 	u16 pvid, vid = 0;
 	int err;
 
+	if (ds->drv->port_vlan_dump)
+		return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
+
 	if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3


From 477b184526a7f44164029eea720da0e0c888cac6 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 23 Feb 2016 12:13:56 -0500
Subject: net: dsa: drop vlan_getnext

The VLAN GetNext operation is specific to some switches, and thus can be
complicated to implement for some drivers.

Remove the support for the vlan_getnext/port_pvid_get approach in favor
of the generic and simpler port_vlan_dump function.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt |  9 ---------
 include/net/dsa.h                    |  3 ---
 net/dsa/slave.c                      | 35 +----------------------------------
 3 files changed, 1 insertion(+), 46 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 350a502e031f..974e9c387d1e 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -542,12 +542,6 @@ Bridge layer
 Bridge VLAN filtering
 ---------------------
 
-- port_pvid_get: bridge layer function invoked when a Port-based VLAN ID is
-  queried for the given switch port
-
-- port_pvid_set: bridge layer function invoked when a Port-based VLAN ID needs
-  to be configured on the given switch port
-
 - port_vlan_add: bridge layer function invoked when a VLAN is configured
   (tagged or untagged) for the given switch port
 
@@ -558,9 +552,6 @@ Bridge VLAN filtering
   function that the driver has to call for each VLAN the given port is a member
   of. A switchdev object is used to carry the VID and bridge flags.
 
-- vlan_getnext: bridge layer function invoked to query the next configured VLAN
-  in the switch, i.e. returns the bitmaps of members and untagged ports
-
 - port_fdb_add: bridge layer function invoked when the bridge wants to install a
   Forwarding Database entry, the switch hardware should be programmed with the
   specified address in the specified VLAN Id in the forwarding database
diff --git a/include/net/dsa.h b/include/net/dsa.h
index ebc0d9ea96a1..3dd54867174a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -316,9 +316,6 @@ struct dsa_switch_driver {
 	int	(*port_vlan_dump)(struct dsa_switch *ds, int port,
 				  struct switchdev_obj_port_vlan *vlan,
 				  int (*cb)(struct switchdev_obj *obj));
-	int	(*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid);
-	int	(*vlan_getnext)(struct dsa_switch *ds, u16 *vid,
-				unsigned long *ports, unsigned long *untagged);
 
 	/*
 	 * Forwarding database
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a9cbb72fb155..cde29239b60d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -243,44 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	DECLARE_BITMAP(members, DSA_MAX_PORTS);
-	DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
-	u16 pvid, vid = 0;
-	int err;
 
 	if (ds->drv->port_vlan_dump)
 		return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
 
-	if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
-		return -EOPNOTSUPP;
-
-	err = ds->drv->port_pvid_get(ds, p->port, &pvid);
-	if (err)
-		return err;
-
-	for (;;) {
-		err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
-		if (err)
-			break;
-
-		if (!test_bit(p->port, members))
-			continue;
-
-		memset(vlan, 0, sizeof(*vlan));
-		vlan->vid_begin = vlan->vid_end = vid;
-
-		if (vid == pvid)
-			vlan->flags |= BRIDGE_VLAN_INFO_PVID;
-
-		if (test_bit(p->port, untagged))
-			vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-
-		err = cb(&vlan->obj);
-		if (err)
-			break;
-	}
-
-	return err == -ENOENT ? 0 : err;
+	return -EOPNOTSUPP;
 }
 
 static int dsa_slave_port_fdb_add(struct net_device *dev,
-- 
cgit v1.2.3


From f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 09:25:37 -0800
Subject: net: ipv6: Make address flushing on ifdown optional

Currently, all ipv6 addresses are flushed when the interface is configured
down, including global, static addresses:

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    << nothing; all addresses have been flushed>>

Add a new sysctl to make this behavior optional. The new setting defaults to
flushing all addresses to maintain backwards compatibility. When it is set,
global addresses with no expiration time are not flushed on an admin down.
The sysctl is per-interface or system-wide for all interfaces:

    $ sysctl -w net.ipv6.conf.eth1.keep_addr_on_down=1
or
    $ sysctl -w net.ipv6.conf.all.keep_addr_on_down=1

Either setting keeps the addresses on eth1 across an admin down:

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST> mtu 1500 state DOWN qlen 1000
        inet6 2100:1::2/120 scope global tentative
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link tentative
           valid_lft forever preferred_lft forever

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   9 +++
 include/linux/ipv6.h                   |   1 +
 include/uapi/linux/ipv6.h              |   1 +
 net/ipv6/addrconf.c                    | 136 +++++++++++++++++++++++++++++----
 4 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 24ce97f42d35..d5df40c75aa4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1563,6 +1563,15 @@ temp_prefered_lft - INTEGER
 	Preferred lifetime (in seconds) for temporary addresses.
 	Default: 86400 (1 day)
 
+keep_addr_on_down - INTEGER
+	Keep all IPv6 addresses on an interface down event. If set, static
+	global addresses with no expiration time are not flushed.
+	  >0 : enabled
+	   0 : system default
+	  <0 : disabled
+
+	Default: 0 (addresses are removed)
+
 max_desync_factor - INTEGER
 	Maximum value for DESYNC_FACTOR, which is a random value
 	that ensures that clients don't synchronize with each
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4b2267e1b7c3..7edc14fb66b6 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -62,6 +62,7 @@ struct ipv6_devconf {
 		struct in6_addr secret;
 	} stable_secret;
 	__s32		use_oif_addrs_only;
+	__s32		keep_addr_on_down;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index ec117b65d5a5..395876060f50 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -176,6 +176,7 @@ enum {
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_DROP_UNSOLICITED_NA,
+	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4751f8922362..a2d6f6c242af 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3168,6 +3170,55 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+static int fixup_permanent_addr(struct inet6_dev *idev,
+				struct inet6_ifaddr *ifp)
+{
+	if (!ifp->rt) {
+		struct rt6_info *rt;
+
+		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		if (unlikely(IS_ERR(rt)))
+			return PTR_ERR(rt);
+
+		ifp->rt = rt;
+	}
+
+	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      idev->dev, 0, 0);
+	}
+
+	addrconf_dad_start(ifp);
+
+	return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+	struct inet6_ifaddr *ifp, *tmp;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	write_lock_bh(&idev->lock);
+
+	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    fixup_permanent_addr(idev, ifp) < 0) {
+			write_unlock_bh(&idev->lock);
+			ipv6_del_addr(ifp);
+			write_lock_bh(&idev->lock);
+
+			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+					     idev->dev->name, &ifp->addr);
+		}
+	}
+
+	write_unlock_bh(&idev->lock);
+}
+
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3253,6 +3304,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
+		/* restore routes for permanent addresses */
+		addrconf_permanent_addr(dev);
+
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3356,7 +3410,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa;
+	struct inet6_ifaddr *ifa, *tmp;
+	struct list_head del_list;
+	int _keep_addr;
+	bool keep_addr;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3383,6 +3440,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
+	/* aggregate the system setting and interface setting */
+	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+	if (!_keep_addr)
+		_keep_addr = idev->cnf.keep_addr_on_down;
+
+	/* combine the user config with event to determine if permanent
+	 * addresses are to be removed from address hash table
+	 */
+	keep_addr = !(how || _keep_addr <= 0);
+
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3391,9 +3458,15 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
-				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				goto restart;
+				/* combined flag + permanent flag decide if
+				 * address is retained on a down event
+				 */
+				if (!keep_addr ||
+				    !(ifa->flags & IFA_F_PERMANENT)) {
+					hlist_del_init_rcu(&ifa->addr_lst);
+					goto restart;
+				}
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3427,31 +3500,53 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	while (!list_empty(&idev->addr_list)) {
-		ifa = list_first_entry(&idev->addr_list,
-				       struct inet6_ifaddr, if_list);
-		addrconf_del_dad_work(ifa);
+	/* re-combine the user config with event to determine if permanent
+	 * addresses are to be removed from the interface list
+	 */
+	keep_addr = (!how && _keep_addr > 0);
 
-		list_del(&ifa->if_list);
+	INIT_LIST_HEAD(&del_list);
+	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
-
 		spin_lock_bh(&ifa->lock);
-		state = ifa->state;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
+
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+			/* set state to skip the notifier below */
+			state = INET6_IFADDR_STATE_DEAD;
+			ifa->state = 0;
+			if (!(ifa->flags & IFA_F_NODAD))
+				ifa->flags |= IFA_F_TENTATIVE;
+		} else {
+			state = ifa->state;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
+
+			list_del(&ifa->if_list);
+			list_add(&ifa->if_list, &del_list);
+		}
+
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
-		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
+	/* now clean up addresses to be removed */
+	while (!list_empty(&del_list)) {
+		ifa = list_first_entry(&del_list,
+				       struct inet6_ifaddr, if_list);
+		list_del(&ifa->if_list);
+
+		in6_ifa_put(ifa);
+	}
+
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4716,6 +4811,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5197,10 +5293,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		dst_hold(&ifp->rt->dst);
-
-		ip6_del_rt(ifp->rt);
+		if (ifp->rt) {
+			dst_hold(&ifp->rt->dst);
 
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
@@ -5803,6 +5901,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname       = "keep_addr_on_down",
+			.data           = &ipv6_devconf.keep_addr_on_down,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From a87cb3e48ee86d29868d3f59cfb9ce1a8fa63314 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 24 Feb 2016 10:02:52 -0800
Subject: net: Facility to report route quality of connected sockets

This patch adds the SO_CNX_ADVICE socket option (setsockopt only). The
purpose is to allow an application to give feedback to the kernel about
the quality of the network path for a connected socket. The value
argument indicates the type of quality report. For this initial patch
the only supported advice is a value of 1 which indicates "bad path,
please reroute"-- the action taken by the kernel is to call
dst_negative_advice which will attempt to choose a different ECMP route,
reset the TX hash for flow label and UDP source port in encapsulation,
etc.

This facility should be useful for connected UDP sockets where only the
application can provide any feedback about path quality. It could also
be useful for TCP applications that have additional knowledge about the
path outside of the normal TCP control loop.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/s390/include/uapi/asm/socket.h    | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 arch/xtensa/include/uapi/asm/socket.h  | 2 ++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 4 ++++
 14 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index c5fb9e6bc3a5..9e46d6e656d9 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 9de0796240a0..1fd147f09a38 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index f02e4849ae83..afbc98f02d27 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -88,5 +88,7 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index bce29166de1b..0018fad9039f 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 14aa4a6bccf1..5fe42fc7b6c5 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 5910fe294e93..2027240aafbb 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -106,4 +106,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 58b1aa01ab9f..5129f23a9ee1 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index f9cf1223422c..9c935d717df9 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x402C
 #define SO_ATTACH_REUSEPORT_EBPF	0x402D
 
+#define SO_CNX_ADVICE		0x402E
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index dd54f28ecdec..1672e3398270 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index d02e89d14fef..41b51c2f4f1b 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index d270ee91968e..31aede3af088 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -84,6 +84,8 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x0035
 #define SO_ATTACH_REUSEPORT_EBPF	0x0036
 
+#define SO_CNX_ADVICE		0x0037
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index fd3b96d1153f..81435d995e11 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index fb8a41668382..67d632f1743d 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -90,4 +90,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 46dc8ad7d050..4493ff820c2c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -987,6 +987,10 @@ set_rcvbuf:
 		sk->sk_incoming_cpu = val;
 		break;
 
+	case SO_CNX_ADVICE:
+		if (val == 1)
+			dst_negative_advice(sk);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 3f1ac7a700d039c61d8d8b99f28d605d489a60cf Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:57:59 -0800
Subject: net: ethtool: add new ETHTOOL_xLINKSETTINGS API

This patch defines a new ETHTOOL_GLINKSETTINGS/SLINKSETTINGS API,
handled by the new get_link_ksettings/set_link_ksettings callbacks.
This API provides support for most legacy ethtool_cmd fields, adds
support for larger link mode masks (up to 4064 bits, variable length),
and removes ethtool_cmd deprecated
fields (transceiver/maxrxpkt/maxtxpkt).

This API is deprecating the legacy ETHTOOL_GSET/SSET API and provides
the following backward compatibility properties:
 - legacy ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks.
 - legacy ethtool with new get/set_link_ksettings drivers: the new
   driver callbacks are used, data internally converted to legacy
   ethtool_cmd. ETHTOOL_GSET will return only the 1st 32b of each link
   mode mask. ETHTOOL_SSET will fail if user tries to set the
   ethtool_cmd deprecated fields to
   non-0 (transceiver/maxrxpkt/maxtxpkt). A kernel warning is logged if
   driver sets higher bits.
 - future ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks, internally converted to new data
   structure. Deprecated fields (transceiver/maxrxpkt/maxtxpkt) will be
   ignored and seen as 0 from user space. Note that "future"
   ethtool tool will not allow changes to these deprecated fields.
 - future ethtool with new drivers: direct call to the new callbacks.

By "future" ethtool, what is meant is:
 - query: first try ETHTOOL_GLINKSETTINGS, and fall back to ETHTOOL_GSET
   if it fails
 - set: query first and remember which of ETHTOOL_GLINKSETTINGS or
   ETHTOOL_GSET was successful
   + if ETHTOOL_GLINKSETTINGS was successful, then change config with
     ETHTOOL_SLINKSETTINGS. A failure there is final (do not try
     ETHTOOL_SSET).
   + otherwise ETHTOOL_GSET was successful, change config with
     ETHTOOL_SSET. A failure there is final (do not try
     ETHTOOL_SLINKSETTINGS).

The interaction user/kernel via the new API requires a small
ETHTOOL_GLINKSETTINGS handshake first to agree on the length of the link
mode bitmaps. If kernel doesn't agree with user, it returns the bitmap
length it is expecting from user as a negative length (and cmd field is
0). When kernel and user agree, kernel returns valid info in all
fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS).

Data structure crossing user/kernel boundary is 32/64-bit
agnostic. Converted internally to a legal kernel bitmap.

The internal __ethtool_get_settings kernel helper will gradually be
replaced by __ethtool_get_link_ksettings by the time the first
"link_settings" drivers start to appear. So this patch doesn't change
it, it will be removed before it needs to be changed.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  91 ++++++++-
 include/uapi/linux/ethtool.h | 322 +++++++++++++++++++++++-------
 net/core/ethtool.c           | 453 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 786 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 472d7d7b01c2..8a400a54c92e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <uapi/linux/ethtool.h>
 
@@ -40,9 +41,6 @@ struct compat_ethtool_rxnfc {
 
 #include <linux/rculist.h>
 
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -97,13 +95,74 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
 	return index % n_rx_rings;
 }
 
+/* number of link mode bits/ulongs handled internally by kernel */
+#define __ETHTOOL_LINK_MODE_MASK_NBITS			\
+	(__ETHTOOL_LINK_MODE_LAST + 1)
+
+/* declare a link mode bitmap */
+#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
+	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/* drivers must ignore base.cmd and base.link_mode_masks_nwords
+ * fields, but they are allowed to overwrite them (will be ignored).
+ */
+struct ethtool_link_ksettings {
+	struct ethtool_link_settings base;
+	struct {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
+	} link_modes;
+};
+
+/**
+ * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ */
+#define ethtool_link_ksettings_zero_link_mode(ptr, name)		\
+	bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/**
+ * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings
+ * link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ */
+#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)		\
+	__set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+/**
+ * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ *
+ * Returns true/false.
+ */
+#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)		\
+	test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+extern int
+__ethtool_get_link_ksettings(struct net_device *dev,
+			     struct ethtool_link_ksettings *link_ksettings);
+
+/* DEPRECATED, use __ethtool_get_link_ksettings */
+extern int __ethtool_get_settings(struct net_device *dev,
+				  struct ethtool_cmd *cmd);
+
 /**
  * struct ethtool_ops - optional netdev operations
- * @get_settings: Get various device settings including Ethernet link
+ * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Get various device settings including Ethernet link
  *	settings. The @cmd parameter is expected to have been cleared
- *	before get_settings is called. Returns a negative error code or
- *	zero.
- * @set_settings: Set various device settings including Ethernet link
+ *	before get_settings is called. Returns a negative error code
+ *	or zero.
+ * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Set various device settings including Ethernet link
  *	settings.  Returns a negative error code or zero.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
@@ -211,6 +270,19 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, ignore the inapplicable fields.
  *	Returns a negative error code or zero.
+ * @get_link_ksettings: When defined, takes precedence over the
+ *	%get_settings method. Get various device settings
+ *	including Ethernet link settings. The %cmd and
+ *	%link_mode_masks_nwords fields should be ignored (use
+ *	%__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
+ *	change to them will be overwritten by kernel. Returns a
+ *	negative error code or zero.
+ * @set_link_ksettings: When defined, takes precedence over the
+ *	%set_settings method. Set various device settings including
+ *	Ethernet link settings. The %cmd and %link_mode_masks_nwords
+ *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
+ *	instead of the latter), any change to them will be overwritten
+ *	by kernel. Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -293,6 +365,9 @@ struct ethtool_ops {
 					  struct ethtool_coalesce *);
 	int	(*set_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
-
+	int	(*get_link_ksettings)(struct net_device *,
+				      struct ethtool_link_ksettings *);
+	int	(*set_link_ksettings)(struct net_device *,
+				      const struct ethtool_link_ksettings *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f15ae02621a1..37fd6dc33de4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -21,7 +21,8 @@
  */
 
 /**
- * struct ethtool_cmd - link control and status
+ * struct ethtool_cmd - DEPRECATED, link control and status
+ * This structure is DEPRECATED, please use struct ethtool_link_settings.
  * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET
  * @supported: Bitmask of %SUPPORTED_* flags for the link modes,
  *	physical connectors and other link features for which the
@@ -1219,8 +1220,12 @@ struct ethtool_per_queue_op {
 };
 
 /* CMDs currently supported */
-#define ETHTOOL_GSET		0x00000001 /* Get settings. */
-#define ETHTOOL_SSET		0x00000002 /* Set settings. */
+#define ETHTOOL_GSET		0x00000001 /* DEPRECATED, Get settings.
+					    * Please use ETHTOOL_GLINKSETTINGS
+					    */
+#define ETHTOOL_SSET		0x00000002 /* DEPRECATED, Set settings.
+					    * Please use ETHTOOL_SLINKSETTINGS
+					    */
 #define ETHTOOL_GDRVINFO	0x00000003 /* Get driver info. */
 #define ETHTOOL_GREGS		0x00000004 /* Get NIC registers. */
 #define ETHTOOL_GWOL		0x00000005 /* Get wake-on-lan options. */
@@ -1302,73 +1307,139 @@ struct ethtool_per_queue_op {
 
 #define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
 
+#define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
+#define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
+
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
 
-#define SUPPORTED_10baseT_Half		(1 << 0)
-#define SUPPORTED_10baseT_Full		(1 << 1)
-#define SUPPORTED_100baseT_Half		(1 << 2)
-#define SUPPORTED_100baseT_Full		(1 << 3)
-#define SUPPORTED_1000baseT_Half	(1 << 4)
-#define SUPPORTED_1000baseT_Full	(1 << 5)
-#define SUPPORTED_Autoneg		(1 << 6)
-#define SUPPORTED_TP			(1 << 7)
-#define SUPPORTED_AUI			(1 << 8)
-#define SUPPORTED_MII			(1 << 9)
-#define SUPPORTED_FIBRE			(1 << 10)
-#define SUPPORTED_BNC			(1 << 11)
-#define SUPPORTED_10000baseT_Full	(1 << 12)
-#define SUPPORTED_Pause			(1 << 13)
-#define SUPPORTED_Asym_Pause		(1 << 14)
-#define SUPPORTED_2500baseX_Full	(1 << 15)
-#define SUPPORTED_Backplane		(1 << 16)
-#define SUPPORTED_1000baseKX_Full	(1 << 17)
-#define SUPPORTED_10000baseKX4_Full	(1 << 18)
-#define SUPPORTED_10000baseKR_Full	(1 << 19)
-#define SUPPORTED_10000baseR_FEC	(1 << 20)
-#define SUPPORTED_20000baseMLD2_Full	(1 << 21)
-#define SUPPORTED_20000baseKR2_Full	(1 << 22)
-#define SUPPORTED_40000baseKR4_Full	(1 << 23)
-#define SUPPORTED_40000baseCR4_Full	(1 << 24)
-#define SUPPORTED_40000baseSR4_Full	(1 << 25)
-#define SUPPORTED_40000baseLR4_Full	(1 << 26)
-#define SUPPORTED_56000baseKR4_Full	(1 << 27)
-#define SUPPORTED_56000baseCR4_Full	(1 << 28)
-#define SUPPORTED_56000baseSR4_Full	(1 << 29)
-#define SUPPORTED_56000baseLR4_Full	(1 << 30)
-
-#define ADVERTISED_10baseT_Half		(1 << 0)
-#define ADVERTISED_10baseT_Full		(1 << 1)
-#define ADVERTISED_100baseT_Half	(1 << 2)
-#define ADVERTISED_100baseT_Full	(1 << 3)
-#define ADVERTISED_1000baseT_Half	(1 << 4)
-#define ADVERTISED_1000baseT_Full	(1 << 5)
-#define ADVERTISED_Autoneg		(1 << 6)
-#define ADVERTISED_TP			(1 << 7)
-#define ADVERTISED_AUI			(1 << 8)
-#define ADVERTISED_MII			(1 << 9)
-#define ADVERTISED_FIBRE		(1 << 10)
-#define ADVERTISED_BNC			(1 << 11)
-#define ADVERTISED_10000baseT_Full	(1 << 12)
-#define ADVERTISED_Pause		(1 << 13)
-#define ADVERTISED_Asym_Pause		(1 << 14)
-#define ADVERTISED_2500baseX_Full	(1 << 15)
-#define ADVERTISED_Backplane		(1 << 16)
-#define ADVERTISED_1000baseKX_Full	(1 << 17)
-#define ADVERTISED_10000baseKX4_Full	(1 << 18)
-#define ADVERTISED_10000baseKR_Full	(1 << 19)
-#define ADVERTISED_10000baseR_FEC	(1 << 20)
-#define ADVERTISED_20000baseMLD2_Full	(1 << 21)
-#define ADVERTISED_20000baseKR2_Full	(1 << 22)
-#define ADVERTISED_40000baseKR4_Full	(1 << 23)
-#define ADVERTISED_40000baseCR4_Full	(1 << 24)
-#define ADVERTISED_40000baseSR4_Full	(1 << 25)
-#define ADVERTISED_40000baseLR4_Full	(1 << 26)
-#define ADVERTISED_56000baseKR4_Full	(1 << 27)
-#define ADVERTISED_56000baseCR4_Full	(1 << 28)
-#define ADVERTISED_56000baseSR4_Full	(1 << 29)
-#define ADVERTISED_56000baseLR4_Full	(1 << 30)
+/* Link mode bit indices */
+enum ethtool_link_mode_bit_indices {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT	= 0,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT	= 1,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT	= 2,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT	= 3,
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT	= 4,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT	= 5,
+	ETHTOOL_LINK_MODE_Autoneg_BIT		= 6,
+	ETHTOOL_LINK_MODE_TP_BIT		= 7,
+	ETHTOOL_LINK_MODE_AUI_BIT		= 8,
+	ETHTOOL_LINK_MODE_MII_BIT		= 9,
+	ETHTOOL_LINK_MODE_FIBRE_BIT		= 10,
+	ETHTOOL_LINK_MODE_BNC_BIT		= 11,
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT	= 12,
+	ETHTOOL_LINK_MODE_Pause_BIT		= 13,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT	= 14,
+	ETHTOOL_LINK_MODE_2500baseX_Full_BIT	= 15,
+	ETHTOOL_LINK_MODE_Backplane_BIT		= 16,
+	ETHTOOL_LINK_MODE_1000baseKX_Full_BIT	= 17,
+	ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT	= 18,
+	ETHTOOL_LINK_MODE_10000baseKR_Full_BIT	= 19,
+	ETHTOOL_LINK_MODE_10000baseR_FEC_BIT	= 20,
+	ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21,
+	ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT	= 22,
+	ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT	= 23,
+	ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT	= 24,
+	ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT	= 25,
+	ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT	= 26,
+	ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT	= 27,
+	ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT	= 28,
+	ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT	= 29,
+	ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT	= 30,
+
+	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+	 * macro for bits > 31. The only way to use indices > 31 is to
+	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+	 */
+
+	__ETHTOOL_LINK_MODE_LAST
+	  = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
+	(1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT))
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ */
+#define SUPPORTED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE			__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause			__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADVERTISED_* macro for bits > 31.
+ */
+#define ADVERTISED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define ADVERTISED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define ADVERTISED_100baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define ADVERTISED_100baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define ADVERTISED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define ADVERTISED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define ADVERTISED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define ADVERTISED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define ADVERTISED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define ADVERTISED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define ADVERTISED_FIBRE		__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define ADVERTISED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define ADVERTISED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define ADVERTISED_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define ADVERTISED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define ADVERTISED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define ADVERTISED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define ADVERTISED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define ADVERTISED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define ADVERTISED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define ADVERTISED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define ADVERTISED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define ADVERTISED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define ADVERTISED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define ADVERTISED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define ADVERTISED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define ADVERTISED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define ADVERTISED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define ADVERTISED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define ADVERTISED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new ADVERTISED_* macro for bits > 31, see
+ * notice above.
+ */
 
 /* The following are all involved in forcing a particular link
  * mode for the device for setting things.  When getting the
@@ -1533,4 +1604,123 @@ enum ethtool_reset_flags {
 };
 #define ETH_RESET_SHARED_SHIFT	16
 
+
+/**
+ * struct ethtool_link_settings - link control and status
+ *
+ * IMPORTANT, Backward compatibility notice: When implementing new
+ *	user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and
+ *	if it succeeds use %ETHTOOL_SLINKSETTINGS to change link
+ *	settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS
+ *	succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in
+ *	that case.  Conversely, if %ETHTOOL_GLINKSETTINGS fails, use
+ *	%ETHTOOL_GSET to query and %ETHTOOL_SSET to change link
+ *	settings; do not use %ETHTOOL_SLINKSETTINGS if
+ *	%ETHTOOL_GLINKSETTINGS failed: stick to
+ *	%ETHTOOL_GSET/%ETHTOOL_SSET in that case.
+ *
+ * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS
+ * @speed: Link speed (Mbps)
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ *	applicable.  For clause 45 PHYs this is the PRTAD.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ *	either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ *	protocols supported by the interface; 0 if unknown.
+ *	Read-only.
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
+ *	value will be %ETH_TP_MDI_INVALID.  Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ *	%ETH_TP_MDI_*.  If MDI(-X) control is not implemented, reads
+ *	yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ *	When written successfully, the link should be renegotiated if
+ *	necessary.
+ * @link_mode_masks_nwords: Number of 32-bit words for each of the
+ *	supported, advertising, lp_advertising link mode bitmaps. For
+ *	%ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
+ *	(>= 0); on return, if handshake in progress, negative if
+ *	request size unsupported by kernel: absolute value indicates
+ *	kernel recommended size and cmd field is 0, as well as all the
+ *	other fields; otherwise (handshake completed), strictly
+ *	positive to indicate size used by kernel and cmd field is
+ *	%ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
+ *	%ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
+ *	value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
+ *	refused. For drivers: ignore this field (use kernel's
+ *	__ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will
+ *	be overwritten by kernel.
+ * @supported: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features for which the interface
+ *	supports autonegotiation or auto-detection.  Read-only.
+ * @advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features that are advertised through
+ *	autonegotiation or enabled for auto-detection.
+ * @lp_advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, and other
+ *	link features that the link partner advertised through
+ *	autonegotiation; 0 if unknown or not applicable.  Read-only.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes.  If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted.  For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
+ * are not available in %ethtool_link_settings. Until all drivers are
+ * converted to ignore them or to the new %ethtool_link_settings API,
+ * for both queries and changes, users should always try
+ * %ETHTOOL_GLINKSETTINGS first, and if it fails with -EOPNOTSUPP stick
+ * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
+ * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
+ * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
+ * either %ethtool_cmd or %ethtool_link_settings).
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver.  They should use
+ * %ETHTOOL_GLINKSETTINGS to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SLINKSETTINGS.
+ *
+ * Drivers that implement %get_link_ksettings and/or
+ * %set_link_ksettings should ignore the @cmd
+ * and @link_mode_masks_nwords fields (any change to them overwritten
+ * by kernel), and rely only on kernel's internal
+ * %__ETHTOOL_LINK_MODE_MASK_NBITS and
+ * %ethtool_link_mode_mask_t. Drivers that implement
+ * %set_link_ksettings() should validate all fields other than @cmd
+ * and @link_mode_masks_nwords that are not described as read-only or
+ * deprecated, and must ignore all fields described as read-only.
+ */
+struct ethtool_link_settings {
+	__u32	cmd;
+	__u32	speed;
+	__u8	duplex;
+	__u8	port;
+	__u8	phy_address;
+	__u8	autoneg;
+	__u8	mdio_support;
+	__u8	eth_tp_mdix;
+	__u8	eth_tp_mdix_ctrl;
+	__s8	link_mode_masks_nwords;
+	__u32	reserved[8];
+	__u32	link_mode_masks[0];
+	/* layout of link_mode_masks fields:
+	 * __u32 map_supported[link_mode_masks_nwords];
+	 * __u32 map_advertising[link_mode_masks_nwords];
+	 * __u32 map_lp_advertising[link_mode_masks_nwords];
+	 */
+};
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2406101002b1..edcec56ed228 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -387,6 +387,359 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	dst[0] = legacy_u32;
+}
+
+/* return false if src had higher bits set. lower bits always updated. */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					    const unsigned long *src)
+{
+	bool retval = true;
+
+	/* TODO: following test will soon always be true */
+	if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+		bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		bitmap_fill(ext, 32);
+		bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		if (bitmap_intersects(ext, src,
+				      __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+			/* src mask goes beyond bit 31 */
+			retval = false;
+		}
+	}
+	*legacy_u32 = src[0];
+	return retval;
+}
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	bool retval = true;
+
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that driver is still using these
+	 * deprecated legacy fields, and they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	if (legacy_settings->transceiver ||
+	    legacy_settings->maxtxpkt ||
+	    legacy_settings->maxrxpkt)
+		retval = false;
+
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.supported,
+		legacy_settings->supported);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.advertising,
+		legacy_settings->advertising);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+	link_ksettings->base.speed
+		= ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex
+		= legacy_settings->duplex;
+	link_ksettings->base.port
+		= legacy_settings->port;
+	link_ksettings->base.phy_address
+		= legacy_settings->phy_address;
+	link_ksettings->base.autoneg
+		= legacy_settings->autoneg;
+	link_ksettings->base.mdio_support
+		= legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix
+		= legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl
+		= legacy_settings->eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+	struct ethtool_cmd *legacy_settings,
+	const struct ethtool_link_ksettings *link_ksettings)
+{
+	bool retval = true;
+
+	memset(legacy_settings, 0, sizeof(*legacy_settings));
+	/* this also clears the deprecated fields in legacy structure:
+	 * __u8		transceiver;
+	 * __u32	maxtxpkt;
+	 * __u32	maxrxpkt;
+	 */
+
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->supported,
+		link_ksettings->link_modes.supported);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->advertising,
+		link_ksettings->link_modes.advertising);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->lp_advertising,
+		link_ksettings->link_modes.lp_advertising);
+	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+	legacy_settings->duplex
+		= link_ksettings->base.duplex;
+	legacy_settings->port
+		= link_ksettings->base.port;
+	legacy_settings->phy_address
+		= link_ksettings->base.phy_address;
+	legacy_settings->autoneg
+		= link_ksettings->base.autoneg;
+	legacy_settings->mdio_support
+		= link_ksettings->base.mdio_support;
+	legacy_settings->eth_tp_mdix
+		= link_ksettings->base.eth_tp_mdix;
+	legacy_settings->eth_tp_mdix_ctrl
+		= link_ksettings->base.eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32			\
+	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+	struct ethtool_link_settings base;
+	struct {
+		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+	} link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *link_ksettings)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		memset(link_ksettings, 0, sizeof(*link_ksettings));
+		return dev->ethtool_ops->get_link_ksettings(dev,
+							    link_ksettings);
+	}
+
+	/* driver doesn't support %ethtool_link_ksettings API. revert to
+	 * legacy %ethtool_cmd API, unless it's not supported either.
+	 * TODO: remove when ethtool_ops::get_settings disappears internally
+	 */
+	err = __ethtool_get_settings(dev, &cmd);
+	if (err < 0)
+		return err;
+
+	/* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+	 */
+	convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+	return err;
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+					 const void __user *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+		return -EFAULT;
+
+	memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+	bitmap_from_u32array(to->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+
+	return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to the user space
+ * ethtool_link_usettings at @to: base is copied as is, each link mode
+ * bitmap is repacked into u32 words. return 0 on success, else -EFAULT.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+			      const struct ethtool_link_ksettings *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	memcpy(&link_usettings.base, &from->base, sizeof(link_usettings.base));
+	bitmap_to_u32array(link_usettings.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool_ops::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err = 0;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* handle bitmap nbits handshake */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords) {
+		/* wrong link mode nbits requested */
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		/* keep cmd field reset to 0 */
+		/* send back number of words required as negative val */
+		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+				   "need too many bits for link modes!");
+		link_ksettings.base.link_mode_masks_nwords
+			= -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+		/* copy the base fields back to user, not the link
+		 * mode bitmaps
+		 */
+		if (copy_to_user(useraddr, &link_ksettings.base,
+				 sizeof(link_ksettings.base)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* handshake successful: user/kernel agree on
+	 * link_mode_masks_nwords
+	 */
+
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+	if (err < 0)
+		return err;
+
+	/* make sure we tell the right values to user */
+	link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+	link_ksettings.base.link_mode_masks_nwords
+		= __ETHTOOL_LINK_MODE_MASK_NU32;
+
+	return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool_ops::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::set_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->set_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* make sure nbits field has expected value */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	/* copy the whole structure, now that we know it has expected
+	 * format
+	 */
+	err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+	if (err)
+		return err;
+
+	/* re-check nwords field, just in case */
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Internal kernel helper to query a device ethtool_cmd settings.
+ *
+ * Note about transition to ethtool_link_settings API: We do not need
+ * (or want) this function to support "dev" instances that implement
+ * the ethtool_link_settings API as we will update the drivers calling
+ * this function to call __ethtool_get_link_ksettings instead, before
+ * the first drivers implement ethtool_ops::get_link_ksettings.
+ *
+ * TODO 1: at least make this function static when no driver is using it
+ * TODO 2: remove when ethtool_ops::get_settings disappears internally
+ */
 int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	ASSERT_RTNL();
@@ -400,30 +753,112 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(__ethtool_get_settings);
 
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+	char name[sizeof(current->comm)];
+
+	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+		     get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of the calling process) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
-	int err;
 	struct ethtool_cmd cmd;
 
-	err = __ethtool_get_settings(dev, &cmd);
-	if (err < 0)
-		return err;
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		/* First, use link_ksettings API if it is supported */
+		int err;
+		struct ethtool_link_ksettings link_ksettings;
+
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		err = dev->ethtool_ops->get_link_ksettings(dev,
+							   &link_ksettings);
+		if (err < 0)
+			return err;
+		if (!convert_link_ksettings_to_legacy_settings(&cmd,
+							       &link_ksettings))
+			warn_incomplete_ethtool_legacy_settings_conversion(
+				"link modes are only partially reported");
+
+		/* send a sensible cmd tag back to user */
+		cmd.cmd = ETHTOOL_GSET;
+	} else {
+		int err;
+		/* TODO: return -EOPNOTSUPP when
+		 * ethtool_ops::get_settings disappears internally
+		 */
+
+		/* driver doesn't support %ethtool_link_ksettings
+		 * API. revert to legacy %ethtool_cmd API, unless it's
+		 * not supported either.
+		 */
+		err = __ethtool_get_settings(dev, &cmd);
+		if (err < 0)
+			return err;
+	}
 
 	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
 		return -EFAULT;
+
 	return 0;
 }
 
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), i.e. sets any
+ * of them to a non-zero value, the request is rejected with -EINVAL
+ * (no warning is logged here).
+ */
 static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_cmd cmd;
 
-	if (!dev->ethtool_ops->set_settings)
-		return -EOPNOTSUPP;
+	ASSERT_RTNL();
 
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	/* first, try new %ethtool_link_ksettings API. */
+	if (dev->ethtool_ops->set_link_ksettings) {
+		struct ethtool_link_ksettings link_ksettings;
+
+		if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+							       &cmd))
+			return -EINVAL;
+
+		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+		link_ksettings.base.link_mode_masks_nwords
+			= __ETHTOOL_LINK_MODE_MASK_NU32;
+		return dev->ethtool_ops->set_link_ksettings(dev,
+							    &link_ksettings);
+	}
+
+	/* legacy %ethtool_cmd API */
+
+	/* TODO: return -EOPNOTSUPP when ethtool_ops::set_settings
+	 * disappears internally
+	 */
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
@@ -2252,6 +2687,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_PERQUEUE:
 		rc = ethtool_set_per_queue(dev, useraddr);
 		break;
+	case ETHTOOL_GLINKSETTINGS:
+		rc = ethtool_get_link_ksettings(dev, useraddr);
+		break;
+	case ETHTOOL_SLINKSETTINGS:
+		rc = ethtool_set_link_ksettings(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 17605b961f766757fddec20810453fc51b266e77 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:07 -0800
Subject: net: rdma: use __ethtool_get_ksettings

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/rdma/ib_addr.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index c34c9002460c..931a47ba4571 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -262,24 +262,22 @@ static inline enum ib_mtu iboe_get_mtu(int mtu)
 
 static inline int iboe_get_rate(struct net_device *dev)
 {
-	struct ethtool_cmd cmd;
-	u32 speed;
+	struct ethtool_link_ksettings cmd;
 	int err;
 
 	rtnl_lock();
-	err = __ethtool_get_settings(dev, &cmd);
+	err = __ethtool_get_link_ksettings(dev, &cmd);
 	rtnl_unlock();
 	if (err)
 		return IB_RATE_PORT_CURRENT;
 
-	speed = ethtool_cmd_speed(&cmd);
-	if (speed >= 40000)
+	if (cmd.base.speed >= 40000)
 		return IB_RATE_40_GBPS;
-	else if (speed >= 30000)
+	else if (cmd.base.speed >= 30000)
 		return IB_RATE_30_GBPS;
-	else if (speed >= 20000)
+	else if (cmd.base.speed >= 20000)
 		return IB_RATE_20_GBPS;
-	else if (speed >= 10000)
+	else if (cmd.base.speed >= 10000)
 		return IB_RATE_10_GBPS;
 	else
 		return IB_RATE_PORT_CURRENT;
-- 
cgit v1.2.3


From 3237fc63a3297d472a8cec33cb914f20570cfc23 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:11 -0800
Subject: net: ethtool: remove unused __ethtool_get_settings

replaced by __ethtool_get_link_ksettings.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  4 ----
 net/core/ethtool.c      | 45 ++++++++++++++-------------------------------
 2 files changed, 14 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8a400a54c92e..e2b7bf27c03e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -150,10 +150,6 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
-/* DEPRECATED, use __ethtool_get_link_ksettings */
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * struct ethtool_ops - optional netdev operations
  * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index edcec56ed228..2966cd0d7c93 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -551,7 +551,12 @@ int __ethtool_get_link_ksettings(struct net_device *dev,
 	 * legacy %ethtool_cmd API, unless it's not supported either.
 	 * TODO: remove when ethtool_ops::get_settings disappears internally
 	 */
-	err = __ethtool_get_settings(dev, &cmd);
+	if (!dev->ethtool_ops->get_settings)
+		return -EOPNOTSUPP;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cmd = ETHTOOL_GSET;
+	err = dev->ethtool_ops->get_settings(dev, &cmd);
 	if (err < 0)
 		return err;
 
@@ -729,30 +734,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 }
 
-/* Internal kernel helper to query a device ethtool_cmd settings.
- *
- * Note about transition to ethtool_link_settings API: We do not need
- * (or want) this function to support "dev" instances that implement
- * the ethtool_link_settings API as we will update the drivers calling
- * this function to call __ethtool_get_link_ksettings instead, before
- * the first drivers implement ethtool_ops::get_link_ksettings.
- *
- * TODO 1: at least make this function static when no driver is using it
- * TODO 2: remove when ethtool_ops::get_settings disappears internally
- */
-int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-	ASSERT_RTNL();
-
-	if (!dev->ethtool_ops->get_settings)
-		return -EOPNOTSUPP;
-
-	memset(cmd, 0, sizeof(struct ethtool_cmd));
-	cmd->cmd = ETHTOOL_GSET;
-	return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(__ethtool_get_settings);
-
 static void
 warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
 {
@@ -796,16 +777,18 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 		/* send a sensible cmd tag back to user */
 		cmd.cmd = ETHTOOL_GSET;
 	} else {
-		int err;
-		/* TODO: return -EOPNOTSUPP when
-		 * ethtool_ops::get_settings disappears internally
-		 */
-
 		/* driver doesn't support %ethtool_link_ksettings
 		 * API. revert to legacy %ethtool_cmd API, unless it's
 		 * not supported either.
 		 */
-		err = __ethtool_get_settings(dev, &cmd);
+		int err;
+
+		if (!dev->ethtool_ops->get_settings)
+			return -EOPNOTSUPP;
+
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.cmd = ETHTOOL_GSET;
+		err = dev->ethtool_ops->get_settings(dev, &cmd);
 		if (err < 0)
 			return err;
 	}
-- 
cgit v1.2.3


From 3f2fb9a834cb1fcddbae22deca7fde136944dc89 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 11:47:02 -0800
Subject: net: l3mdev: address selection should only consider devices in L3
 domain

David Lamparter noted a use case where the source address selection fails
to pick an address from a VRF interface - unnumbered interfaces.

Relevant commands from his script:
    ip addr add 9.9.9.9/32 dev lo
    ip link set lo up

    ip link add name vrf0 type vrf table 101
    ip rule add oif vrf0 table 101
    ip rule add iif vrf0 table 101
    ip link set vrf0 up
    ip addr add 10.0.0.3/32 dev vrf0

    ip link add name dummy2 type dummy
    ip link set dummy2 master vrf0 up

    --> note dummy2 has no address - unnumbered device

    ip route add 10.2.2.2/32 dev dummy2 table 101
    ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02

    tcpdump -ni dummy2 &

And using ping instead of his socat example:
    $ ping -I vrf0 -c1 10.2.2.2
    ping: Warning: source address might be selected on device other than vrf0.
    PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data.

From tcpdump:
    12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64

Note the source address is from lo and is not a VRF local address. With
this patch:

    $ ping -I vrf0 -c1 10.2.2.2
    PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data.

From tcpdump:
    12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64

Now the source address comes from vrf0.

The ipv4 function for selecting source address takes a const argument.
Removing the const requires touching a lot of places, so instead
l3mdev_master_ifindex_rcu is changed to take a const argument and then
do the typecast to non-const as required by netdev_master_upper_dev_get_rcu.
This is similar to what l3mdev_fib_table_rcu does.

IPv6 for unnumbered interfaces appears to be selecting the addresses
properly.

Cc: David Lamparter <david@opensourcerouting.org>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/l3mdev.h |  4 ++--
 net/ipv4/devinet.c   |  5 +++++
 net/l3mdev/l3mdev.c  | 11 +++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 5567d46b3cff..c43a9c73de5e 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -39,7 +39,7 @@ struct l3mdev_ops {
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 
-int l3mdev_master_ifindex_rcu(struct net_device *dev);
+int l3mdev_master_ifindex_rcu(const struct net_device *dev);
 static inline int l3mdev_master_ifindex(struct net_device *dev)
 {
 	int ifindex;
@@ -179,7 +179,7 @@ struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
 
 #else
 
-static inline int l3mdev_master_ifindex_rcu(struct net_device *dev)
+static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
 {
 	return 0;
 }
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 29b8d3a7b19b..18d510fa7ee2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1194,6 +1194,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 	__be32 addr = 0;
 	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
+	int master_idx;
 
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
@@ -1214,12 +1215,16 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 	if (addr)
 		goto out_unlock;
 no_in_dev:
+	master_idx = l3mdev_master_ifindex_rcu(dev);
 
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is important that lo is the first interface
 	   in dev_base list.
 	 */
 	for_each_netdev_rcu(net, dev) {
+		if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+			continue;
+
 		in_dev = __in_dev_get_rcu(dev);
 		if (!in_dev)
 			continue;
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8e5ead366e7f..e925037fa0df 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -17,7 +17,7 @@
  *	@dev: targeted interface
  */
 
-int l3mdev_master_ifindex_rcu(struct net_device *dev)
+int l3mdev_master_ifindex_rcu(const struct net_device *dev)
 {
 	int ifindex = 0;
 
@@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev)
 		ifindex = dev->ifindex;
 	} else if (netif_is_l3_slave(dev)) {
 		struct net_device *master;
+		struct net_device *_dev = (struct net_device *)dev;
 
-		master = netdev_master_upper_dev_get_rcu(dev);
+		/* netdev_master_upper_dev_get_rcu calls
+		 * list_first_or_null_rcu to walk the upper dev list.
+		 * list_first_or_null_rcu does not handle a const arg. We aren't
+		 * making changes, just want the master device from that list so
+		 * typecast to remove the const
+		 */
+		master = netdev_master_upper_dev_get_rcu(_dev);
 		if (master)
 			ifindex = master->ifindex;
 	}
-- 
cgit v1.2.3


From b07edbe1cf3dae9ba81f24888e2f2a9dbe778918 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Feb 2016 17:24:08 +0100
Subject: netfilter: meta: add PRANDOM support

Can be used to randomly match packets, e.g. for statistical traffic sampling.

See commit 3ad0040573b0c00f8848
("bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs")
for more info on why this doesn't use prandom_u32 directly.

Unlike bpf, nft_meta can be built as a module, so add an EXPORT_SYMBOL
for prandom_seed_full_state too.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 lib/random32.c                           |  1 +
 net/netfilter/nft_meta.c                 | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index be41ffc128b8..b19be0a098c0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -681,6 +681,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_IIFGROUP: packet input interface group
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
+ * @NFT_META_PRANDOM: a 32bit pseudo-random number
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -707,6 +708,7 @@ enum nft_meta_keys {
 	NFT_META_IIFGROUP,
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
+	NFT_META_PRANDOM,
 };
 
 /**
diff --git a/lib/random32.c b/lib/random32.c
index 12111910ccd0..510d1ce7d4d2 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -255,6 +255,7 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
 		prandom_warmup(state);
 	}
 }
+EXPORT_SYMBOL(prandom_seed_full_state);
 
 /*
  *	Generate better values after random number generator
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe885bf271c5..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -28,6 +28,8 @@
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
@@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
+	case NFT_META_PRANDOM: {
+		struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+		*dest = prandom_u32_state(state);
+		break;
+	}
 	default:
 		WARN_ON(1);
 		goto err;
@@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_OIFNAME:
 		len = IFNAMSIZ;
 		break;
+	case NFT_META_PRANDOM:
+		prandom_init_once(&nft_prandom_state);
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 86a7996cc8a078793670d82ed97d5a99bb4e8496 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:00 -0800
Subject: net_sched: introduce qdisc_replace() helper

Remove nearly duplicated code and prepare for the following patch.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 17 +++++++++++++++++
 net/sched/sch_cbq.c       |  7 +------
 net/sched/sch_drr.c       |  6 +-----
 net/sched/sch_dsmark.c    |  8 +-------
 net/sched/sch_hfsc.c      |  6 +-----
 net/sched/sch_htb.c       |  9 +--------
 net/sched/sch_multiq.c    |  8 +-------
 net/sched/sch_netem.c     | 10 +---------
 net/sched/sch_prio.c      |  8 +-------
 net/sched/sch_qfq.c       |  6 +-----
 net/sched/sch_red.c       |  7 +------
 net/sched/sch_sfb.c       |  7 +------
 net/sched/sch_tbf.c       |  8 +-------
 13 files changed, 29 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 636a362a0e03..8fdad9f7a2fb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -707,6 +707,23 @@ static inline void qdisc_reset_queue(struct Qdisc *sch)
 	sch->qstats.backlog = 0;
 }
 
+static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
+					  struct Qdisc **pold)
+{
+	struct Qdisc *old;
+
+	sch_tree_lock(sch);
+	old = *pold;
+	*pold = new;
+	if (old != NULL) {
+		qdisc_tree_decrease_qlen(old, old->q.qlen);
+		qdisc_reset(old);
+	}
+	sch_tree_unlock(sch);
+
+	return old;
+}
+
 static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch,
 					      struct sk_buff_head *list)
 {
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index c538d9e4a8f6..7f8474cdce32 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 			new->reshape_fail = cbq_reshape_fail;
 #endif
 	}
-	sch_tree_lock(sch);
-	*old = cl->q;
-	cl->q = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
 
+	*old = qdisc_replace(sch, new, &cl->q);
 	return 0;
 }
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a1cd778240cd..b96c9a8e70ab 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -226,11 +226,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	drr_purge_queue(cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index f357f34d02d2..cfddb1c635c3 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	*old = p->q;
-	p->q = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &p->q);
 	return 0;
 }
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index b7ebe2c87586..089f3b667d36 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1215,11 +1215,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	hfsc_purge_queue(sch, cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 15ccd7f8fb2a..0efbcf358cd0 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1163,14 +1163,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 				     cl->common.classid)) == NULL)
 		return -ENOBUFS;
 
-	sch_tree_lock(sch);
-	*old = cl->un.leaf.q;
-	cl->un.leaf.q = new;
-	if (*old != NULL) {
-		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-		qdisc_reset(*old);
-	}
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
 	return 0;
 }
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 4e904ca0af9d..a0103a138563 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -303,13 +303,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->queues[band];
-	q->queues[band] = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->queues[band]);
 	return 0;
 }
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5abd1d9de989..0a6ddaf7f561 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1037,15 +1037,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	if (*old) {
-		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-		qdisc_reset(*old);
-	}
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index ba6487f2741f..1b4aaec64a24 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -268,13 +268,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->queues[band];
-	q->queues[band] = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->queues[band]);
 	return 0;
 }
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 3dc3a6e56052..b5c52caf2e73 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -617,11 +617,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	qfq_purge_queue(cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 6c0534cc7758..d5abcee454d8 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -313,12 +313,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 5bbb6332ec57..0e74e55fda15 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -606,12 +606,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a4afde14e865..56a1aef3495f 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -502,13 +502,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2ccccf5fb43ff62b2b96cc58d95fc0b3596516e4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:01 -0800
Subject: net_sched: update hierarchical backlog too

When the bottom qdisc decides to, for example, drop a packet,
it calls qdisc_tree_decrease_qlen() to update the queue length
for all its ancestors. We need to update the backlog too, to
keep the stats on the root qdisc accurate.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/codel.h       |  4 ++++
 include/net/sch_generic.h |  5 +++--
 net/sched/sch_api.c       |  8 +++++---
 net/sched/sch_cbq.c       |  5 +++--
 net/sched/sch_choke.c     |  6 ++++--
 net/sched/sch_codel.c     | 10 ++++++----
 net/sched/sch_drr.c       |  3 ++-
 net/sched/sch_fq.c        |  4 +++-
 net/sched/sch_fq_codel.c  | 17 ++++++++++++-----
 net/sched/sch_hfsc.c      |  3 ++-
 net/sched/sch_hhf.c       | 10 +++++++---
 net/sched/sch_htb.c       | 10 ++++++----
 net/sched/sch_multiq.c    |  8 +++++---
 net/sched/sch_netem.c     |  3 ++-
 net/sched/sch_pie.c       |  5 +++--
 net/sched/sch_prio.c      |  7 ++++---
 net/sched/sch_qfq.c       |  3 ++-
 net/sched/sch_red.c       |  3 ++-
 net/sched/sch_sfb.c       |  3 ++-
 net/sched/sch_sfq.c       | 16 +++++++++-------
 net/sched/sch_tbf.c       |  7 +++++--
 21 files changed, 91 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/net/codel.h b/include/net/codel.h
index 267e70210061..d168aca115cc 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -162,12 +162,14 @@ struct codel_vars {
  * struct codel_stats - contains codel shared variables and stats
  * @maxpacket:	largest packet we've seen so far
  * @drop_count:	temp count of dropped packets in dequeue()
+ * @drop_len:	bytes of dropped packets in dequeue()
  * ecn_mark:	number of packets we ECN marked instead of dropping
  * ce_mark:	number of packets CE marked because sojourn time was above ce_threshold
  */
 struct codel_stats {
 	u32		maxpacket;
 	u32		drop_count;
+	u32		drop_len;
 	u32		ecn_mark;
 	u32		ce_mark;
 };
@@ -308,6 +310,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 								  vars->rec_inv_sqrt);
 					goto end;
 				}
+				stats->drop_len += qdisc_pkt_len(skb);
 				qdisc_drop(skb, sch);
 				stats->drop_count++;
 				skb = dequeue_func(vars, sch);
@@ -330,6 +333,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 		if (params->ecn && INET_ECN_set_ce(skb)) {
 			stats->ecn_mark++;
 		} else {
+			stats->drop_len += qdisc_pkt_len(skb);
 			qdisc_drop(skb, sch);
 			stats->drop_count++;
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 8fdad9f7a2fb..e5bba897d206 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -396,7 +396,8 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 			      struct Qdisc *qdisc);
 void qdisc_reset(struct Qdisc *qdisc);
 void qdisc_destroy(struct Qdisc *qdisc);
-void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n);
+void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
+			       unsigned int len);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops);
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
@@ -716,7 +717,7 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_decrease_qlen(old, old->q.qlen);
+		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
 		qdisc_reset(old);
 	}
 	sch_tree_unlock(sch);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index de1e176e35cc..3b180ff72f79 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -744,14 +744,15 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
 	return 0;
 }
 
-void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
+void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
+			       unsigned int len)
 {
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	u32 parentid;
 	int drops;
 
-	if (n == 0)
+	if (n == 0 && len == 0)
 		return;
 	drops = max_t(int, n, 0);
 	rcu_read_lock();
@@ -774,11 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 			cops->put(sch, cl);
 		}
 		sch->q.qlen -= n;
+		sch->qstats.backlog -= len;
 		__qdisc_qstats_drop(sch, drops);
 	}
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
+EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 
 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 			       struct nlmsghdr *n, u32 clid,
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 7f8474cdce32..baafddf229ce 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1909,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl = (struct cbq_class *)arg;
-	unsigned int qlen;
+	unsigned int qlen, backlog;
 
 	if (cl->filters || cl->children || cl == &q->link)
 		return -EBUSY;
@@ -1917,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 	sch_tree_lock(sch);
 
 	qlen = cl->q->q.qlen;
+	backlog = cl->q->qstats.backlog;
 	qdisc_reset(cl->q);
-	qdisc_tree_decrease_qlen(cl->q, qlen);
+	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
 
 	if (cl->next_alive)
 		cbq_deactivate_class(cl);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 5ffb8b8337c7..0a08c860eee4 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
 		choke_zap_tail_holes(q);
 
 	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
 	qdisc_drop(skb, sch);
-	qdisc_tree_decrease_qlen(sch, 1);
 	--sch->q.qlen;
 }
 
@@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 		old = q->tab;
 		if (old) {
 			unsigned int oqlen = sch->q.qlen, tail = 0;
+			unsigned dropped = 0;
 
 			while (q->head != q->tail) {
 				struct sk_buff *skb = q->tab[q->head];
@@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 					ntab[tail++] = skb;
 					continue;
 				}
+				dropped += qdisc_pkt_len(skb);
 				qdisc_qstats_backlog_dec(sch, skb);
 				--sch->q.qlen;
 				qdisc_drop(skb, sch);
 			}
-			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
 			q->head = 0;
 			q->tail = tail;
 		}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 535007d5f0b5..9b7e2980ee5c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
 
 	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
 
-	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
 	 */
 	if (q->stats.drop_count && sch->q.qlen) {
-		qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+		qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
 		q->stats.drop_count = 0;
+		q->stats.drop_len = 0;
 	}
 	if (skb)
 		qdisc_bstats_update(sch, skb);
@@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_CODEL_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	int err;
 
 	if (!opt)
@@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+		dropped += qdisc_pkt_len(skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_drop(skb, sch);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b96c9a8e70ab..a63e879e8975 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
 static void drr_purge_queue(struct drr_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 109b2322778f..3c6a47d66a04 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_FQ_MAX + 1];
 	int err, drop_count = 0;
+	unsigned drop_len = 0;
 	u32 fq_log;
 
 	if (!opt)
@@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 
 		if (!skb)
 			break;
+		drop_len += qdisc_pkt_len(skb);
 		kfree_skb(skb);
 		drop_count++;
 	}
-	qdisc_tree_decrease_qlen(sch, drop_count);
+	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
 
 	sch_tree_unlock(sch);
 	return err;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 4c834e93dafb..d3fc8f9dd3d4 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
-	unsigned int idx;
+	unsigned int idx, prev_backlog;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
 
@@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= sch->limit)
 		return NET_XMIT_SUCCESS;
 
+	prev_backlog = sch->qstats.backlog;
 	q->drop_overlimit++;
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
@@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
 	struct fq_codel_flow *flow;
 	struct list_head *head;
 	u32 prev_drop_count, prev_ecn_mark;
+	unsigned int prev_backlog;
 
 begin:
 	head = &q->new_flows;
@@ -259,6 +261,7 @@ begin:
 
 	prev_drop_count = q->cstats.drop_count;
 	prev_ecn_mark = q->cstats.ecn_mark;
+	prev_backlog = sch->qstats.backlog;
 
 	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
 			    dequeue);
@@ -276,12 +279,14 @@ begin:
 	}
 	qdisc_bstats_update(sch, skb);
 	flow->deficit -= qdisc_pkt_len(skb);
-	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
 	 */
 	if (q->cstats.drop_count && sch->q.qlen) {
-		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+		qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
+					  q->cstats.drop_len);
 		q->cstats.drop_count = 0;
+		q->cstats.drop_len = 0;
 	}
 	return skb;
 }
@@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_codel_dequeue(sch);
 
+		q->cstats.drop_len += qdisc_pkt_len(skb);
 		kfree_skb(skb);
 		q->cstats.drop_count++;
 	}
-	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+	qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
 	q->cstats.drop_count = 0;
+	q->cstats.drop_len = 0;
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 089f3b667d36..d783d7cc3348 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -895,9 +895,10 @@ static void
 hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static void
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 86b04e31e60b..13d6f83ec491 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	enum wdrr_bucket_idx idx;
 	struct wdrr_bucket *bucket;
+	unsigned int prev_backlog;
 
 	idx = hhf_classify(skb, sch);
 
@@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= sch->limit)
 		return NET_XMIT_SUCCESS;
 
+	prev_backlog = sch->qstats.backlog;
 	q->drop_overlimit++;
 	/* Return Congestion Notification only if we dropped a packet from this
 	 * bucket.
@@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this. */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_HHF_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, prev_backlog;
 	int err;
 	u64 non_hh_quantum;
 	u32 new_quantum = q->quantum;
@@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 	}
 
 	qlen = sch->q.qlen;
+	prev_backlog = sch->qstats.backlog;
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = hhf_dequeue(sch);
 
 		kfree_skb(skb);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
+				  prev_backlog - sch->qstats.backlog);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0efbcf358cd0..846a7f98cef9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1265,7 +1265,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)arg;
-	unsigned int qlen;
 	struct Qdisc *new_q = NULL;
 	int last_child = 0;
 
@@ -1285,9 +1284,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 	sch_tree_lock(sch);
 
 	if (!cl->level) {
-		qlen = cl->un.leaf.q->q.qlen;
+		unsigned int qlen = cl->un.leaf.q->q.qlen;
+		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
+
 		qdisc_reset(cl->un.leaf.q);
-		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
+		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
 	}
 
 	/* delete from hash and active; remainder in destroy_class */
@@ -1421,10 +1422,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		sch_tree_lock(sch);
 		if (parent && !parent->level) {
 			unsigned int qlen = parent->un.leaf.q->q.qlen;
+			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
 
 			/* turn parent into inner node */
 			qdisc_reset(parent->un.leaf.q);
-			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
+			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
 			qdisc_destroy(parent->un.leaf.q);
 			if (parent->prio_activity)
 				htb_deactivate(q, parent);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index a0103a138563..bcdd54bb101c 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 		if (q->queues[i] != &noop_qdisc) {
 			struct Qdisc *child = q->queues[i];
 			q->queues[i] = &noop_qdisc;
-			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_tree_reduce_backlog(child, child->q.qlen,
+						  child->qstats.backlog);
 			qdisc_destroy(child);
 		}
 	}
@@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 				q->queues[i] = child;
 
 				if (old != &noop_qdisc) {
-					qdisc_tree_decrease_qlen(old,
-								 old->q.qlen);
+					qdisc_tree_reduce_backlog(old,
+								  old->q.qlen,
+								  old->qstats.backlog);
 					qdisc_destroy(old);
 				}
 				sch_tree_unlock(sch);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 0a6ddaf7f561..9640bb39a5d2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -598,7 +598,8 @@ deliver:
 				if (unlikely(err != NET_XMIT_SUCCESS)) {
 					if (net_xmit_drop_count(err)) {
 						qdisc_qstats_drop(sch);
-						qdisc_tree_decrease_qlen(sch, 1);
+						qdisc_tree_reduce_backlog(sch, 1,
+									  qdisc_pkt_len(skb));
 					}
 				}
 				goto tfifo_dequeue;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index b783a446d884..71ae3b9629f9 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_PIE_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	int err;
 
 	if (!opt)
@@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+		dropped += qdisc_pkt_len(skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_drop(skb, sch);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 1b4aaec64a24..fee1b15506b2 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 		struct Qdisc *child = q->queues[i];
 		q->queues[i] = &noop_qdisc;
 		if (child != &noop_qdisc) {
-			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
 			qdisc_destroy(child);
 		}
 	}
@@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 				q->queues[i] = child;
 
 				if (old != &noop_qdisc) {
-					qdisc_tree_decrease_qlen(old,
-								 old->q.qlen);
+					qdisc_tree_reduce_backlog(old,
+								  old->q.qlen,
+								  old->qstats.backlog);
 					qdisc_destroy(old);
 				}
 				sch_tree_unlock(sch);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b5c52caf2e73..8d2d8d953432 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 static void qfq_purge_queue(struct qfq_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d5abcee454d8..8c0508c0e287 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
 	if (child) {
-		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
 	}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 0e74e55fda15..c69611640fa5 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
 
 	sch_tree_lock(sch);
 
-	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+	qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+				  q->qdisc->qstats.backlog);
 	qdisc_destroy(q->qdisc);
 	q->qdisc = child;
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3abab534eb5c..498f0a2cb47f 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -346,7 +346,7 @@ static int
 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	unsigned int hash;
+	unsigned int hash, dropped;
 	sfq_index x, qlen;
 	struct sfq_slot *slot;
 	int uninitialized_var(ret);
@@ -461,7 +461,7 @@ enqueue:
 		return NET_XMIT_SUCCESS;
 
 	qlen = slot->qlen;
-	sfq_drop(sch);
+	dropped = sfq_drop(sch);
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
 	 */
@@ -469,7 +469,7 @@ enqueue:
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, dropped);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch)
 	struct sfq_slot *slot;
 	struct sk_buff_head list;
 	int dropped = 0;
+	unsigned int drop_len = 0;
 
 	__skb_queue_head_init(&list);
 
@@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch)
 			if (x >= SFQ_MAX_FLOWS) {
 drop:
 				qdisc_qstats_backlog_dec(sch, skb);
+				drop_len += qdisc_pkt_len(skb);
 				kfree_skb(skb);
 				dropped++;
 				continue;
@@ -594,7 +596,7 @@ drop:
 		}
 	}
 	sch->q.qlen -= dropped;
-	qdisc_tree_decrease_qlen(sch, dropped);
+	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 }
 
 static void sfq_perturbation(unsigned long arg)
@@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	struct tc_sfq_qopt *ctl = nla_data(opt);
 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	struct red_parms *p = NULL;
 
 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
@@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > q->limit)
-		sfq_drop(sch);
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+		dropped += sfq_drop(sch);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	del_timer(&q->perturb_timer);
 	if (q->perturb_period) {
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 56a1aef3495f..c2fbde742f37 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *segs, *nskb;
 	netdev_features_t features = netif_skb_features(skb);
+	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 	int ret, nb;
 
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
@@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 		nskb = segs->next;
 		segs->next = NULL;
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
+		len += segs->len;
 		ret = qdisc_enqueue(segs, q->qdisc);
 		if (ret != NET_XMIT_SUCCESS) {
 			if (net_xmit_drop_count(ret))
@@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	}
 	sch->q.qlen += nb;
 	if (nb > 1)
-		qdisc_tree_decrease_qlen(sch, 1 - nb);
+		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 	consume_skb(skb);
 	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 }
@@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 
 	sch_tree_lock(sch);
 	if (child) {
-		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
 	}
-- 
cgit v1.2.3


From 871b642adebe300be2e50aa5f65a418510f636ec Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 26 Feb 2016 10:45:37 +0100
Subject: netdev: introduce ndo_set_rx_headroom

This method allows the controlling device (i.e. the bridge) to specify
additional headroom to be allocated for skb head on frame reception.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e52077ffe5ed..efe7cec111fa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1093,6 +1093,12 @@ struct tc_to_netdev {
  *	This function is used to get egress tunnel information for given skb.
  *	This is useful for retrieving outer tunnel header parameters while
  *	sampling packet.
+ * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
+ *	This function is used to specify the headroom that the skb must
+ *	consider when allocation skb during packet reception. Setting
+ *	appropriate rx headroom value allows avoiding skb head copy on
+ *	forward. Setting a negative value reset the rx headroom to the
+ *	default value.
  *
  */
 struct net_device_ops {
@@ -1278,6 +1284,8 @@ struct net_device_ops {
 							 bool proto_down);
 	int			(*ndo_fill_metadata_dst)(struct net_device *dev,
 						       struct sk_buff *skb);
+	void			(*ndo_set_rx_headroom)(struct net_device *dev,
+						       int needed_headroom);
 };
 
 /**
@@ -1315,6 +1323,8 @@ struct net_device_ops {
  * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
  * @IFF_TEAM: device is a team device
  * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
+ * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
+ *	entity (i.e. the master device for bridged veth)
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1343,6 +1353,7 @@ enum netdev_priv_flags {
 	IFF_L3MDEV_SLAVE		= 1<<23,
 	IFF_TEAM			= 1<<24,
 	IFF_RXFH_CONFIGURED		= 1<<25,
+	IFF_PHONY_HEADROOM		= 1<<26,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
 				    void *accel_priv);
 
+/* returns the headroom that the master device needs to take in account
+ * when forwarding to this dev
+ */
+static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
+{
+	return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
+}
+
+static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+	if (dev->netdev_ops->ndo_set_rx_headroom)
+		dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
+}
+
+/* set the device rx headroom to the dev's default */
+static inline void netdev_reset_rx_headroom(struct net_device *dev)
+{
+	netdev_set_rx_headroom(dev, -1);
+}
+
 /*
  * Net namespace inlines
  */
-- 
cgit v1.2.3


From 6843e7a2abe7cac10c19702ffec90018df6f040d Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:53:49 -0800
Subject: net: sched: consolidate offload decision in cls_u32

The offload decision was originally very basic and tied to whether the
dev implemented the appropriate ndo op hook. The next step is to allow
the user to more flexibly define whether any particular rule should be
offloaded or not. In order to have this logic in one function, lift
the current check into a helper routine tc_should_offload().

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 5 +++++
 net/sched/cls_u32.c   | 8 ++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2121df574262..e64d20b81047 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,4 +392,9 @@ struct tc_cls_u32_offload {
 	};
 };
 
+static inline bool tc_should_offload(struct net_device *dev)
+{
+	return dev->netdev_ops->ndo_setup_tc;
+}
+
 #endif
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d54bc942ea87..24e888b9b728 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -434,7 +434,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -451,7 +451,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +471,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -491,7 +491,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
-- 
cgit v1.2.3


From 2b6ab0d3aae6bf1e08118060b0c5565778cd6b21 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:54:13 -0800
Subject: net: cls_u32: move TC offload feature bit into cls_u32 offload logic

In the original series, drivers would get offload requests for cls_u32
rules even if the feature bit was disabled. This meant the driver had
to do a boilerplate check on the feature bit before adding/deleting
the rule.

This patch lifts the check into the core code and removes it from the
driver specific case.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ---
 include/net/pkt_cls.h                         | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cf4b729c92d7..b893ff8e65f5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8400,9 +8400,6 @@ int __ixgbe_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 
 	if (TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS) &&
 	    tc->type == TC_SETUP_CLSU32) {
-		if (!(dev->features & NETIF_F_HW_TC))
-			return -EINVAL;
-
 		switch (tc->cls_u32->command) {
 		case TC_CLSU32_NEW_KNODE:
 		case TC_CLSU32_REPLACE_KNODE:
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e64d20b81047..6096e96fb78b 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -394,6 +394,9 @@ struct tc_cls_u32_offload {
 
 static inline bool tc_should_offload(struct net_device *dev)
 {
+	if (!(dev->features & NETIF_F_HW_TC))
+		return false;
+
 	return dev->netdev_ops->ndo_setup_tc;
 }
 
-- 
cgit v1.2.3


From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:54:39 -0800
Subject: net: sched: cls_u32 add bit to specify software only rules

In the initial implementation the only way to stop a rule from being
inserted into the hardware table was via the device feature flag.
However, this doesn't work well when working on an end host system
where packets are expected to hit both the hardware and software
datapaths.

For example, we can imagine a rule that will match an IP address and
increment a field. If we install this rule in both hardware and
software we may increment the field twice. To date we have only
added support for the drop action so we have been able to ignore
these cases. But as we extend the action support we will hit this
example plus more such cases. Arguably these are not even corner
cases; in many working systems these cases will be common.

To avoid forcing the driver to always abort (i.e. the above example)
this patch adds a flag to add a rule in software only. A careful
user can use this flag to build software and hardware datapaths
that work together. One example we have found particularly useful
is to use hardware resources to set the skb->mark on the skb when
the match may be expensive to run in software but a mark lookup
in a hash table is cheap. The idea here is hardware can do in one
lookup what the u32 classifier may need to traverse multiple lists
and hash tables to compute. The flag is only passed down on inserts.
On deletion to avoid stale references in hardware we always try
to remove a rule if it exists.

The flags field is part of the classifier specific options. Although
it is tempting to lift this into the generic structure, doing this
proves difficult due to how the tc netlink attributes are implemented
along with how the dump/change routines are called. There is also
precedent for putting seemingly generic pieces in the specific
classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So
although not ideal I've left FLAGS in the u32 options as well, as it
simplifies the code greatly and user space has already learned how
to manage these bits a la the 'tc' tool.

Another thing: when trying to update a rule we require the flags to
be unchanged. This is to force user space, software u32 and
the hardware u32 to keep in sync. Thanks to Simon Horman for
catching this case.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 13 +++++++++++--
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_u32.c          | 37 +++++++++++++++++++++++++++----------
 3 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6096e96fb78b..bea14eee373e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,12 +392,21 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(struct net_device *dev)
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW 1
+
+static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 {
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
 
-	return dev->netdev_ops->ndo_setup_tc;
+	if (flags & TCA_CLS_FLAGS_SKIP_HW)
+		return false;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		return false;
+
+	return true;
 }
 
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 439873775d49..9874f5680926 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -172,6 +172,7 @@ enum {
 	TCA_U32_INDEV,
 	TCA_U32_PCNT,
 	TCA_U32_MARK,
+	TCA_U32_FLAGS,
 	__TCA_U32_MAX
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 24e888b9b728..563cdad76448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -59,6 +59,7 @@ struct tc_u_knode {
 #ifdef CONFIG_CLS_U32_PERF
 	struct tc_u32_pcnt __percpu *pf;
 #endif
+	u32			flags;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -434,7 +435,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -442,7 +443,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+				 struct tc_u_hnode *h,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -451,7 +454,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +474,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -482,7 +485,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+				 struct tc_u_knode *n,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -491,7 +496,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
@@ -679,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -786,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 #endif
 	new->fshift = n->fshift;
 	new->res = n->res;
+	new->flags = n->flags;
 	RCU_INIT_POINTER(new->ht_down, n->ht_down);
 
 	/* bump reference count as long as we hold pointer to structure */
@@ -825,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid;
+	u32 htid, flags = 0;
 	int err;
 #ifdef CONFIG_CLS_U32_PERF
 	size_t size;
@@ -838,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_U32_FLAGS])
+		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
 		struct tc_u_knode *new;
@@ -845,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_KEY(n->handle) == 0)
 			return -EINVAL;
 
+		if (n->flags != flags)
+			return -EINVAL;
+
 		new = u32_init_knode(tp, n);
 		if (!new)
 			return -ENOMEM;
@@ -861,7 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new);
+		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -889,7 +902,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht);
+		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
@@ -940,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	n->flags = flags;
 	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	n->tp = tp;
 
@@ -972,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n);
+		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
@@ -1077,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
 			goto nla_put_failure;
 
+		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+			goto nla_put_failure;
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((n->val || n->mask)) {
 			struct tc_u32_mark mark = {.val = n->val,
-- 
cgit v1.2.3


From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 26 Feb 2016 17:32:23 +0100
Subject: Introduce devlink infrastructure

Introduce devlink infrastructure for drivers to register and expose to
userspace via generic Netlink interface.

There are two basic objects defined:
devlink - one instance for every "parent device", for example switch ASIC
devlink port - one instance for every physical port of the device.

This initial portion implements basic get/dump of objects to userspace.
Also, port splitter and port type setting is implemented.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |   8 +
 include/net/devlink.h        | 140 ++++++++
 include/uapi/linux/devlink.h |  72 +++++
 net/Kconfig                  |   7 +
 net/core/Makefile            |   1 +
 net/core/devlink.c           | 738 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 966 insertions(+)
 create mode 100644 include/net/devlink.h
 create mode 100644 include/uapi/linux/devlink.h
 create mode 100644 net/core/devlink.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 12b764f4c93c..e45682745263 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3499,6 +3499,14 @@ F:	include/linux/device-mapper.h
 F:	include/linux/dm-*.h
 F:	include/uapi/linux/dm-*.h
 
+DEVLINK
+M:	Jiri Pirko <jiri@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/core/devlink.c
+F:	include/net/devlink.h
+F:	include/uapi/linux/devlink.h
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:	Support Opensource <support.opensource@diasemi.com>
 W:	http://www.dialog-semiconductor.com/products
diff --git a/include/net/devlink.h b/include/net/devlink.h
new file mode 100644
index 000000000000..c37d257891d6
--- /dev/null
+++ b/include/net/devlink.h
@@ -0,0 +1,140 @@
+/*
+ * include/net/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _NET_DEVLINK_H_
+#define _NET_DEVLINK_H_
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <uapi/linux/devlink.h>
+
+struct devlink_ops;
+
+struct devlink {
+	struct list_head list;
+	struct list_head port_list;
+	const struct devlink_ops *ops;
+	struct device *dev;
+	possible_net_t _net;
+	char priv[0] __aligned(NETDEV_ALIGN);
+};
+
+struct devlink_port {
+	struct list_head list;
+	struct devlink *devlink;
+	unsigned index;
+	bool registered;
+	enum devlink_port_type type;
+	enum devlink_port_type desired_type;
+	void *type_dev;
+	bool split;
+	u32 split_group;
+};
+
+struct devlink_ops {
+	size_t priv_size;
+	int (*port_type_set)(struct devlink_port *devlink_port,
+			     enum devlink_port_type port_type);
+	int (*port_split)(struct devlink *devlink, unsigned int port_index,
+			  unsigned int count);
+	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index);
+};
+
+static inline void *devlink_priv(struct devlink *devlink)
+{
+	BUG_ON(!devlink);
+	return &devlink->priv;
+}
+
+static inline struct devlink *priv_to_devlink(void *priv)
+{
+	BUG_ON(!priv);
+	return container_of(priv, struct devlink, priv);
+}
+
+struct ib_device;
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
+int devlink_register(struct devlink *devlink, struct device *dev);
+void devlink_unregister(struct devlink *devlink);
+void devlink_free(struct devlink *devlink);
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index);
+void devlink_port_unregister(struct devlink_port *devlink_port);
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev);
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev);
+void devlink_port_type_clear(struct devlink_port *devlink_port);
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group);
+
+#else
+
+static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
+					    size_t priv_size)
+{
+	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
+}
+
+static inline int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	return 0;
+}
+
+static inline void devlink_unregister(struct devlink *devlink)
+{
+}
+
+static inline void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+
+static inline int devlink_port_register(struct devlink *devlink,
+					struct devlink_port *devlink_port,
+					unsigned int port_index)
+{
+	return 0;
+}
+
+static inline void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+					     struct net_device *netdev)
+{
+}
+
+static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+					    struct ib_device *ibdev)
+{
+}
+
+static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_split_set(struct devlink_port *devlink_port,
+					  u32 split_group)
+{
+}
+
+#endif
+
+#endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
new file mode 100644
index 000000000000..c9fee5781eb1
--- /dev/null
+++ b/include/uapi/linux/devlink.h
@@ -0,0 +1,72 @@
+/*
+ * include/uapi/linux/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_DEVLINK_H_
+#define _UAPI_LINUX_DEVLINK_H_
+
+#define DEVLINK_GENL_NAME "devlink"
+#define DEVLINK_GENL_VERSION 0x1
+#define DEVLINK_GENL_MCGRP_CONFIG_NAME "config"
+
+enum devlink_command {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_CMD_UNSPEC,
+
+	DEVLINK_CMD_GET,		/* can dump */
+	DEVLINK_CMD_SET,
+	DEVLINK_CMD_NEW,
+	DEVLINK_CMD_DEL,
+
+	DEVLINK_CMD_PORT_GET,		/* can dump */
+	DEVLINK_CMD_PORT_SET,
+	DEVLINK_CMD_PORT_NEW,
+	DEVLINK_CMD_PORT_DEL,
+
+	DEVLINK_CMD_PORT_SPLIT,
+	DEVLINK_CMD_PORT_UNSPLIT,
+
+	/* add new commands above here */
+
+	__DEVLINK_CMD_MAX,
+	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
+};
+
+enum devlink_port_type {
+	DEVLINK_PORT_TYPE_NOTSET,
+	DEVLINK_PORT_TYPE_AUTO,
+	DEVLINK_PORT_TYPE_ETH,
+	DEVLINK_PORT_TYPE_IB,
+};
+
+enum devlink_attr {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_ATTR_UNSPEC,
+
+	/* bus name + dev name together are a handle for devlink entity */
+	DEVLINK_ATTR_BUS_NAME,			/* string */
+	DEVLINK_ATTR_DEV_NAME,			/* string */
+
+	DEVLINK_ATTR_PORT_INDEX,		/* u32 */
+	DEVLINK_ATTR_PORT_TYPE,			/* u16 */
+	DEVLINK_ATTR_PORT_DESIRED_TYPE,		/* u16 */
+	DEVLINK_ATTR_PORT_NETDEV_IFINDEX,	/* u32 */
+	DEVLINK_ATTR_PORT_NETDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_IBDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_SPLIT_COUNT,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_GROUP,		/* u32 */
+
+	/* add new attributes above here, update the policy in devlink.c */
+
+	__DEVLINK_ATTR_MAX,
+	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
+};
+
+#endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index b80efecfc1a0..6c9cfb0d7639 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -396,6 +396,13 @@ config DST_CACHE
 	bool "dst cache"
 	default n
 
+config NET_DEVLINK
+	tristate "Network physical/parent device Netlink interface"
+	help
+	  Network physical/parent device Netlink interface provides
+	  infrastructure to support access to physical chip-wide config and
+	  monitoring.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 7a8fb8aef992..014422e2561f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..590fa561cb7f
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+	return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+					      struct nlattr **attrs)
+{
+	struct devlink *devlink;
+	char *busname;
+	char *devname;
+
+	if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+
+	busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+	devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+		    strcmp(dev_name(devlink->dev), devname) == 0 &&
+		    net_eq(devlink_net(devlink), net))
+			return devlink;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+	return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+						      int port_index)
+{
+	struct devlink_port *devlink_port;
+
+	list_for_each_entry(devlink_port, &devlink->port_list, list) {
+		if (devlink_port->index == port_index)
+			return devlink_port;
+	}
+	return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+	return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+							struct nlattr **attrs)
+{
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+		struct devlink_port *devlink_port;
+
+		devlink_port = devlink_port_get_by_index(devlink, port_index);
+		if (!devlink_port)
+			return ERR_PTR(-ENODEV);
+		return devlink_port;
+	}
+	return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+						       struct genl_info *info)
+{
+	return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT	BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_info(info);
+	if (IS_ERR(devlink)) {
+		mutex_unlock(&devlink_mutex);
+		return PTR_ERR(devlink);
+	}
+	info->user_ptr[0] = devlink;
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+		struct devlink_port *devlink_port;
+
+		mutex_lock(&devlink_port_mutex);
+		devlink_port = devlink_port_get_from_info(devlink, info);
+		if (IS_ERR(devlink_port)) {
+			mutex_unlock(&devlink_port_mutex);
+			mutex_unlock(&devlink_mutex);
+			return PTR_ERR(devlink_port);
+		}
+		info->user_ptr[1] = devlink_port;
+	}
+	return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+				 struct sk_buff *skb, struct genl_info *info)
+{
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+		mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= DEVLINK_GENL_NAME,
+	.version	= DEVLINK_GENL_VERSION,
+	.maxattr	= DEVLINK_ATTR_MAX,
+	.netnsok	= true,
+	.pre_doit	= devlink_nl_pre_doit,
+	.post_doit	= devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+	DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+	[DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+			   enum devlink_command cmd, u32 portid,
+			   u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+				struct devlink_port *devlink_port,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+		goto nla_put_failure;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+		goto nla_put_failure;
+	if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+	    nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+			devlink_port->desired_type))
+		goto nla_put_failure;
+	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+		struct net_device *netdev = devlink_port->type_dev;
+
+		if (netdev &&
+		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+				 netdev->ifindex) ||
+		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+				    netdev->name)))
+			goto nla_put_failure;
+	}
+	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+		struct ib_device *ibdev = devlink_port->type_dev;
+
+		if (ibdev &&
+		    nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+				   ibdev->name))
+			goto nla_put_failure;
+	}
+	if (devlink_port->split &&
+	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+			devlink_port->split_group))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+				enum devlink_command cmd)
+{
+	struct devlink *devlink = devlink_port->devlink;
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink_port->registered)
+		return;
+
+	WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+			      info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+				     struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+				      NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port,
+				   DEVLINK_CMD_PORT_NEW,
+				   info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	struct devlink_port *devlink_port;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink_port_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		list_for_each_entry(devlink_port, &devlink->port_list, list) {
+			if (idx < start) {	/* sent in an earlier pass */
+				idx++;
+				continue;
+			}
+			err = devlink_nl_port_fill(msg, devlink, devlink_port,
+						   DEVLINK_CMD_PORT_NEW,
+						   NETLINK_CB(cb->skb).portid,
+						   cb->nlh->nlmsg_seq,
+						   NLM_F_MULTI);
+			if (err)
+				goto out;
+			idx++;
+		}
+	}
+out:
+	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;	/* where the next pass resumes */
+	return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+				 struct devlink_port *devlink_port,
+				 enum devlink_port_type port_type)
+
+{
+	int err;
+
+	if (devlink->ops && devlink->ops->port_type_set) {
+		if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+			return -EINVAL;
+		err = devlink->ops->port_type_set(devlink_port, port_type);
+		if (err)
+			return err;
+		devlink_port->desired_type = port_type;
+		devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	int err;
+
+	if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+		enum devlink_port_type port_type;
+
+		port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+		err = devlink_port_type_set(devlink, devlink_port, port_type);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+			      u32 port_index, u32 count)
+
+{
+	if (devlink->ops && devlink->ops->port_split)
+		return devlink->ops->port_split(devlink, port_index, count);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+	u32 count;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+	    !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+	return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+	if (devlink->ops && devlink->ops->port_unsplit)
+		return devlink->ops->port_unsplit(devlink, port_index);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+					    struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+	[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+	{
+		.cmd = DEVLINK_CMD_GET,
+		.doit = devlink_nl_cmd_get_doit,
+		.dumpit = devlink_nl_cmd_get_dumpit,
+		.policy = devlink_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_GET,
+		.doit = devlink_nl_cmd_port_get_doit,
+		.dumpit = devlink_nl_cmd_port_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SET,
+		.doit = devlink_nl_cmd_port_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SPLIT,
+		.doit = devlink_nl_cmd_port_split_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
+		.doit = devlink_nl_cmd_port_unsplit_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+/**
+ *	devlink_alloc - Allocate new devlink instance resources
+ *
+ *	@ops: ops
+ *	@priv_size: size of user private data
+ *
+ *	Allocate new devlink instance resources, including @priv_size bytes
+ *	of driver private data allocated together with the devlink structure.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+	struct devlink *devlink;
+
+	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+	if (!devlink)
+		return NULL;
+	devlink->ops = ops;
+	devlink_net_set(devlink, &init_net);
+	INIT_LIST_HEAD(&devlink->port_list);
+	return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ *	devlink_register - Register devlink instance
+ *	@devlink: devlink
+ *	@dev: parent device; its bus name and device name form the handle
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dev = dev;
+	list_add_tail(&devlink->list, &devlink_list);
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ *	devlink_unregister - Unregister devlink instance
+ *
+ *	@devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+	list_del(&devlink->list);
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ *	devlink_free - Free devlink instance resources
+ *
+ *	@devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ *	devlink_port_register - Register devlink port
+ *
+ *	@devlink: devlink
+ *	@devlink_port: devlink port
+ *	@port_index: driver-chosen index of the port, unique within @devlink
+ *
+ *	Register devlink port with provided port index. User can use
+ *	any indexing, even hw-related one. devlink_port structure
+ *	is convenient to be embedded inside user driver private structure.
+ *	Note that the caller should take care of zeroing the devlink_port
+ *	structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index)
+{
+	mutex_lock(&devlink_port_mutex);
+	if (devlink_port_index_exists(devlink, port_index)) {
+		mutex_unlock(&devlink_port_mutex);
+		return -EEXIST;
+	}
+	devlink_port->devlink = devlink;
+	devlink_port->index = port_index;
+	devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+	devlink_port->registered = true;
+	list_add_tail(&devlink_port->list, &devlink->port_list);
+	mutex_unlock(&devlink_port_mutex);
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ *	devlink_port_unregister - Unregister devlink port
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+	mutex_lock(&devlink_port_mutex);
+	list_del(&devlink_port->list);
+	mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+				    enum devlink_port_type type,
+				    void *type_dev)
+{
+	devlink_port->type = type;
+	devlink_port->type_dev = type_dev;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ *	devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ *	@devlink_port: devlink port
+ *	@netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev)
+{
+	__devlink_port_type_set(devlink_port,
+				DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ *	devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ *	@devlink_port: devlink port
+ *	@ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev)
+{
+	__devlink_port_type_set(devlink_port,
+				DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ *	devlink_port_type_clear - Clear port type
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+	__devlink_port_type_set(devlink_port,
+				DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ *	devlink_port_split_set - Set port is split
+ *
+ *	@devlink_port: devlink port
+ *	@split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group)
+{
+	devlink_port->split = true;
+	devlink_port->split_group = split_group;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+	return genl_register_family_with_ops_groups(&devlink_nl_family,
+						    devlink_nl_ops,
+						    devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+	genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
-- 
cgit v1.2.3


From 09d4d087cd4869859fcc5dfc692f0830550a1b48 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 26 Feb 2016 17:32:24 +0100
Subject: mlx4: Implement devlink interface

Implement newly introduced devlink interface. Add devlink port instances
for every port and set the port types accordingly.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
v2->v3:
-add dev param to devlink_register (api change)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/main.c              |  7 ++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  8 ++++-
 drivers/net/ethernet/mellanox/mlx4/intf.c      |  9 ++++++
 drivers/net/ethernet/mellanox/mlx4/main.c      | 44 +++++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |  2 ++
 include/linux/mlx4/driver.h                    |  3 ++
 6 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 1c7ab6cabbb8..a15a7b37d386 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -41,6 +41,7 @@
 #include <linux/if_vlan.h>
 #include <net/ipv6.h>
 #include <net/addrconf.h>
+#include <net/devlink.h>
 
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
@@ -2519,6 +2520,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	}
 
 	ibdev->ib_active = true;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i),
+					 &ibdev->ib_dev);
 
 	if (mlx4_is_mfunc(ibdev->dev))
 		init_pkeys(ibdev);
@@ -2643,7 +2647,10 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 {
 	struct mlx4_ib_dev *ibdev = ibdev_ptr;
 	int p;
+	int i;
 
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		devlink_port_type_clear(mlx4_get_devlink_port(dev, i));
 	ibdev->ib_active = false;
 	flush_workqueue(wq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 96d95cb36c52..e26b110e27da 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -40,6 +40,7 @@
 #include <net/ip.h>
 #include <net/busy_poll.h>
 #include <net/vxlan.h>
+#include <net/devlink.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/device.h>
@@ -2033,8 +2034,11 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
 
 	/* Unregister device - this will close the port if it was up */
-	if (priv->registered)
+	if (priv->registered) {
+		devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev,
+							      priv->port));
 		unregister_netdev(dev);
+	}
 
 	if (priv->allocated)
 		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
@@ -3051,6 +3055,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	}
 
 	priv->registered = 1;
+	devlink_port_type_eth_set(mlx4_get_devlink_port(mdev->dev, priv->port),
+				  dev);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 0472941af820..dec77d6f0ac9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/errno.h>
+#include <net/devlink.h>
 
 #include "mlx4.h"
 
@@ -249,3 +250,11 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int
 	return result;
 }
 EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev);
+
+struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+
+	return &info->devlink_port;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_devlink_port);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 2cc3c626c3fe..4f5cfe4989ce 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -42,6 +42,7 @@
 #include <linux/io-mapping.h>
 #include <linux/delay.h>
 #include <linux/kmod.h>
+#include <net/devlink.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
@@ -2881,8 +2882,13 @@ no_msi:
 
 static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 {
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
 	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
-	int err = 0;
+	int err;
+
+	err = devlink_port_register(devlink, &info->devlink_port, port);
+	if (err)
+		return err;
 
 	info->dev = dev;
 	info->port = port;
@@ -2907,6 +2913,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 	err = device_create_file(&dev->persist->pdev->dev, &info->port_attr);
 	if (err) {
 		mlx4_err(dev, "Failed to create file for port %d\n", port);
+		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
 	}
 
@@ -3680,21 +3687,23 @@ err_disable_pdev:
 
 static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 {
+	struct devlink *devlink;
 	struct mlx4_priv *priv;
 	struct mlx4_dev *dev;
 	int ret;
 
 	printk_once(KERN_INFO "%s", mlx4_version);
 
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (!priv)
+	devlink = devlink_alloc(NULL, sizeof(*priv));
+	if (!devlink)
 		return -ENOMEM;
+	priv = devlink_priv(devlink);
 
 	dev       = &priv->dev;
 	dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL);
 	if (!dev->persist) {
-		kfree(priv);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto err_devlink_free;
 	}
 	dev->persist->pdev = pdev;
 	dev->persist->dev = dev;
@@ -3703,14 +3712,23 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	mutex_init(&dev->persist->device_state_mutex);
 	mutex_init(&dev->persist->interface_state_mutex);
 
+	ret = devlink_register(devlink, &pdev->dev);
+	if (ret)
+		goto err_persist_free;
+
 	ret =  __mlx4_init_one(pdev, id->driver_data, priv);
-	if (ret) {
-		kfree(dev->persist);
-		kfree(priv);
-	} else {
-		pci_save_state(pdev);
-	}
+	if (ret)
+		goto err_devlink_unregister;
 
+	pci_save_state(pdev);
+	return 0;
+
+err_devlink_unregister:
+	devlink_unregister(devlink);
+err_persist_free:
+	kfree(dev->persist);
+err_devlink_free:
+	devlink_free(devlink);
 	return ret;
 }
 
@@ -3811,6 +3829,7 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
 	struct mlx4_dev  *dev  = persist->dev;
 	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct devlink *devlink = priv_to_devlink(priv);
 	int active_vfs = 0;
 
 	mutex_lock(&persist->interface_state_mutex);
@@ -3841,8 +3860,9 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
+	devlink_unregister(devlink);
 	kfree(dev->persist);
-	kfree(priv);
+	devlink_free(devlink);
 	pci_set_drvdata(pdev, NULL);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 7baef52db6b7..ef9683101ead 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -45,6 +45,7 @@
 #include <linux/workqueue.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
+#include <net/devlink.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/driver.h>
@@ -828,6 +829,7 @@ struct mlx4_port_info {
 	struct mlx4_roce_gid_table gid_table;
 	int			base_qpn;
 	struct cpu_rmap		*rmap;
+	struct devlink_port	devlink_port;
 };
 
 struct mlx4_sense {
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 2e8af001c5da..bd0e7075ea6d 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -33,6 +33,7 @@
 #ifndef MLX4_DRIVER_H
 #define MLX4_DRIVER_H
 
+#include <net/devlink.h>
 #include <linux/mlx4/device.h>
 
 struct mlx4_dev;
@@ -89,6 +90,8 @@ int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p);
 
 void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port);
 
+struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port);
+
 static inline u64 mlx4_mac_to_u64(u8 *addr)
 {
 	u64 mac = 0;
-- 
cgit v1.2.3


From fb2dabad69f099fb9c03a44276778911da50ba29 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 26 Feb 2016 13:16:00 -0500
Subject: net: dsa: support VLAN filtering switchdev attr

When a user explicitly requests VLAN filtering with something like:

    # echo 1 > /sys/class/net/<bridge>/bridge/vlan_filtering

Switchdev propagates a SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING port
attribute.

Add support for it in the DSA layer with a new port_vlan_filtering
function to let drivers toggle 802.1Q filtering on user demand.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  2 ++
 net/dsa/slave.c   | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3dd54867174a..26c0a3fa009a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,6 +305,8 @@ struct dsa_switch_driver {
 	/*
 	 * VLAN support
 	 */
+	int	(*port_vlan_filtering)(struct dsa_switch *ds, int port,
+				       bool vlan_filtering);
 	int	(*port_vlan_prepare)(struct dsa_switch *ds, int port,
 				     const struct switchdev_obj_port_vlan *vlan,
 				     struct switchdev_trans *trans);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index cde29239b60d..27bf03d11670 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -317,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)
 	return ret;
 }
 
+static int dsa_slave_vlan_filtering(struct net_device *dev,
+				    const struct switchdev_attr *attr,
+				    struct switchdev_trans *trans)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	if (ds->drv->port_vlan_filtering)
+		return ds->drv->port_vlan_filtering(ds, p->port,
+						    attr->u.vlan_filtering);
+
+	return 0;
+}
+
 static int dsa_slave_port_attr_set(struct net_device *dev,
 				   const struct switchdev_attr *attr,
 				   struct switchdev_trans *trans)
@@ -333,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 			ret = ds->drv->port_stp_update(ds, p->port,
 						       attr->u.stp_state);
 		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+		ret = dsa_slave_vlan_filtering(dev, attr, trans);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
-- 
cgit v1.2.3


From 7f0aec7a668419bdbff12de6e8016544f874e708 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:01 +0100
Subject: bridge: mcast: use names for the different multicast_router types

Using raw values makes it difficult to extend and also understand the
code, give them names and do explicit per-option manipulation in
br_multicast_set_port_router.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  7 +++++
 net/bridge/br_multicast.c      | 61 +++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 0890b217580d..e47f3bc7f323 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -177,6 +177,13 @@ enum {
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
 
+/* multicast router types */
+enum {
+	MDB_RTR_TYPE_DISABLED,
+	MDB_RTR_TYPE_TEMP_QUERY,
+	MDB_RTR_TYPE_PERM,
+};
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8b6e4249be1b..71c109b0943f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,7 +759,7 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != 1 ||
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
@@ -912,7 +912,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 
 void br_multicast_add_port(struct net_bridge_port *port)
 {
-	port->multicast_router = 1;
+	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 	setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
 		    (unsigned long)port);
@@ -959,7 +959,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
 #if IS_ENABLED(CONFIG_IPV6)
 	br_multicast_enable(&port->ip6_own_query);
 #endif
-	if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+	if (port->multicast_router == MDB_RTR_TYPE_PERM &&
+	    hlist_unhashed(&port->rlist))
 		br_multicast_add_router(br, port);
 
 out:
@@ -1227,13 +1228,13 @@ static void br_multicast_mark_router(struct net_bridge *br,
 	unsigned long now = jiffies;
 
 	if (!port) {
-		if (br->multicast_router == 1)
+		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
 			mod_timer(&br->multicast_router_timer,
 				  now + br->multicast_querier_interval);
 		return;
 	}
 
-	if (port->multicast_router != 1)
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1713,7 +1714,7 @@ void br_multicast_init(struct net_bridge *br)
 	br->hash_elasticity = 4;
 	br->hash_max = 512;
 
-	br->multicast_router = 1;
+	br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	br->multicast_querier = 0;
 	br->multicast_query_use_ifaddr = 0;
 	br->multicast_last_member_count = 2;
@@ -1823,11 +1824,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	spin_lock_bh(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 2:
+	case MDB_RTR_TYPE_DISABLED:
+	case MDB_RTR_TYPE_PERM:
 		del_timer(&br->multicast_router_timer);
 		/* fall through */
-	case 1:
+	case MDB_RTR_TYPE_TEMP_QUERY:
 		br->multicast_router = val;
 		err = 0;
 		break;
@@ -1838,6 +1839,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	return err;
 }
 
+static void __del_port_router(struct net_bridge_port *p)
+{
+	if (hlist_unhashed(&p->rlist))
+		return;
+	hlist_del_init_rcu(&p->rlist);
+	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+}
+
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
@@ -1846,29 +1855,25 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 	spin_lock(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 1:
-	case 2:
-		p->multicast_router = val;
-		err = 0;
-
-		if (val < 2 && !hlist_unhashed(&p->rlist)) {
-			hlist_del_init_rcu(&p->rlist);
-			br_rtr_notify(br->dev, p, RTM_DELMDB);
-		}
-
-		if (val == 1)
-			break;
-
+	case MDB_RTR_TYPE_DISABLED:
+		p->multicast_router = MDB_RTR_TYPE_DISABLED;
+		__del_port_router(p);
+		del_timer(&p->multicast_router_timer);
+		break;
+	case MDB_RTR_TYPE_TEMP_QUERY:
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+		__del_port_router(p);
+		break;
+	case MDB_RTR_TYPE_PERM:
+		p->multicast_router = MDB_RTR_TYPE_PERM;
 		del_timer(&p->multicast_router_timer);
-
-		if (val == 0)
-			break;
-
 		br_multicast_add_router(br, p);
 		break;
+	default:
+		goto unlock;
 	}
-
+	err = 0;
+unlock:
 	spin_unlock(&br->multicast_lock);
 
 	return err;
-- 
cgit v1.2.3


From a55d8246abcc910346771175b521ee2bce5a69b3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:03 +0100
Subject: bridge: mcast: add support for temporary port router

Add support for a temporary router port which doesn't depend only on the
incoming query. It can be refreshed if set to the same value, which is
a no-op for the rest.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e47f3bc7f323..74ee03a47e79 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ enum {
 	MDB_RTR_TYPE_DISABLED,
 	MDB_RTR_TYPE_TEMP_QUERY,
 	MDB_RTR_TYPE_PERM,
+	MDB_RTR_TYPE_TEMP
 };
 
 enum {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f1140cf5168d..a4c15df2b792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
 
 	hlist_del_init_rcu(&port->rlist);
 	br_rtr_notify(br->dev, port, RTM_DELMDB);
+	/* Don't allow timer refresh if the router expired */
+	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 out:
 	spin_unlock(&br->multicast_lock);
@@ -981,6 +985,9 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 	if (!hlist_unhashed(&port->rlist)) {
 		hlist_del_init_rcu(&port->rlist);
 		br_rtr_notify(br->dev, port, RTM_DELMDB);
+		/* Don't allow timer refresh if disabling */
+		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	}
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
@@ -1234,7 +1241,8 @@ static void br_multicast_mark_router(struct net_bridge *br,
 		return;
 	}
 
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1850,10 +1858,15 @@ static void __del_port_router(struct net_bridge_port *p)
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
+	unsigned long now = jiffies;
 	int err = -EINVAL;
 
 	spin_lock(&br->multicast_lock);
 	if (p->multicast_router == val) {
+		/* Refresh the temp router port timer */
+		if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+			mod_timer(&p->multicast_router_timer,
+				  now + br->multicast_querier_interval);
 		err = 0;
 		goto unlock;
 	}
@@ -1872,6 +1885,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 		del_timer(&p->multicast_router_timer);
 		br_multicast_add_router(br, p);
 		break;
+	case MDB_RTR_TYPE_TEMP:
+		p->multicast_router = MDB_RTR_TYPE_TEMP;
+		br_multicast_mark_router(br, p);
+		break;
 	default:
 		goto unlock;
 	}
-- 
cgit v1.2.3


From 59f78f9f6c2e80dcf0f520be85b660f856217b79 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:04 +0100
Subject: bridge: mcast: add support for more router port information dumping

Allow for more multicast router port information to be dumped such as
timer and type attributes. For that purpose we need to extend the
MDBA_ROUTER_PORT attribute similar to how it was done for the mdb entries
recently. The new format is thus:
[MDBA_ROUTER_PORT] = { <- nested attribute
    u32 ifindex <- router port ifindex for user-space compatibility
    [MDBA_ROUTER_PATTR attributes]
}
This way it remains compatible with older users (they'll simply retrieve
the u32 in the beginning) and new users can parse the remaining
attributes. It would also allow to add future extensions to the router
port without breaking compatibility.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 14 +++++++++++++-
 net/bridge/br_mdb.c            | 16 ++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 74ee03a47e79..0536eefff9bf 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -144,7 +144,10 @@ struct bridge_vlan_info {
  *     }
  * }
  * [MDBA_ROUTER] = {
- *    [MDBA_ROUTER_PORT]
+ *    [MDBA_ROUTER_PORT] = {
+ *        u32 ifindex
+ *        [MDBA_ROUTER_PATTR attributes]
+ *    }
  * }
  */
 enum {
@@ -192,6 +195,15 @@ enum {
 };
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
+/* router port attributes */
+enum {
+	MDBA_ROUTER_PATTR_UNSPEC,
+	MDBA_ROUTER_PATTR_TIMER,
+	MDBA_ROUTER_PATTR_TYPE,
+	__MDBA_ROUTER_PATTR_MAX
+};
+#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1)
+
 struct br_port_msg {
 	__u8  family;
 	__u32 ifindex;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 73786e2fe065..253bc77eda3b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p;
-	struct nlattr *nest;
+	struct nlattr *nest, *port_nest;
 
 	if (!br->multicast_router || hlist_empty(&br->router_list))
 		return 0;
@@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 		return -EMSGSIZE;
 
 	hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
-		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+		if (!p)
+			continue;
+		port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+		if (!port_nest)
 			goto fail;
+		if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+		    nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+				br_timer_value(&p->multicast_router_timer)) ||
+		    nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+			       p->multicast_router)) {
+			nla_nest_cancel(skb, port_nest);
+			goto fail;
+		}
+		nla_nest_end(skb, port_nest);
 	}
 
 	nla_nest_end(skb, nest);
-- 
cgit v1.2.3


From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:54 -0500
Subject: introduce IFE action

This action allows for a sending side to encapsulate arbitrary metadata
which is decapsulated by the receiving end.
The sender runs in encoding mode and the receiver in decode mode.
Both sender and receiver must specify the same ethertype.
At some point we hope to have a registered ethertype and we'll
then provide a default so the user doesn't have to specify it.
For now we enforce the user specify it.

Lets show example usage where we encode icmp from a sender towards
a receiver with an skbmark of 17; both sender and receiver use
ethertype of 0xdead to interop.

YYYY: Lets start with Receiver-side policy config:
xxx: add an ingress qdisc
sudo tc qdisc add dev $ETH ingress

xxx: any packets with ethertype 0xdead will be subjected to ife decoding
xxx: we then restart the classification so we can match on icmp at prio 3
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \
u32 match u32 0 0 flowid 1:1 \
action ife decode reclassify

xxx: on restarting the classification from above if it was an icmp
xxx: packet, then match it here and continue to the next rule at prio 4
xxx: which will match based on skb mark of 17
sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action continue

xxx: match on skbmark of 0x11 (decimal 17) and accept
sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \
handle 0x11 fw flowid 1:1 \
action ok

xxx: Lets show the decoding policy
sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead
xxx:
filter pref 2 u32
filter pref 2 u32 fh 800: ht divisor 1
filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1  (rule hit 0 success 0)
  match 00000000/00000000 at 0 (success 0 )
        action order 1: ife decode action reclassify
         index 1 ref 1 bind 1 installed 14 sec used 14 sec
         type: 0x0
         Metadata: allow mark allow hash allow prio allow qmap
        Action statistics:
        Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0
xxx:
Observe that the above lists all the metadata it can decode. Typically these
submodules will already be compiled into a monolithic kernel or
loaded as modules

YYYY: Lets show the sender side now ..

xxx: Add an egress qdisc on the sender netdev
sudo tc qdisc add dev $ETH root handle 1: prio
xxx:
xxx: Match all icmp packets to 192.168.122.237/24, then
xxx: tag the packet with skb mark of decimal 17, then
xxx: Encode it with:
xxx:	ethertype 0xdead
xxx:	add skb->mark to whitelist of metadatum to send
xxx:	rewrite target dst MAC address to 02:15:15:15:15:15
xxx:
sudo $TC filter add dev $ETH parent 1: protocol ip prio 10  u32 \
match ip dst 192.168.122.237/24 \
match ip protocol 1 0xff \
flowid 1:2 \
action skbedit mark 17 \
action ife encode \
type 0xDEAD \
allow mark \
dst 02:15:15:15:15:15

xxx: Lets show the encoding policy
sudo tc -s filter ls dev $ETH parent 1: protocol ip
xxx:
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2  (rule hit 0 success 0)
  match c0a87aed/ffffffff at 16 (success 0 )
  match 00010000/00ff0000 at 8 (success 0 )

	action order 1:  skbedit mark 17
	 index 6 ref 1 bind 1
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

	action order 2: ife encode action pipe
	 index 3 ref 1 bind 1
	 dst MAC: 02:15:15:15:15:15 type: 0xDEAD
 	 Metadata: allow mark
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0
xxx:

test by sending ping from sender to destination

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h        |  61 +++
 include/uapi/linux/tc_act/tc_ife.h |  38 ++
 net/sched/Kconfig                  |  12 +
 net/sched/Makefile                 |   1 +
 net/sched/act_ife.c                | 870 +++++++++++++++++++++++++++++++++++++
 5 files changed, 982 insertions(+)
 create mode 100644 include/net/tc_act/tc_ife.h
 create mode 100644 include/uapi/linux/tc_act/tc_ife.h
 create mode 100644 net/sched/act_ife.c

(limited to 'include')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
new file mode 100644
index 000000000000..dc9a09aefb33
--- /dev/null
+++ b/include/net/tc_act/tc_ife.h
@@ -0,0 +1,61 @@
+#ifndef __NET_TC_IFE_H
+#define __NET_TC_IFE_H
+
+#include <net/act_api.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+
+#define IFE_METAHDRLEN 2
+struct tcf_ife_info {
+	struct tcf_common common;
+	u8 eth_dst[ETH_ALEN];
+	u8 eth_src[ETH_ALEN];
+	u16 eth_type;
+	u16 flags;
+	/* list of metaids allowed */
+	struct list_head metalist;
+};
+#define to_ife(a) \
+	container_of(a->priv, struct tcf_ife_info, common)
+
+struct tcf_meta_info {
+	const struct tcf_meta_ops *ops;
+	void *metaval;
+	u16 metaid;
+	struct list_head metalist;
+};
+
+struct tcf_meta_ops {
+	u16 metaid; /*Maintainer provided ID */
+	u16 metatype; /*netlink attribute type (look at net/netlink.h) */
+	const char *name;
+	const char *synopsis;
+	struct list_head list;
+	int	(*check_presence)(struct sk_buff *, struct tcf_meta_info *);
+	int	(*encode)(struct sk_buff *, void *, struct tcf_meta_info *);
+	int	(*decode)(struct sk_buff *, void *, u16 len);
+	int	(*get)(struct sk_buff *skb, struct tcf_meta_info *mi);
+	int	(*alloc)(struct tcf_meta_info *, void *);
+	void	(*release)(struct tcf_meta_info *);
+	int	(*validate)(void *val, int len);
+	struct module	*owner;
+};
+
+#define MODULE_ALIAS_IFE_META(metan)   MODULE_ALIAS("ifemeta" __stringify_1(metan))
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval);
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval);
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi);
+int ife_validate_meta_u32(void *val, int len);
+int ife_validate_meta_u16(void *val, int len);
+void ife_release_meta_gen(struct tcf_meta_info *mi);
+int register_ife_op(struct tcf_meta_ops *mops);
+int unregister_ife_op(struct tcf_meta_ops *mops);
+
+#endif /* __NET_TC_IFE_H */
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
new file mode 100644
index 000000000000..d648ff66586f
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -0,0 +1,38 @@
+#ifndef __UAPI_TC_IFE_H
+#define __UAPI_TC_IFE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_IFE 25
+/* Flag bits for now just encoding/decoding; mutually exclusive */
+#define IFE_ENCODE 1
+#define IFE_DECODE 0
+
+struct tc_ife {
+	tc_gen;
+	__u16 flags;
+};
+
+/*XXX: We need to encode the total number of bytes consumed */
+enum {
+	TCA_IFE_UNSPEC,
+	TCA_IFE_PARMS,
+	TCA_IFE_TM,
+	TCA_IFE_DMAC,
+	TCA_IFE_SMAC,
+	TCA_IFE_TYPE,
+	TCA_IFE_METALST,
+	__TCA_IFE_MAX
+};
+#define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
+
+#define IFE_META_SKBMARK 1
+#define IFE_META_HASHID 2
+#define	IFE_META_PRIO 3
+#define	IFE_META_QMAP 4
+/*Can be overridden at runtime by module option*/
+#define	__IFE_META_MAX 5
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 82830824fb1f..4d48ef57e564 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -739,6 +739,18 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_IFE
+        tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow for sourcing and terminating metadata
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ife.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 690c1689e090..3d176671b0e1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000000..6e7ec257790d
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,870 @@
+/*
+ * net/sched/act_ife.c	Inter-FE action based on ForCES WG InterFE LFB
+ *
+ *		Refer to:
+ *		draft-ietf-forces-interfelfb-03
+ *		and
+ *		netdev01 paper:
+ *		"Distributing Linux Traffic Control Classifier-Action
+ *		Subsystem"
+ *		Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+
+#define IFE_TAB_MASK 15
+
+static int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+	[TCA_IFE_DMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_SMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+/* Caller takes care of presenting data in network order
+*/
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	u32 *tlv = (u32 *)(skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *)tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | totlen;
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+	if (metaval || mi->metaval)
+		return 8; /* T+L+V == 2+2+4 */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+	u32 edata = metaval;
+
+	if (mi->metaval)
+		edata = *(u32 *)mi->metaval;
+	else if (metaval)
+		edata = metaval;
+
+	if (!edata) /* will not encode */
+		return 0;
+
+	edata = htonl(edata);
+	return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+{
+	/* Duplicate the u32 that metaval points at, not the pointer's bytes */
+	mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+{
+	/* Duplicate the u16 that metaval points at, not the pointer's bytes */
+	mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+	kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+	if (len == 4)
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+	/* length will include padding */
+	if (len == NLA_ALIGN(2))
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+	struct tcf_meta_ops *o;
+
+	read_lock(&ife_mod_lock);
+	list_for_each_entry(o, &ifeoplist, list) {
+		if (o->metaid == metaid) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			read_unlock(&ife_mod_lock);
+			return o;
+		}
+	}
+	read_unlock(&ife_mod_lock);
+
+	return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+
+	if (!mops->metaid || !mops->metatype || !mops->name ||
+	    !mops->check_presence || !mops->encode || !mops->decode ||
+	    !mops->get || !mops->alloc)
+		return -EINVAL;
+
+	write_lock(&ife_mod_lock);
+	/* Refuse a second registration of the same metaid or the same name */
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid ||
+		    (strcmp(mops->name, m->name) == 0)) {
+			write_unlock(&ife_mod_lock);
+			return -EEXIST;
+		}
+	}
+	/* release is optional: fall back to the generic helper */
+	if (!mops->release)
+		mops->release = ife_release_meta_gen;
+
+	list_add_tail(&mops->list, &ifeoplist);
+	write_unlock(&ife_mod_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+	int err = -ENOENT;
+
+	write_lock(&ife_mod_lock);
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid) {
+			list_del(&mops->list);
+			err = 0;
+			break;
+		}
+	}
+	write_unlock(&ife_mod_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+	int ret = 0;
+	/* XXX: unfortunately cant use nla_policy at this point
+	* because a length of 0 is valid in the case of
+	* "allow". "use" semantics do enforce for proper
+	* length and i couldve use nla_policy but it makes it hard
+	* to use it just for that..
+	*/
+	if (ops->validate)
+		return ops->validate(val, len);
+
+	if (ops->metatype == NLA_U32)
+		ret = ife_validate_meta_u32(val, len);
+	else if (ops->metatype == NLA_U16)
+		ret = ife_validate_meta_u16(val, len);
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
+				void *val, int len)
+{
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops) {
+		ret = -ENOENT;
+#ifdef CONFIG_MODULES
+		spin_unlock_bh(&ife->tcf_lock);
+		rtnl_unlock();
+		request_module("ifemeta%u", metaid);
+		rtnl_lock();
+		spin_lock_bh(&ife->tcf_lock);
+		ops = find_ife_oplist(metaid);
+#endif
+	}
+
+	if (ops) {
+		ret = 0;
+		if (len)
+			ret = ife_validate_metatype(ops, val, len);
+
+		module_put(ops->owner);
+	}
+
+	return ret;
+}
+
+/* Attach the ops registered for metaid to ife->metalist, keeping the module
+ * reference that find_ife_oplist() took. Called under ife->tcf_lock.
+ */
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+			int len)
+{
+	struct tcf_meta_info *mi = NULL;
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops)
+		return -ENOENT;
+
+	mi = kzalloc(sizeof(*mi), GFP_ATOMIC); /* spinlock held: must not sleep */
+	if (!mi) {
+		/* put back what find_ife_oplist took */
+		module_put(ops->owner);
+		return -ENOMEM;
+	}
+
+	mi->metaid = metaid;
+	mi->ops = ops;
+	if (len > 0) {
+		ret = ops->alloc(mi, metaval);
+		if (ret != 0) {
+			kfree(mi);
+			module_put(ops->owner);
+			return ret;
+		}
+	}
+
+	list_add_tail(&mi->metalist, &ife->metalist);
+
+	return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife)
+{
+	struct tcf_meta_ops *o;
+	int rc = 0;
+	int installed = 0;
+
+	list_for_each_entry(o, &ifeoplist, list) {
+		rc = add_metainfo(ife, o->metaid, NULL, 0);
+		if (rc == 0)
+			installed += 1;
+	}
+
+	if (installed)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e;
+	struct nlattr *nest;
+	unsigned char *b = skb_tail_pointer(skb);
+	int total_encoded = 0;
+
+	/*can only happen on decode */
+	if (list_empty(&ife->metalist))
+		return 0;
+
+	nest = nla_nest_start(skb, TCA_IFE_METALST);
+	if (!nest)
+		goto out_nlmsg_trim;
+
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (!e->ops->get(skb, e))
+			total_encoded += 1;
+	}
+
+	if (!total_encoded)
+		goto out_nlmsg_trim;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_nlmsg_trim:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+	struct tcf_meta_info *e, *n;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		module_put(e->ops->owner);
+		list_del(&e->metalist);
+		if (e->metaval) {
+			if (e->ops->release)
+				e->ops->release(e);
+			else
+				kfree(e->metaval);
+		}
+		kfree(e);
+	}
+}
+
+static void tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	spin_lock_bh(&ife->tcf_lock);
+	_tcf_ife_cleanup(a, bind);
+	spin_unlock_bh(&ife->tcf_lock);
+}
+
+/* under ife->tcf_lock */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+{
+	int len = 0;
+	int rc = 0;
+	int i = 0;
+	void *val;
+
+	for (i = 1; i < max_metacnt; i++) {
+		if (tb[i]) {
+			val = nla_data(tb[i]);
+			len = nla_len(tb[i]);
+
+			rc = load_metaops_and_vet(ife, i, val, len);
+			if (rc != 0)
+				return rc;
+
+			rc = add_metainfo(ife, i, val, len);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a,
+			int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+	struct nlattr *tb[TCA_IFE_MAX + 1];
+	struct nlattr *tb2[IFE_META_MAX + 1];
+	struct tcf_ife_info *ife;
+	struct tc_ife *parm;
+	u16 ife_type = 0;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	int ret = 0;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_IFE_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_IFE_PARMS]);
+
+	if (parm->flags & IFE_ENCODE) {
+		/* Until we get issued the ethertype, we cant have
+		 * a default..
+		**/
+		if (!tb[TCA_IFE_TYPE]) {
+			pr_info("You MUST pass etherype for encoding\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+				      bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		if (bind)	/* dont override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	ife = to_ife(a);
+	ife->flags = parm->flags;
+
+	if (parm->flags & IFE_ENCODE) {
+		ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+		if (tb[TCA_IFE_DMAC])
+			daddr = nla_data(tb[TCA_IFE_DMAC]);
+		if (tb[TCA_IFE_SMAC])
+			saddr = nla_data(tb[TCA_IFE_SMAC]);
+	}
+
+	spin_lock_bh(&ife->tcf_lock);
+	ife->tcf_action = parm->action;
+
+	if (parm->flags & IFE_ENCODE) {
+		if (daddr)
+			ether_addr_copy(ife->eth_dst, daddr);
+		else
+			eth_zero_addr(ife->eth_dst);
+
+		if (saddr)
+			ether_addr_copy(ife->eth_src, saddr);
+		else
+			eth_zero_addr(ife->eth_src);
+
+		ife->eth_type = ife_type;
+	}
+
+	if (ret == ACT_P_CREATED)
+		INIT_LIST_HEAD(&ife->metalist);
+
+	if (tb[TCA_IFE_METALST]) {
+		err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+				       NULL);
+		if (err) {
+metadata_parse_err:
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+
+		err = populate_metalist(ife, tb2);
+		if (err)
+			goto metadata_parse_err;
+
+	} else {
+		/* if no passed metadata allow list or passed allow-all
+		 * then here we process by adding as many supported metadatum
+		 * as we can. You better have at least one else we are
+		 * going to bail out
+		 */
+		err = use_all_metadata(ife);
+		if (err) {
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+	}
+
+	spin_unlock_bh(&ife->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, a);
+
+	return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+			int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ife_info *ife = a->priv;
+	struct tc_ife opt = {
+		.index = ife->tcf_index,
+		.refcnt = ife->tcf_refcnt - ref,
+		.bindcnt = ife->tcf_bindcnt - bind,
+		.action = ife->tcf_action,
+		.flags = ife->flags,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+	if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+		goto nla_put_failure;
+
+	if (!is_zero_ether_addr(ife->eth_dst)) {
+		if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst))
+			goto nla_put_failure;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src)) {
+		if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src))
+			goto nla_put_failure;
+	}
+
+	if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type))
+		goto nla_put_failure;
+
+	if (dump_metalist(skb, ife)) {
+		/*ignore failure to dump metalist */
+		pr_info("Failed to dump metalist\n");
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+		       u16 metaid, u16 mlen, void *mdata)
+{
+	struct tcf_meta_info *e;
+
+	/* XXX: use hash to speed up; returns the decode op's result */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (metaid == e->metaid) {
+			if (e->ops) {
+				/* We check for decode presence already */
+				return e->ops->decode(skb, mdata, mlen);
+			}
+		}
+	}
+	/* no entry for this metaid: report it so the caller can count it */
+	return -ENOENT;
+}
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
+	u16 ifehdrln = ntohs(ifehdr->metalen);
+	struct meta_tlvhdr *tlv;
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	spin_unlock(&ife->tcf_lock);
+
+	/* metalen includes its own IFE_METAHDRLEN; less than that is bogus */
+	if (unlikely(ifehdrln < IFE_METAHDRLEN ||
+		     !pskb_may_pull(skb, ifehdrln))) {
+drop:
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	/* pskb_may_pull() can move the head: take the TLV pointer only now */
+	tlv = (struct meta_tlvhdr *)((struct ifeheadr *)skb->data)->tlv_data;
+	skb_set_mac_header(skb, ifehdrln);
+	__skb_pull(skb, ifehdrln);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	ifehdrln -= IFE_METAHDRLEN;
+
+	while (ifehdrln > 0) {
+		u16 mtype = ntohs(tlv->type);
+		u16 mlen = ntohs(tlv->len);
+
+		/* a bad length would loop forever or run past the metadata */
+		if (unlikely(mlen < sizeof(*tlv) || mlen > ifehdrln))
+			goto drop;
+		if (find_decode_metaid(skb, ife, mtype, mlen - 4, tlv + 1)) {
+			/* abuse overlimits to count when we receive metadata
+			 * but dont have an ops for it
+			 */
+			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
+					    mtype, mlen);
+			ife->tcf_qstats.overlimits++;
+		}
+		ifehdrln -= mlen;
+		tlv = (struct meta_tlvhdr *)((u8 *)tlv + mlen);
+	}
+
+	skb_reset_network_header(skb);
+	return action;
+}
+
+/* XXX: check if we can compute this at install time instead of in the
+ * current send data path
+ */
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e, *n;
+	int tot_run_sz = 0, run_sz = 0;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		if (e->ops->check_presence) {
+			run_sz = e->ops->check_presence(skb, e);
+			tot_run_sz += run_sz;
+		}
+	}
+
+	return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ethhdr *oethh;	/* outer ether header */
+	struct ethhdr *iethh;	/* inner eth header */
+	struct tcf_meta_info *e;
+	/* Resulting layout:
+	 * OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	 * where ORIGDATA = original ethernet header ...
+	 */
+	u16 metalen = ife_get_sz(skb, ife);
+	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+	unsigned int skboff = skb->dev->hard_header_len;
+	u32 at = G_TC_AT(skb->tc_verd);
+	int new_len = skb->len + hdrm;
+	bool exceed_mtu = false;
+	int err;
+
+	if (at & AT_EGRESS) {
+		if (new_len > skb->dev->mtu)
+			exceed_mtu = true;
+	}
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+
+	if (!metalen) {		/* no metadata to send */
+		/* abuse overlimits to count when we allow packet
+		 * with no metadata
+		 */
+		ife->tcf_qstats.overlimits++;
+		spin_unlock(&ife->tcf_lock);
+		return action;
+	}
+	/* could be a bad policy setup or MTU config,
+	 * so let's be conservative and drop */
+	if ((action == TC_ACT_SHOT) || exceed_mtu) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	err = skb_cow_head(skb, hdrm);
+	if (unlikely(err)) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	/* skb_cow_head() may move the head; only now take the inner header */
+	iethh = eth_hdr(skb);
+	if (!(at & AT_EGRESS))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	__skb_push(skb, hdrm);
+	memcpy(skb->data, iethh, skb->mac_len);
+	skb_reset_mac_header(skb);
+	oethh = eth_hdr(skb);
+
+	/* total metadata length, including the length field itself */
+	metalen += IFE_METAHDRLEN;
+	metalen = htons(metalen);
+	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+	skboff += IFE_METAHDRLEN;
+
+	/* XXX: we don't have a clever way of telling encode to
+	 * not repeat some of the computations that are done by
+	 * ops->check_presence...
+	 */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (e->ops->encode) {
+			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+					     e);
+		}
+		if (err < 0) {
+			/* too corrupt to keep around if overwritten */
+			ife->tcf_qstats.drops++;
+			spin_unlock(&ife->tcf_lock);
+			return TC_ACT_SHOT;
+		}
+		skboff += err;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src))
+		ether_addr_copy(oethh->h_source, ife->eth_src);
+	else
+		ether_addr_copy(oethh->h_source, iethh->h_source);
+	if (!is_zero_ether_addr(ife->eth_dst))
+		ether_addr_copy(oethh->h_dest, ife->eth_dst);
+	else
+		ether_addr_copy(oethh->h_dest, iethh->h_dest);
+	oethh->h_proto = htons(ife->eth_type);
+
+	if (!(at & AT_EGRESS))
+		skb_pull(skb, skb->dev->hard_header_len);
+
+	spin_unlock(&ife->tcf_lock);
+
+	return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	if (ife->flags & IFE_ENCODE)
+		return tcf_ife_encode(skb, a, res);
+
+	if (!(ife->flags & IFE_ENCODE))
+		return tcf_ife_decode(skb, a, res);
+
+	pr_info_ratelimited("unknown failure(policy neither de/encode\n");
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	ife->tcf_qstats.drops++;
+	spin_unlock(&ife->tcf_lock);
+
+	return TC_ACT_SHOT;
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+	.kind = "ife",
+	.type = TCA_ACT_IFE,
+	.owner = THIS_MODULE,
+	.act = tcf_ife_act,
+	.dump = tcf_ife_dump,
+	.cleanup = tcf_ife_cleanup,
+	.init = tcf_ife_init,
+	.walk = tcf_ife_walker,
+	.lookup = tcf_ife_search,
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK);
+}
+
+static void __net_exit ife_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ife_net_ops = {
+	.init = ife_init_net,
+	.exit = ife_exit_net,
+	.id   = &ife_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+	return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 822c868532cae2cc1c51f4f18ab61c194d98aaf6 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 27 Feb 2016 00:32:15 -0800
Subject: net: ipv4: Convert IP network timestamps to be y2038 safe

ICMP timestamp messages and IP source route options require
timestamps to be in milliseconds modulo 24 hours from
midnight UT format.

Add inet_current_timestamp() function to support this. The function
returns the required timestamp in network byte order.

Timestamp calculation is also changed to call ktime_get_real_ts64()
which uses struct timespec64. struct timespec64 is y2038 safe.
Previously it called getnstimeofday() which uses struct timespec.
struct timespec is not y2038 safe.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: James Morris <jmorris@namei.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      |  2 ++
 net/ipv4/af_inet.c    | 26 ++++++++++++++++++++++++++
 net/ipv4/icmp.c       |  5 +----
 net/ipv4/ip_options.c | 14 ++++++--------
 4 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index cbb134b2f0e4..fad74d323bd6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -240,6 +240,8 @@ static inline int inet_is_local_reserved_port(struct net *net, int port)
 }
 #endif
 
+__be32 inet_current_timestamp(void);
+
 /* From inetpeer.c */
 extern int inet_peer_threshold;
 extern int inet_peer_minttl;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 209d1ed28954..0cc923f83e10 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1380,6 +1380,32 @@ out:
 	return pp;
 }
 
+#define SECONDS_PER_DAY	86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight UT as a 32-bit big-endian value.
+ */
+__be32 inet_current_timestamp(void)
+{
+	u32 secs;
+	u32 msecs;
+	struct timespec64 ts;
+
+	ktime_get_real_ts64(&ts);
+
+	/* Get secs since midnight. */
+	(void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
+	/* Convert to msecs. */
+	msecs = secs * MSEC_PER_SEC;
+	/* Convert nsec to msec. */
+	msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;
+
+	/* Up to 86399999 does not fit in 16 bits: use htonl(), not htons(). */
+	return htonl(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
 int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 {
 	if (sk->sk_family == AF_INET)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36e26977c908..6333489771ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb)
  */
 static bool icmp_timestamp(struct sk_buff *skb)
 {
-	struct timespec tv;
 	struct icmp_bxm icmp_param;
 	/*
 	 *	Too short.
@@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
 	/*
 	 *	Fill in the current time as ms since midnight UT:
 	 */
-	getnstimeofday(&tv);
-	icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
-					 tv.tv_nsec / NSEC_PER_MSEC);
+	icmp_param.data.times[1] = inet_current_timestamp();
 	icmp_param.data.times[2] = icmp_param.data.times[1];
 	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
 		BUG();
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index bd246792360b..4d158ff1def1 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
 		if (opt->ts_needaddr)
 			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
 		if (opt->ts_needtime) {
-			struct timespec tv;
 			__be32 midtime;
-			getnstimeofday(&tv);
-			midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+
+			midtime = inet_current_timestamp();
 			memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
 		}
 		return;
@@ -415,11 +414,10 @@ int ip_options_compile(struct net *net,
 					break;
 				}
 				if (timeptr) {
-					struct timespec tv;
-					u32  midtime;
-					getnstimeofday(&tv);
-					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
-					put_unaligned_be32(midtime, timeptr);
+					__be32 midtime;
+
+					midtime = inet_current_timestamp();
+					memcpy(timeptr, &midtime, 4);
 					opt->is_changed = 1;
 				}
 			} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
-- 
cgit v1.2.3


From 6b6c07bdcdc97ccac2596063bfc32a5faddfe884 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 2 Mar 2016 00:13:39 +0200
Subject: net/mlx5: Make command timeout way shorter

The command timeout is terribly long, a whole two hours. Make it 60s so if
things do go wrong, the user gets feedback in relatively short time, so
they can take corrective actions and/or investigate using tools and such.

Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a815da92d4eb..3388a43b78f6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -54,7 +54,7 @@ enum {
 	/* one minute for the sake of bringup. Generally, commands must always
 	 * complete and we may need to increase this timeout value
 	 */
-	MLX5_CMD_TIMEOUT_MSEC	= 7200 * 1000,
+	MLX5_CMD_TIMEOUT_MSEC	= 60 * 1000,
 	MLX5_CMD_WQ_MAX_NAME	= 32,
 };
 
-- 
cgit v1.2.3


From 0ba422410bbf7081c3c7d7b2dcc10e9eb5cb46f7 Mon Sep 17 00:00:00 2001
From: Moshe Lazer <moshel@mellanox.com>
Date: Wed, 2 Mar 2016 00:13:40 +0200
Subject: net/mlx5: Fix global UAR mapping

Avoid double mapping of I/O-mapped memory. A device page may be
mapped either as non-cached (NC) or as write-combining (WC).
The code before this fix tries to map it as both WC and NC,
contrary to what is stated in Intel's software developer manual.

Here we remove the global WC mapping of all UARS
"dev->priv.bf_mapping", since UAR mapping should be decided
per UAR (e.g we want different mappings for EQs, CQs vs QPs).

Caller will now have to choose whether to map via
write-combining API or not.

mlx5e SQs will choose write-combining in order to perform
BlueFlame writes.

Fixes: 88a85f99e51f ('TX latency optimization to save DMA reads')
Signed-off-by: Moshe Lazer <moshel@mellanox.com>
Reviewed-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 16 +++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 28 +---------------------
 drivers/net/ethernet/mellanox/mlx5/core/uar.c     | 29 ++++++++++++++---------
 include/linux/mlx5/driver.h                       |  5 ++--
 6 files changed, 36 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index b289660568cf..9c0e80e64b43 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -388,6 +388,7 @@ struct mlx5e_sq_dma {
 
 enum {
 	MLX5E_SQ_STATE_WAKE_TXQ_ENABLE,
+	MLX5E_SQ_STATE_BF_ENABLE,
 };
 
 struct mlx5e_sq {
@@ -416,7 +417,6 @@ struct mlx5e_sq {
 	struct mlx5_wq_cyc         wq;
 	u32                        dma_fifo_mask;
 	void __iomem              *uar_map;
-	void __iomem              *uar_bf_map;
 	struct netdev_queue       *txq;
 	u32                        sqn;
 	u16                        bf_buf_size;
@@ -664,16 +664,12 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 	 * doorbell
 	 */
 	wmb();
-
-	if (bf_sz) {
-		__iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
-
-		/* flush the write-combining mapped buffer */
-		wmb();
-
-	} else {
+	if (bf_sz)
+		__iowrite64_copy(sq->uar_map + ofst, &wqe->ctrl, bf_sz);
+	else
 		mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
-	}
+	/* flush the write-combining mapped buffer */
+	wmb();
 
 	sq->bf_offset ^= sq->bf_buf_size;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b20a35bd1d4f..5063c0e0f8ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -548,7 +548,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	int txq_ix;
 	int err;
 
-	err = mlx5_alloc_map_uar(mdev, &sq->uar);
+	err = mlx5_alloc_map_uar(mdev, &sq->uar, true);
 	if (err)
 		return err;
 
@@ -560,8 +560,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 		goto err_unmap_free_uar;
 
 	sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
-	sq->uar_map     = sq->uar.map;
-	sq->uar_bf_map  = sq->uar.bf_map;
+	if (sq->uar.bf_map) {
+		set_bit(MLX5E_SQ_STATE_BF_ENABLE, &sq->state);
+		sq->uar_map = sq->uar.bf_map;
+	} else {
+		sq->uar_map = sq->uar.map;
+	}
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 	sq->max_inline  = param->max_inline;
 
@@ -2418,7 +2422,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 
 	priv = netdev_priv(netdev);
 
-	err = mlx5_alloc_map_uar(mdev, &priv->cq_uar);
+	err = mlx5_alloc_map_uar(mdev, &priv->cq_uar, false);
 	if (err) {
 		mlx5_core_err(mdev, "alloc_map uar failed, %d\n", err);
 		goto err_free_netdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index a05c070cbc2f..c34f4f3e9537 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -303,7 +303,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
 		int bf_sz = 0;
 
-		if (bf && sq->uar_bf_map)
+		if (bf && test_bit(MLX5E_SQ_STATE_BF_ENABLE, &sq->state))
 			bf_sz = wi->num_wqebbs << 3;
 
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 1545a944c309..8b7133de498e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -767,22 +767,6 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 	return -ENOTSUPP;
 }
 
-static int map_bf_area(struct mlx5_core_dev *dev)
-{
-	resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
-	resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
-
-	dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
-
-	return dev->priv.bf_mapping ? 0 : -ENOMEM;
-}
-
-static void unmap_bf_area(struct mlx5_core_dev *dev)
-{
-	if (dev->priv.bf_mapping)
-		io_mapping_free(dev->priv.bf_mapping);
-}
-
 static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 {
 	struct mlx5_device_context *dev_ctx;
@@ -1103,14 +1087,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 		goto err_stop_eqs;
 	}
 
-	if (map_bf_area(dev))
-		dev_err(&pdev->dev, "Failed to map blue flame area\n");
-
 	err = mlx5_irq_set_affinity_hints(dev);
-	if (err) {
+	if (err)
 		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-		goto err_unmap_bf_area;
-	}
 
 	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
 
@@ -1169,10 +1148,6 @@ err_fs:
 	mlx5_cleanup_qp_table(dev);
 	mlx5_cleanup_cq_table(dev);
 	mlx5_irq_clear_affinity_hints(dev);
-
-err_unmap_bf_area:
-	unmap_bf_area(dev);
-
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -1242,7 +1217,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	mlx5_cleanup_qp_table(dev);
 	mlx5_cleanup_cq_table(dev);
 	mlx5_irq_clear_affinity_hints(dev);
-	unmap_bf_area(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_free_uuars(dev, &priv->uuari);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index eb05c845ece9..8ba080e441a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -226,7 +226,8 @@ int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
 	return 0;
 }
 
-int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
+int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
+		       bool map_wc)
 {
 	phys_addr_t pfn;
 	phys_addr_t uar_bar_start;
@@ -240,20 +241,26 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 
 	uar_bar_start = pci_resource_start(mdev->pdev, 0);
 	pfn           = (uar_bar_start >> PAGE_SHIFT) + uar->index;
-	uar->map      = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
-	if (!uar->map) {
-		mlx5_core_warn(mdev, "ioremap() failed, %d\n", err);
-		err = -ENOMEM;
-		goto err_free_uar;
-	}
 
-	if (mdev->priv.bf_mapping)
-		uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
-						uar->index << PAGE_SHIFT);
+	if (map_wc) {
+		uar->bf_map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->bf_map) {
+			mlx5_core_warn(mdev, "ioremap_wc() failed\n");
+			uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+			if (!uar->map)
+				goto err_free_uar;
+		}
+	} else {
+		uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->map)
+			goto err_free_uar;
+	}
 
 	return 0;
 
 err_free_uar:
+	mlx5_core_warn(mdev, "ioremap() failed\n");
+	err = -ENOMEM;
 	mlx5_cmd_free_uar(mdev, uar->index);
 
 	return err;
@@ -262,8 +269,8 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar);
 
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 {
-	io_mapping_unmap(uar->bf_map);
 	iounmap(uar->map);
+	iounmap(uar->bf_map);
 	mlx5_cmd_free_uar(mdev, uar->index);
 }
 EXPORT_SYMBOL(mlx5_unmap_free_uar);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3388a43b78f6..bb1a880a5bc5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -460,8 +460,6 @@ struct mlx5_priv {
 	struct mlx5_uuar_info	uuari;
 	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
-	struct io_mapping	*bf_mapping;
-
 	/* pages stuff */
 	struct workqueue_struct *pg_wq;
 	struct rb_root		page_root;
@@ -719,7 +717,8 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
 int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
-int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
+int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
+		       bool map_wc);
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 64d4e3431e686dc37ce388ba531c4c4e866fb141 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 27 Feb 2016 20:19:54 -0800
Subject: net: remove skb_sender_cpu_clear()

After commit 52bd2d62ce67 ("net: better skb->sender_cpu and skb->napi_id cohabitation")
skb_sender_cpu_clear() becomes empty and can be removed.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 4 ----
 net/bridge/br_forward.c         | 1 -
 net/core/filter.c               | 2 --
 net/core/skbuff.c               | 1 -
 net/ipv4/ip_forward.c           | 1 -
 net/ipv6/ip6_output.c           | 1 -
 net/netfilter/ipvs/ip_vs_xmit.c | 6 ------
 net/netfilter/nf_dup_netdev.c   | 1 -
 net/sched/act_mirred.c          | 1 -
 9 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eab4f8fbed58..797cefb888fb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1161,10 +1161,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 	to->l4_hash = from->l4_hash;
 };
 
-static inline void skb_sender_cpu_clear(struct sk_buff *skb)
-{
-}
-
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
 {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index fcdb86dd5a23..f47759f05b6d 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -44,7 +44,6 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb
 
 	skb_push(skb, ETH_HLEN);
 	br_drop_fake_rtable(skb);
-	skb_sender_cpu_clear(skb);
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 	    (skb->protocol == htons(ETH_P_8021Q) ||
diff --git a/net/core/filter.c b/net/core/filter.c
index a3aba15a8025..5e2a3b5e5196 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1597,7 +1597,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 	}
 
 	skb2->dev = dev;
-	skb_sender_cpu_clear(skb2);
 	return dev_queue_xmit(skb2);
 }
 
@@ -1650,7 +1649,6 @@ int skb_do_redirect(struct sk_buff *skb)
 	}
 
 	skb->dev = dev;
-	skb_sender_cpu_clear(skb);
 	return dev_queue_xmit(skb);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 488566b09c6d..7af7ec635d90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4302,7 +4302,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	skb->skb_iif = 0;
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
-	skb_sender_cpu_clear(skb);
 	secpath_reset(skb);
 	nf_reset(skb);
 	nf_reset_trace(skb);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index da0d7ce85844..af18f1e4889e 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	skb_sender_cpu_clear(skb);
 	return dst_output(net, sk, skb);
 }
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a163102f1803..9428345d3a07 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -332,7 +332,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 				     struct sk_buff *skb)
 {
-	skb_sender_cpu_clear(skb);
 	return dst_output(net, sk, skb);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index a3f5cd9b3c4c..dc196a0f501d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -531,8 +531,6 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	if (ret == NF_ACCEPT) {
 		nf_reset(skb);
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 	}
 	return ret;
 }
@@ -573,8 +571,6 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 
 	if (!local) {
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
@@ -595,8 +591,6 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 	if (!local) {
 		ip_vs_drop_early_demux_sk(skb);
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 8414ee1a0319..7ec69723940f 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -31,7 +31,6 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
 		skb_push(skb, skb->mac_len);
 
 	skb->dev = dev;
-	skb_sender_cpu_clear(skb);
 	dev_queue_xmit(skb);
 }
 EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6b284d991e0b..e8a760cf7775 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -182,7 +182,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
-	skb_sender_cpu_clear(skb2);
 	err = dev_queue_xmit(skb2);
 
 	if (err) {
-- 
cgit v1.2.3


From 4ac801b77e6f06e6b12c069fd29216a4102065fb Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Sun, 28 Feb 2016 12:26:52 +0200
Subject: qed: Semantic refactoring of interrupt code

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_int.c  | 155 ++++++++++++++++-------------
 drivers/net/ethernet/qlogic/qed/qed_int.h  |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c |  15 +--
 include/linux/qed/qed_if.h                 |   6 ++
 5 files changed, 111 insertions(+), 77 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index acfe7be49a58..d9a5175ebd04 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1011,13 +1011,17 @@ static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 {
 	u32 *resc_start = p_hwfn->hw_info.resc_start;
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
+	struct qed_sb_cnt_info sb_cnt_info;
 	int num_funcs, i;
 
 	num_funcs = MAX_NUM_PFS_BB;
 
+	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
+
 	resc_num[QED_SB] = min_t(u32,
 				 (MAX_SB_PER_PATH_BB / num_funcs),
-				 qed_int_get_num_sbs(p_hwfn, NULL));
+				 sb_cnt_info.sb_cnt);
 	resc_num[QED_L2_QUEUE] = MAX_NUM_L2_QUEUES_BB / num_funcs;
 	resc_num[QED_VPORT] = MAX_NUM_VPORTS_BB / num_funcs;
 	resc_num[QED_RSS_ENG] = ETH_RSS_ENGINE_NUM_BB / num_funcs;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c
index fa73daa94655..7fd1be61de5c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -343,17 +343,17 @@ void qed_int_sp_dpc(unsigned long hwfn_cookie)
 
 static void qed_int_sb_attn_free(struct qed_hwfn *p_hwfn)
 {
-	struct qed_dev *cdev   = p_hwfn->cdev;
-	struct qed_sb_attn_info *p_sb   = p_hwfn->p_sb_attn;
-
-	if (p_sb) {
-		if (p_sb->sb_attn)
-			dma_free_coherent(&cdev->pdev->dev,
-					  SB_ATTN_ALIGNED_SIZE(p_hwfn),
-					  p_sb->sb_attn,
-					  p_sb->sb_phys);
-		kfree(p_sb);
-	}
+	struct qed_sb_attn_info *p_sb = p_hwfn->p_sb_attn;
+
+	if (!p_sb)
+		return;
+
+	if (p_sb->sb_attn)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  SB_ATTN_ALIGNED_SIZE(p_hwfn),
+				  p_sb->sb_attn,
+				  p_sb->sb_phys);
+	kfree(p_sb);
 }
 
 static void qed_int_sb_attn_setup(struct qed_hwfn *p_hwfn,
@@ -433,6 +433,7 @@ void qed_init_cau_sb_entry(struct qed_hwfn *p_hwfn,
 			   u16 vf_number,
 			   u8 vf_valid)
 {
+	struct qed_dev *cdev = p_hwfn->cdev;
 	u32 cau_state;
 
 	memset(p_sb_entry, 0, sizeof(*p_sb_entry));
@@ -451,14 +452,12 @@ void qed_init_cau_sb_entry(struct qed_hwfn *p_hwfn,
 
 	cau_state = CAU_HC_DISABLE_STATE;
 
-	if (p_hwfn->cdev->int_coalescing_mode == QED_COAL_MODE_ENABLE) {
+	if (cdev->int_coalescing_mode == QED_COAL_MODE_ENABLE) {
 		cau_state = CAU_HC_ENABLE_STATE;
-		if (!p_hwfn->cdev->rx_coalesce_usecs)
-			p_hwfn->cdev->rx_coalesce_usecs =
-				QED_CAU_DEF_RX_USECS;
-		if (!p_hwfn->cdev->tx_coalesce_usecs)
-			p_hwfn->cdev->tx_coalesce_usecs =
-				QED_CAU_DEF_TX_USECS;
+		if (!cdev->rx_coalesce_usecs)
+			cdev->rx_coalesce_usecs = QED_CAU_DEF_RX_USECS;
+		if (!cdev->tx_coalesce_usecs)
+			cdev->tx_coalesce_usecs = QED_CAU_DEF_TX_USECS;
 	}
 
 	SET_FIELD(p_sb_entry->data, CAU_SB_ENTRY_STATE0, cau_state);
@@ -638,8 +637,10 @@ int qed_int_sb_release(struct qed_hwfn *p_hwfn,
 	sb_info->sb_ack = 0;
 	memset(sb_info->sb_virt, 0, sizeof(*sb_info->sb_virt));
 
-	p_hwfn->sbs_info[sb_id] = NULL;
-	p_hwfn->num_sbs--;
+	if (p_hwfn->sbs_info[sb_id] != NULL) {
+		p_hwfn->sbs_info[sb_id] = NULL;
+		p_hwfn->num_sbs--;
+	}
 
 	return 0;
 }
@@ -648,14 +649,15 @@ static void qed_int_sp_sb_free(struct qed_hwfn *p_hwfn)
 {
 	struct qed_sb_sp_info *p_sb = p_hwfn->p_sp_sb;
 
-	if (p_sb) {
-		if (p_sb->sb_info.sb_virt)
-			dma_free_coherent(&p_hwfn->cdev->pdev->dev,
-					  SB_ALIGNED_SIZE(p_hwfn),
-					  p_sb->sb_info.sb_virt,
-					  p_sb->sb_info.sb_phys);
-		kfree(p_sb);
-	}
+	if (!p_sb)
+		return;
+
+	if (p_sb->sb_info.sb_virt)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  SB_ALIGNED_SIZE(p_hwfn),
+				  p_sb->sb_info.sb_virt,
+				  p_sb->sb_info.sb_phys);
+	kfree(p_sb);
 }
 
 static int qed_int_sp_sb_alloc(struct qed_hwfn *p_hwfn,
@@ -718,36 +720,36 @@ int qed_int_register_cb(struct qed_hwfn *p_hwfn,
 			__le16 **p_fw_cons)
 {
 	struct qed_sb_sp_info *p_sp_sb = p_hwfn->p_sp_sb;
-	int qed_status = -ENOMEM;
+	int rc = -ENOMEM;
 	u8 pi;
 
 	/* Look for a free index */
 	for (pi = 0; pi < ARRAY_SIZE(p_sp_sb->pi_info_arr); pi++) {
-		if (!p_sp_sb->pi_info_arr[pi].comp_cb) {
-			p_sp_sb->pi_info_arr[pi].comp_cb = comp_cb;
-			p_sp_sb->pi_info_arr[pi].cookie = cookie;
-			*sb_idx = pi;
-			*p_fw_cons = &p_sp_sb->sb_info.sb_virt->pi_array[pi];
-			qed_status = 0;
-			break;
-		}
+		if (p_sp_sb->pi_info_arr[pi].comp_cb)
+			continue;
+
+		p_sp_sb->pi_info_arr[pi].comp_cb = comp_cb;
+		p_sp_sb->pi_info_arr[pi].cookie = cookie;
+		*sb_idx = pi;
+		*p_fw_cons = &p_sp_sb->sb_info.sb_virt->pi_array[pi];
+		rc = 0;
+		break;
 	}
 
-	return qed_status;
+	return rc;
 }
 
 int qed_int_unregister_cb(struct qed_hwfn *p_hwfn, u8 pi)
 {
 	struct qed_sb_sp_info *p_sp_sb = p_hwfn->p_sp_sb;
-	int qed_status = -ENOMEM;
 
-	if (p_sp_sb->pi_info_arr[pi].comp_cb) {
-		p_sp_sb->pi_info_arr[pi].comp_cb = NULL;
-		p_sp_sb->pi_info_arr[pi].cookie = NULL;
-		qed_status = 0;
-	}
+	if (p_sp_sb->pi_info_arr[pi].comp_cb == NULL)
+		return -ENOMEM;
 
-	return qed_status;
+	p_sp_sb->pi_info_arr[pi].comp_cb = NULL;
+	p_sp_sb->pi_info_arr[pi].cookie = NULL;
+
+	return 0;
 }
 
 u16 qed_int_get_sp_sb_id(struct qed_hwfn *p_hwfn)
@@ -937,6 +939,39 @@ void qed_int_igu_init_pure_rt(struct qed_hwfn *p_hwfn,
 	}
 }
 
+static u32 qed_int_igu_read_cam_block(struct qed_hwfn	*p_hwfn,
+				      struct qed_ptt	*p_ptt,
+				      u16		sb_id)
+{
+	u32 val = qed_rd(p_hwfn, p_ptt,
+			 IGU_REG_MAPPING_MEMORY +
+			 sizeof(u32) * sb_id);
+	struct qed_igu_block *p_block;
+
+	p_block = &p_hwfn->hw_info.p_igu_info->igu_map.igu_blocks[sb_id];
+
+	/* stop scanning when hit first invalid PF entry */
+	if (!GET_FIELD(val, IGU_MAPPING_LINE_VALID) &&
+	    GET_FIELD(val, IGU_MAPPING_LINE_PF_VALID))
+		goto out;
+
+	/* Fill the block information */
+	p_block->status		= QED_IGU_STATUS_VALID;
+	p_block->function_id	= GET_FIELD(val,
+					    IGU_MAPPING_LINE_FUNCTION_NUMBER);
+	p_block->is_pf		= GET_FIELD(val, IGU_MAPPING_LINE_PF_VALID);
+	p_block->vector_number	= GET_FIELD(val,
+					    IGU_MAPPING_LINE_VECTOR_NUMBER);
+
+	DP_VERBOSE(p_hwfn, NETIF_MSG_INTR,
+		   "IGU_BLOCK: [SB 0x%04x, Value in CAM 0x%08x] func_id = %d is_pf = %d vector_num = 0x%x\n",
+		   sb_id, val, p_block->function_id,
+		   p_block->is_pf, p_block->vector_number);
+
+out:
+	return val;
+}
+
 int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt)
 {
@@ -963,26 +998,13 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 	     sb_id++) {
 		blk = &p_igu_info->igu_map.igu_blocks[sb_id];
 
-		val = qed_rd(p_hwfn, p_ptt,
-			     IGU_REG_MAPPING_MEMORY + sizeof(u32) * sb_id);
+		val	= qed_int_igu_read_cam_block(p_hwfn, p_ptt, sb_id);
 
 		/* stop scanning when hit first invalid PF entry */
 		if (!GET_FIELD(val, IGU_MAPPING_LINE_VALID) &&
 		    GET_FIELD(val, IGU_MAPPING_LINE_PF_VALID))
 			break;
 
-		blk->status = QED_IGU_STATUS_VALID;
-		blk->function_id = GET_FIELD(val,
-					     IGU_MAPPING_LINE_FUNCTION_NUMBER);
-		blk->is_pf = GET_FIELD(val, IGU_MAPPING_LINE_PF_VALID);
-		blk->vector_number = GET_FIELD(val,
-					       IGU_MAPPING_LINE_VECTOR_NUMBER);
-
-		DP_VERBOSE(p_hwfn, NETIF_MSG_INTR,
-			   "IGU_BLOCK[sb_id]:%x:func_id = %d is_pf = %d vector_num = 0x%x\n",
-			   val, blk->function_id, blk->is_pf,
-			   blk->vector_number);
-
 		if (blk->is_pf) {
 			if (blk->function_id == p_hwfn->rel_pf_id) {
 				blk->status |= QED_IGU_STATUS_PF;
@@ -1121,18 +1143,17 @@ void qed_int_setup(struct qed_hwfn *p_hwfn,
 	qed_int_sp_dpc_setup(p_hwfn);
 }
 
-int qed_int_get_num_sbs(struct qed_hwfn *p_hwfn,
-			int *p_iov_blks)
+void qed_int_get_num_sbs(struct qed_hwfn	*p_hwfn,
+			 struct qed_sb_cnt_info *p_sb_cnt_info)
 {
 	struct qed_igu_info *info = p_hwfn->hw_info.p_igu_info;
 
-	if (!info)
-		return 0;
-
-	if (p_iov_blks)
-		*p_iov_blks = info->free_blks;
+	if (!info || !p_sb_cnt_info)
+		return;
 
-	return info->igu_sb_cnt;
+	p_sb_cnt_info->sb_cnt		= info->igu_sb_cnt;
+	p_sb_cnt_info->sb_iov_cnt	= info->igu_sb_cnt_iov;
+	p_sb_cnt_info->sb_free_blk	= info->free_blks;
 }
 
 void qed_int_disable_post_isr_release(struct qed_dev *cdev)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h
index 51e0b09a7f47..c57f2e680770 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.h
@@ -161,12 +161,12 @@ void qed_int_sp_dpc(unsigned long hwfn_cookie);
  *        blocks configured for this funciton in the igu.
  *
  * @param p_hwfn
- * @param p_iov_blks - configured free blks for vfs
+ * @param p_sb_cnt_info
  *
  * @return int - number of status blocks configured
  */
-int qed_int_get_num_sbs(struct qed_hwfn *p_hwfn,
-			int *p_iov_blks);
+void qed_int_get_num_sbs(struct qed_hwfn	*p_hwfn,
+			 struct qed_sb_cnt_info *p_sb_cnt_info);
 
 /**
  * @brief qed_int_disable_post_isr_release - performs the cleanup post ISR
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 25d6e91335ea..caa689e6575c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -634,15 +634,18 @@ static int qed_get_int_fp(struct qed_dev *cdev, struct qed_int_info *info)
 static int qed_slowpath_setup_int(struct qed_dev *cdev,
 				  enum qed_int_mode int_mode)
 {
-	int rc, i;
-	u8 num_vectors = 0;
-
+	struct qed_sb_cnt_info sb_cnt_info;
+	int rc;
+	int i;
 	memset(&cdev->int_params, 0, sizeof(struct qed_int_params));
 
 	cdev->int_params.in.int_mode = int_mode;
-	for_each_hwfn(cdev, i)
-		num_vectors +=  qed_int_get_num_sbs(&cdev->hwfns[i], NULL) + 1;
-	cdev->int_params.in.num_vectors = num_vectors;
+	for_each_hwfn(cdev, i) {
+		memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+		qed_int_get_num_sbs(&cdev->hwfns[i], &sb_cnt_info);
+		cdev->int_params.in.num_vectors += sb_cnt_info.sb_cnt;
+		cdev->int_params.in.num_vectors++; /* slowpath */
+	}
 
 	/* We want a minimum of one slowpath and one fastpath vector per hwfn */
 	cdev->int_params.in.min_msix_cnt = cdev->num_hwfns * 2;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 3d43c1d4ecef..1f7599c77cd4 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -446,6 +446,12 @@ struct qed_eth_stats {
 #define RX_PI           0
 #define TX_PI(tc)       (RX_PI + 1 + tc)
 
+struct qed_sb_cnt_info {
+	int	sb_cnt;
+	int	sb_iov_cnt;
+	int	sb_free_blk;
+};
+
 static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info)
 {
 	u32 prod = 0;
-- 
cgit v1.2.3


From a67dd266adf42a24df31380e9da78390bb4d65ef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:35 +0100
Subject: netfilter: xtables: prepare for on-demand hook register

This change prepares for upcoming on-demand xtables hook registration.

We change the prototypes of the register/unregister functions.
A followup patch will then add nf_hook_register/unregister calls
to the iptables one.

Once a hook is registered packets will be picked up, so all assignments
of the form

net->ipv4.iptable_$table = new_table

have to be moved to ip(6)t_register_table, else we can see NULL
net->ipv4.iptable_$table later.

This patch doesn't change functionality; without this the actual change
simply gets too big.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h  |  9 +++++----
 include/linux/netfilter_ipv4/ip_tables.h  |  9 +++++----
 include/linux/netfilter_ipv6/ip6_tables.h |  9 +++++----
 net/ipv4/netfilter/arp_tables.c           | 25 ++++++++++++++-----------
 net/ipv4/netfilter/arptable_filter.c      | 11 ++++++-----
 net/ipv4/netfilter/ip_tables.c            | 21 ++++++++++-----------
 net/ipv4/netfilter/iptable_filter.c       |  9 +++++----
 net/ipv4/netfilter/iptable_mangle.c       |  9 +++++----
 net/ipv4/netfilter/iptable_nat.c          |  8 +++++---
 net/ipv4/netfilter/iptable_raw.c          |  9 +++++----
 net/ipv4/netfilter/iptable_security.c     |  9 +++++----
 net/ipv6/netfilter/ip6_tables.c           | 23 ++++++++++++-----------
 net/ipv6/netfilter/ip6table_filter.c      |  9 +++++----
 net/ipv6/netfilter/ip6table_mangle.c      |  9 +++++----
 net/ipv6/netfilter/ip6table_nat.c         |  8 +++++---
 net/ipv6/netfilter/ip6table_raw.c         |  9 +++++----
 net/ipv6/netfilter/ip6table_security.c    |  9 +++++----
 17 files changed, 107 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 6f074db2f23d..029b95e8924e 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -48,10 +48,11 @@ struct arpt_error {
 }
 
 extern void *arpt_alloc_initial_table(const struct xt_table *);
-extern struct xt_table *arpt_register_table(struct net *net,
-					    const struct xt_table *table,
-					    const struct arpt_replace *repl);
-extern void arpt_unregister_table(struct xt_table *table);
+int arpt_register_table(struct net *net, const struct xt_table *table,
+			const struct arpt_replace *repl,
+			const struct nf_hook_ops *ops, struct xt_table **res);
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index aa598f942c01..7bfc5893ec31 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -24,10 +24,11 @@
 
 extern void ipt_init(void) __init;
 
-extern struct xt_table *ipt_register_table(struct net *net,
-					   const struct xt_table *table,
-					   const struct ipt_replace *repl);
-extern void ipt_unregister_table(struct net *net, struct xt_table *table);
+int ipt_register_table(struct net *net, const struct xt_table *table,
+		       const struct ipt_replace *repl,
+		       const struct nf_hook_ops *ops, struct xt_table **res);
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+			  const struct nf_hook_ops *ops);
 
 /* Standard entry. */
 struct ipt_standard {
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 0f76e5c674f9..b21c392d6012 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -25,10 +25,11 @@
 extern void ip6t_init(void) __init;
 
 extern void *ip6t_alloc_initial_table(const struct xt_table *);
-extern struct xt_table *ip6t_register_table(struct net *net,
-					    const struct xt_table *table,
-					    const struct ip6t_replace *repl);
-extern void ip6t_unregister_table(struct net *net, struct xt_table *table);
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+			const struct ip6t_replace *repl,
+			const struct nf_hook_ops *ops, struct xt_table **res);
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops);
 extern unsigned int ip6t_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b488cac9c5ca..00eed0852dfc 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1780,9 +1780,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	return ret;
 }
 
-struct xt_table *arpt_register_table(struct net *net,
-				     const struct xt_table *table,
-				     const struct arpt_replace *repl)
+int arpt_register_table(struct net *net,
+			const struct xt_table *table,
+			const struct arpt_replace *repl,
+			const struct nf_hook_ops *ops,
+			struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -1791,10 +1793,8 @@ struct xt_table *arpt_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -1809,15 +1809,18 @@ struct xt_table *arpt_register_table(struct net *net,
 		ret = PTR_ERR(new_table);
 		goto out_free;
 	}
-	return new_table;
+
+	WRITE_ONCE(*res, new_table);
+
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void arpt_unregister_table(struct xt_table *table)
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 1897ee160920..4c0241692576 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -38,19 +38,20 @@ static struct nf_hook_ops *arpfilter_ops __read_mostly;
 static int __net_init arptable_filter_net_init(struct net *net)
 {
 	struct arpt_replace *repl;
-	
+	int err;
+
 	repl = arpt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.arptable_filter =
-		arpt_register_table(net, &packet_filter, repl);
+	err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
+				  &net->ipv4.arptable_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+	return err;
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
-	arpt_unregister_table(net->ipv4.arptable_filter);
+	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b99affad6ba1..1eb4fe5b4702 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2062,9 +2062,9 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-struct xt_table *ipt_register_table(struct net *net,
-				    const struct xt_table *table,
-				    const struct ipt_replace *repl)
+int ipt_register_table(struct net *net, const struct xt_table *table,
+		       const struct ipt_replace *repl,
+		       const struct nf_hook_ops *ops, struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -2073,10 +2073,8 @@ struct xt_table *ipt_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2091,15 +2089,16 @@ struct xt_table *ipt_register_table(struct net *net,
 		goto out_free;
 	}
 
-	return new_table;
+	WRITE_ONCE(*res, new_table);
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void ipt_unregister_table(struct net *net, struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+			  const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 397ef2dd133e..3fbe4acacb27 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -54,6 +54,7 @@ module_param(forward, bool, 0000);
 static int __net_init iptable_filter_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int err;
 
 	repl = ipt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
@@ -62,15 +63,15 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	((struct ipt_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	net->ipv4.iptable_filter =
-		ipt_register_table(net, &packet_filter, repl);
+	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
+				 &net->ipv4.iptable_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+	return err;
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_filter);
+	ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index ba5d392a13c4..668e79166b81 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -96,19 +96,20 @@ static struct nf_hook_ops *mangle_ops __read_mostly;
 static int __net_init iptable_mangle_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_mangle =
-		ipt_register_table(net, &packet_mangler, repl);
+	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
+				 &net->ipv4.iptable_mangle);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+	return ret;
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_mangle);
+	ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ae2cd2752046..e984f1d3017f 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -98,18 +98,20 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 static int __net_init iptable_nat_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
+				 nf_nat_ipv4_ops, &net->ipv4.nat_table);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+	return ret;
 }
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.nat_table);
+	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1ba02811acb0..9d78780a9036 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -37,19 +37,20 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init iptable_raw_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_raw =
-		ipt_register_table(net, &packet_raw, repl);
+	ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+				 &net->ipv4.iptable_raw);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+	return ret;
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_raw);
+	ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c2e23d5e9cd4..88bc52fb8f4a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -54,19 +54,20 @@ static struct nf_hook_ops *sectbl_ops __read_mostly;
 static int __net_init iptable_security_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_security =
-		ipt_register_table(net, &security_table, repl);
+	ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
+				 &net->ipv4.iptable_security);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+	return ret;
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_security);
+	ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
 }
 
 static struct pernet_operations iptable_security_net_ops = {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 99425cf2819b..052d7447b52e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2071,9 +2071,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-struct xt_table *ip6t_register_table(struct net *net,
-				     const struct xt_table *table,
-				     const struct ip6t_replace *repl)
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+			const struct ip6t_replace *repl,
+			const struct nf_hook_ops *ops,
+			struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -2082,10 +2083,8 @@ struct xt_table *ip6t_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2099,15 +2098,17 @@ struct xt_table *ip6t_register_table(struct net *net,
 		ret = PTR_ERR(new_table);
 		goto out_free;
 	}
-	return new_table;
+
+	WRITE_ONCE(*res, new_table);
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void ip6t_unregister_table(struct net *net, struct xt_table *table)
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 8b277b983ca5..d191d54cdf50 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -47,6 +47,7 @@ module_param(forward, bool, 0000);
 static int __net_init ip6table_filter_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int err;
 
 	repl = ip6t_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
@@ -55,15 +56,15 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	((struct ip6t_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	net->ipv6.ip6table_filter =
-		ip6t_register_table(net, &packet_filter, repl);
+	err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
+				  &net->ipv6.ip6table_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter);
+	return err;
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+	ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index abe278b07932..fe43d08284bc 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -91,19 +91,20 @@ static struct nf_hook_ops *mangle_ops __read_mostly;
 static int __net_init ip6table_mangle_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_mangle =
-		ip6t_register_table(net, &packet_mangler, repl);
+	ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
+				  &net->ipv6.ip6table_mangle);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle);
+	return ret;
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+	ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index de2a10a565f5..7f9740e8ef47 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -100,18 +100,20 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
 static int __net_init ip6table_nat_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+	ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
+				  nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat);
+	return ret;
 }
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+	ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9021963565c3..5fac433da069 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -30,19 +30,20 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init ip6table_raw_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_raw =
-		ip6t_register_table(net, &packet_raw, repl);
+	ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops,
+				  &net->ipv6.ip6table_raw);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw);
+	return ret;
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+	ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 0d856fedfeb0..cf587453e322 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -47,19 +47,20 @@ static struct nf_hook_ops *sectbl_ops __read_mostly;
 static int __net_init ip6table_security_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_security =
-		ip6t_register_table(net, &security_table, repl);
+	ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
+				  &net->ipv6.ip6table_security);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security);
+	return ret;
 }
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_security);
+	ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
 }
 
 static struct pernet_operations ip6table_security_net_ops = {
-- 
cgit v1.2.3


From b9e69e127397187b70c813a4397cce7afb5e8cb1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:36 +0100
Subject: netfilter: xtables: don't hook tables by default

delay hook registration until the table is being requested inside a
namespace.

Historically, a particular table (iptables mangle, ip6tables filter, etc)
was registered on module load.

When netns support was added to iptables only the ip/ip6tables ruleset was
made namespace aware, not the actual hook points.

This means f.e. that when ipt_filter table/module is loaded on a system,
then each namespace on that system has an (empty) iptables filter ruleset.

In other words, if a namespace sends a packet, such skb is 'caught' by
netfilter machinery and fed to hooking points for that table (i.e. INPUT,
FORWARD, etc).

Thanks to Eric Biederman, hooks are no longer global, but per namespace.

This means that we can avoid allocation of empty ruleset in a namespace and
defer hook registration until we need the functionality.

We register a table's hook entry points ONLY in the initial namespace.
When an iptables get/setsockopt is issued inside a given namespace, we check
if the table is found in the per-namespace list.

If not, we attempt to find it in the initial namespace, and, if found,
create an empty default table in the requesting namespace and register the
needed hooks.

Hook points are destroyed only once the namespace is deleted; there is no
'usage count' (it makes no sense since there is no 'remove table' operation
in xtables api).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h     |  6 ++--
 net/ipv4/netfilter/arp_tables.c        | 41 +++++++++++++--------
 net/ipv4/netfilter/arptable_filter.c   | 29 ++++++++-------
 net/ipv4/netfilter/ip_tables.c         | 42 ++++++++++++++--------
 net/ipv4/netfilter/iptable_filter.c    | 35 ++++++++++++------
 net/ipv4/netfilter/iptable_mangle.c    | 32 ++++++++++++-----
 net/ipv4/netfilter/iptable_nat.c       | 33 ++++++++---------
 net/ipv4/netfilter/iptable_raw.c       | 29 ++++++++++-----
 net/ipv4/netfilter/iptable_security.c  | 35 +++++++++++-------
 net/ipv6/netfilter/ip6_tables.c        | 42 ++++++++++++++--------
 net/ipv6/netfilter/ip6table_filter.c   | 38 ++++++++++++--------
 net/ipv6/netfilter/ip6table_mangle.c   | 37 +++++++++++--------
 net/ipv6/netfilter/ip6table_nat.c      | 33 ++++++++---------
 net/ipv6/netfilter/ip6table_raw.c      | 37 +++++++++++--------
 net/ipv6/netfilter/ip6table_security.c | 35 +++++++++++-------
 net/netfilter/x_tables.c               | 65 +++++++++++++++++++++-------------
 16 files changed, 361 insertions(+), 208 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index c5577410c25d..80a305b85323 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -200,6 +200,9 @@ struct xt_table {
 	u_int8_t af;		/* address/protocol family */
 	int priority;		/* hook order */
 
+	/* called when table is needed in the given netns */
+	int (*table_init)(struct net *net);
+
 	/* A unique name... */
 	const char name[XT_TABLE_MAXNAMELEN];
 };
@@ -408,8 +411,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 	return cnt;
 }
 
-struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *);
-void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *);
+struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 00eed0852dfc..bf081927e06b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1780,6 +1780,24 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	return ret;
 }
 
+static void __arpt_unregister_table(struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct arpt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int arpt_register_table(struct net *net,
 			const struct xt_table *table,
 			const struct arpt_replace *repl,
@@ -1810,8 +1828,15 @@ int arpt_register_table(struct net *net,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
 
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__arpt_unregister_table(new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -1822,20 +1847,8 @@ out_free:
 void arpt_unregister_table(struct net *net, struct xt_table *table,
 			   const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct arpt_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__arpt_unregister_table(table);
 }
 
 /* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 4c0241692576..dd8c80dc32a2 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
 			   (1 << NF_ARP_FORWARD))
 
+static int __net_init arptable_filter_table_init(struct net *net);
+
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_ARP,
 	.priority	= NF_IP_PRI_FILTER,
+	.table_init	= arptable_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c */
@@ -35,11 +38,14 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
 
-static int __net_init arptable_filter_net_init(struct net *net)
+static int __net_init arptable_filter_table_init(struct net *net)
 {
 	struct arpt_replace *repl;
 	int err;
 
+	if (net->ipv4.arptable_filter)
+		return 0;
+
 	repl = arpt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -51,11 +57,13 @@ static int __net_init arptable_filter_net_init(struct net *net)
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
+	if (!net->ipv4.arptable_filter)
+		return;
 	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+	net->ipv4.arptable_filter = NULL;
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
-	.init = arptable_filter_net_init,
 	.exit = arptable_filter_net_exit,
 };
 
@@ -63,26 +71,23 @@ static int __init arptable_filter_init(void)
 {
 	int ret;
 
+	arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+	if (IS_ERR(arpfilter_ops))
+		return PTR_ERR(arpfilter_ops);
+
 	ret = register_pernet_subsys(&arptable_filter_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(arpfilter_ops);
 		return ret;
-
-	arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
-	if (IS_ERR(arpfilter_ops)) {
-		ret = PTR_ERR(arpfilter_ops);
-		goto cleanup_table;
 	}
-	return ret;
 
-cleanup_table:
-	unregister_pernet_subsys(&arptable_filter_net_ops);
 	return ret;
 }
 
 static void __exit arptable_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, arpfilter_ops);
 	unregister_pernet_subsys(&arptable_filter_net_ops);
+	kfree(arpfilter_ops);
 }
 
 module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 1eb4fe5b4702..e53f8d6f326d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2062,6 +2062,24 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct ipt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter, net);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int ipt_register_table(struct net *net, const struct xt_table *table,
 		       const struct ipt_replace *repl,
 		       const struct nf_hook_ops *ops, struct xt_table **res)
@@ -2089,7 +2107,15 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__ipt_unregister_table(net, new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -2100,20 +2126,8 @@ out_free:
 void ipt_unregister_table(struct net *net, struct xt_table *table,
 			  const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct ipt_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter, net);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__ipt_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 3fbe4acacb27..7667f223d7f8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_filter_table_init(struct net *net);
 
 static const struct xt_table packet_filter = {
 	.name		= "filter",
@@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_FILTER,
+	.table_init	= iptable_filter_table_init,
 };
 
 static unsigned int
@@ -48,14 +50,17 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,
 static struct nf_hook_ops *filter_ops __read_mostly;
 
 /* Default to forward because I got too much mail already. */
-static bool forward = true;
+static bool forward __read_mostly = true;
 module_param(forward, bool, 0000);
 
-static int __net_init iptable_filter_net_init(struct net *net)
+static int __net_init iptable_filter_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int err;
 
+	if (net->ipv4.iptable_filter)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -69,9 +74,20 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	return err;
 }
 
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+	if (net == &init_net || !forward)
+		return iptable_filter_table_init(net);
+
+	return 0;
+}
+
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_filter)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
+	net->ipv4.iptable_filter = NULL;
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
@@ -83,24 +99,21 @@ static int __init iptable_filter_init(void)
 {
 	int ret;
 
+	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+	if (IS_ERR(filter_ops))
+		return PTR_ERR(filter_ops);
+
 	ret = register_pernet_subsys(&iptable_filter_net_ops);
 	if (ret < 0)
-		return ret;
-
-	/* Register hooks */
-	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
-	if (IS_ERR(filter_ops)) {
-		ret = PTR_ERR(filter_ops);
-		unregister_pernet_subsys(&iptable_filter_net_ops);
-	}
+		kfree(filter_ops);
 
 	return ret;
 }
 
 static void __exit iptable_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, filter_ops);
 	unregister_pernet_subsys(&iptable_filter_net_ops);
+	kfree(filter_ops);
 }
 
 module_init(iptable_filter_init);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 668e79166b81..57fc97cdac70 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
+static int __net_init iptable_mangle_table_init(struct net *net);
+
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_MANGLE,
+	.table_init	= iptable_mangle_table_init,
 };
 
 static unsigned int
@@ -92,12 +95,14 @@ iptable_mangle_hook(void *priv,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-
-static int __net_init iptable_mangle_net_init(struct net *net)
+static int __net_init iptable_mangle_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_mangle)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -109,11 +114,13 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_mangle)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
+	net->ipv4.iptable_mangle = NULL;
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
-	.init = iptable_mangle_net_init,
 	.exit = iptable_mangle_net_exit,
 };
 
@@ -121,15 +128,22 @@ static int __init iptable_mangle_init(void)
 {
 	int ret;
 
+	mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+	if (IS_ERR(mangle_ops)) {
+		ret = PTR_ERR(mangle_ops);
+		return ret;
+	}
+
 	ret = register_pernet_subsys(&iptable_mangle_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(mangle_ops);
 		return ret;
+	}
 
-	/* Register hooks */
-	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
-	if (IS_ERR(mangle_ops)) {
-		ret = PTR_ERR(mangle_ops);
+	ret = iptable_mangle_table_init(&init_net);
+	if (ret) {
 		unregister_pernet_subsys(&iptable_mangle_net_ops);
+		kfree(mangle_ops);
 	}
 
 	return ret;
@@ -137,8 +151,8 @@ static int __init iptable_mangle_init(void)
 
 static void __exit iptable_mangle_fini(void)
 {
-	xt_hook_unlink(&packet_mangler, mangle_ops);
 	unregister_pernet_subsys(&iptable_mangle_net_ops);
+	kfree(mangle_ops);
 }
 
 module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index e984f1d3017f..138a24bc76ad 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 
+static int __net_init iptable_nat_table_init(struct net *net);
+
 static const struct xt_table nf_nat_ipv4_table = {
 	.name		= "nat",
 	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
@@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
+	.table_init	= iptable_nat_table_init,
 };
 
 static unsigned int iptable_nat_do_chain(void *priv,
@@ -95,11 +98,14 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 	},
 };
 
-static int __net_init iptable_nat_net_init(struct net *net)
+static int __net_init iptable_nat_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.nat_table)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -111,36 +117,31 @@ static int __net_init iptable_nat_net_init(struct net *net)
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
+	if (!net->ipv4.nat_table)
+		return;
 	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+	net->ipv4.nat_table = NULL;
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
-	.init	= iptable_nat_net_init,
 	.exit	= iptable_nat_net_exit,
 };
 
 static int __init iptable_nat_init(void)
 {
-	int err;
-
-	err = register_pernet_subsys(&iptable_nat_net_ops);
-	if (err < 0)
-		goto err1;
+	int ret = register_pernet_subsys(&iptable_nat_net_ops);
 
-	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-	if (err < 0)
-		goto err2;
-	return 0;
+	if (ret)
+		return ret;
 
-err2:
-	unregister_pernet_subsys(&iptable_nat_net_ops);
-err1:
-	return err;
+	ret = iptable_nat_table_init(&init_net);
+	if (ret)
+		unregister_pernet_subsys(&iptable_nat_net_ops);
+	return ret;
 }
 
 static void __exit iptable_nat_exit(void)
 {
-	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
 	unregister_pernet_subsys(&iptable_nat_net_ops);
 }
 
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 9d78780a9036..2642ecd2645c 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -10,12 +10,15 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init iptable_raw_table_init(struct net *net);
+
 static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV4,
 	.priority = NF_IP_PRI_RAW,
+	.table_init = iptable_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -34,11 +37,14 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init iptable_raw_net_init(struct net *net)
+static int __net_init iptable_raw_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_raw)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -50,11 +56,13 @@ static int __net_init iptable_raw_net_init(struct net *net)
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_raw)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
+	net->ipv4.iptable_raw = NULL;
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
-	.init = iptable_raw_net_init,
 	.exit = iptable_raw_net_exit,
 };
 
@@ -62,15 +70,20 @@ static int __init iptable_raw_init(void)
 {
 	int ret;
 
+	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+	if (IS_ERR(rawtable_ops))
+		return PTR_ERR(rawtable_ops);
+
 	ret = register_pernet_subsys(&iptable_raw_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(rawtable_ops);
 		return ret;
+	}
 
-	/* Register hooks */
-	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
-	if (IS_ERR(rawtable_ops)) {
-		ret = PTR_ERR(rawtable_ops);
+	ret = iptable_raw_table_init(&init_net);
+	if (ret) {
 		unregister_pernet_subsys(&iptable_raw_net_ops);
+		kfree(rawtable_ops);
 	}
 
 	return ret;
@@ -78,8 +91,8 @@ static int __init iptable_raw_init(void)
 
 static void __exit iptable_raw_fini(void)
 {
-	xt_hook_unlink(&packet_raw, rawtable_ops);
 	unregister_pernet_subsys(&iptable_raw_net_ops);
+	kfree(rawtable_ops);
 }
 
 module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 88bc52fb8f4a..ff226596e4b5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
+static int __net_init iptable_security_table_init(struct net *net);
+
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_SECURITY,
+	.table_init	= iptable_security_table_init,
 };
 
 static unsigned int
@@ -51,11 +54,14 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init iptable_security_net_init(struct net *net)
+static int __net_init iptable_security_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_security)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -67,11 +73,14 @@ static int __net_init iptable_security_net_init(struct net *net)
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_security)
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
+	net->ipv4.iptable_security = NULL;
 }
 
 static struct pernet_operations iptable_security_net_ops = {
-	.init = iptable_security_net_init,
 	.exit = iptable_security_net_exit,
 };
 
@@ -79,27 +88,29 @@ static int __init iptable_security_init(void)
 {
 	int ret;
 
+	sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+	if (IS_ERR(sectbl_ops))
+		return PTR_ERR(sectbl_ops);
+
 	ret = register_pernet_subsys(&iptable_security_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(sectbl_ops);
 		return ret;
-
-	sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
-	if (IS_ERR(sectbl_ops)) {
-		ret = PTR_ERR(sectbl_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
+	ret = iptable_security_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&iptable_security_net_ops);
+		kfree(sectbl_ops);
+	}
 
-cleanup_table:
-	unregister_pernet_subsys(&iptable_security_net_ops);
 	return ret;
 }
 
 static void __exit iptable_security_fini(void)
 {
-	xt_hook_unlink(&security_table, sectbl_ops);
 	unregister_pernet_subsys(&iptable_security_net_ops);
+	kfree(sectbl_ops);
 }
 
 module_init(iptable_security_init);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 052d7447b52e..84f9baf7aee8 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2071,6 +2071,24 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
+static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct ip6t_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter, net);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
 			const struct nf_hook_ops *ops,
@@ -2099,7 +2117,15 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__ip6t_unregister_table(net, new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -2110,20 +2136,8 @@ out_free:
 void ip6t_unregister_table(struct net *net, struct xt_table *table,
 			   const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct ip6t_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter, net);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__ip6t_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index d191d54cdf50..1343077dde93 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table");
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init ip6table_filter_table_init(struct net *net);
+
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_FILTER,
+	.table_init	= ip6table_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -44,11 +47,14 @@ static struct nf_hook_ops *filter_ops __read_mostly;
 static bool forward = true;
 module_param(forward, bool, 0000);
 
-static int __net_init ip6table_filter_net_init(struct net *net)
+static int __net_init ip6table_filter_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int err;
 
+	if (net->ipv6.ip6table_filter)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -62,9 +68,20 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	return err;
 }
 
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+	if (net == &init_net || !forward)
+		return ip6table_filter_table_init(net);
+
+	return 0;
+}
+
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_filter)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
+	net->ipv6.ip6table_filter = NULL;
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
@@ -76,28 +93,21 @@ static int __init ip6table_filter_init(void)
 {
 	int ret;
 
+	filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
+	if (IS_ERR(filter_ops))
+		return PTR_ERR(filter_ops);
+
 	ret = register_pernet_subsys(&ip6table_filter_net_ops);
 	if (ret < 0)
-		return ret;
-
-	/* Register hooks */
-	filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
-	if (IS_ERR(filter_ops)) {
-		ret = PTR_ERR(filter_ops);
-		goto cleanup_table;
-	}
+		kfree(filter_ops);
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_filter_net_ops);
-	return ret;
 }
 
 static void __exit ip6table_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, filter_ops);
 	unregister_pernet_subsys(&ip6table_filter_net_ops);
+	kfree(filter_ops);
 }
 
 module_init(ip6table_filter_init);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index fe43d08284bc..cb2b28883252 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
+static int __net_init ip6table_mangle_table_init(struct net *net);
+
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_MANGLE,
+	.table_init	= ip6table_mangle_table_init,
 };
 
 static unsigned int
@@ -88,11 +91,14 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_net_init(struct net *net)
+static int __net_init ip6table_mangle_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_mangle)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -104,11 +110,14 @@ static int __net_init ip6table_mangle_net_init(struct net *net)
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_mangle)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
+	net->ipv6.ip6table_mangle = NULL;
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
-	.init = ip6table_mangle_net_init,
 	.exit = ip6table_mangle_net_exit,
 };
 
@@ -116,28 +125,28 @@ static int __init ip6table_mangle_init(void)
 {
 	int ret;
 
+	mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
+	if (IS_ERR(mangle_ops))
+		return PTR_ERR(mangle_ops);
+
 	ret = register_pernet_subsys(&ip6table_mangle_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(mangle_ops);
 		return ret;
-
-	/* Register hooks */
-	mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
-	if (IS_ERR(mangle_ops)) {
-		ret = PTR_ERR(mangle_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	ret = ip6table_mangle_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_mangle_net_ops);
+		kfree(mangle_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_mangle_fini(void)
 {
-	xt_hook_unlink(&packet_mangler, mangle_ops);
 	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	kfree(mangle_ops);
 }
 
 module_init(ip6table_mangle_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 7f9740e8ef47..7d2bd940291f 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -20,6 +20,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 
+static int __net_init ip6table_nat_table_init(struct net *net);
+
 static const struct xt_table nf_nat_ipv6_table = {
 	.name		= "nat",
 	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
@@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
+	.table_init	= ip6table_nat_table_init,
 };
 
 static unsigned int ip6table_nat_do_chain(void *priv,
@@ -97,11 +100,14 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
 	},
 };
 
-static int __net_init ip6table_nat_net_init(struct net *net)
+static int __net_init ip6table_nat_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_nat)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -113,36 +119,31 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_nat)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+	net->ipv6.ip6table_nat = NULL;
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
-	.init	= ip6table_nat_net_init,
 	.exit	= ip6table_nat_net_exit,
 };
 
 static int __init ip6table_nat_init(void)
 {
-	int err;
-
-	err = register_pernet_subsys(&ip6table_nat_net_ops);
-	if (err < 0)
-		goto err1;
+	int ret = register_pernet_subsys(&ip6table_nat_net_ops);
 
-	err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
-	if (err < 0)
-		goto err2;
-	return 0;
+	if (ret)
+		return ret;
 
-err2:
-	unregister_pernet_subsys(&ip6table_nat_net_ops);
-err1:
-	return err;
+	ret = ip6table_nat_table_init(&init_net);
+	if (ret)
+		unregister_pernet_subsys(&ip6table_nat_net_ops);
+	return ret;
 }
 
 static void __exit ip6table_nat_exit(void)
 {
-	nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
 	unregister_pernet_subsys(&ip6table_nat_net_ops);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 5fac433da069..d4bc56443dc1 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -9,12 +9,15 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init ip6table_raw_table_init(struct net *net);
+
 static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV6,
 	.priority = NF_IP6_PRI_RAW,
+	.table_init = ip6table_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -27,11 +30,14 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init ip6table_raw_net_init(struct net *net)
+static int __net_init ip6table_raw_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_raw)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -43,11 +49,13 @@ static int __net_init ip6table_raw_net_init(struct net *net)
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_raw)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
+	net->ipv6.ip6table_raw = NULL;
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
-	.init = ip6table_raw_net_init,
 	.exit = ip6table_raw_net_exit,
 };
 
@@ -55,28 +63,29 @@ static int __init ip6table_raw_init(void)
 {
 	int ret;
 
+	/* Register hooks */
+	rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
+	if (IS_ERR(rawtable_ops))
+		return PTR_ERR(rawtable_ops);
+
 	ret = register_pernet_subsys(&ip6table_raw_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(rawtable_ops);
 		return ret;
-
-	/* Register hooks */
-	rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
-	if (IS_ERR(rawtable_ops)) {
-		ret = PTR_ERR(rawtable_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	ret = ip6table_raw_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_raw_net_ops);
+		kfree(rawtable_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_raw_fini(void)
 {
-	xt_hook_unlink(&packet_raw, rawtable_ops);
 	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	kfree(rawtable_ops);
 }
 
 module_init(ip6table_raw_init);
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index cf587453e322..cf26ccb04056 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
+static int __net_init ip6table_security_table_init(struct net *net);
+
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_SECURITY,
+	.table_init     = ip6table_security_table_init,
 };
 
 static unsigned int
@@ -44,11 +47,14 @@ ip6table_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init ip6table_security_net_init(struct net *net)
+static int __net_init ip6table_security_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_security)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -60,11 +66,13 @@ static int __net_init ip6table_security_net_init(struct net *net)
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_security)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
+	net->ipv6.ip6table_security = NULL;
 }
 
 static struct pernet_operations ip6table_security_net_ops = {
-	.init = ip6table_security_net_init,
 	.exit = ip6table_security_net_exit,
 };
 
@@ -72,27 +80,28 @@ static int __init ip6table_security_init(void)
 {
 	int ret;
 
+	sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
+	if (IS_ERR(sectbl_ops))
+		return PTR_ERR(sectbl_ops);
+
 	ret = register_pernet_subsys(&ip6table_security_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(sectbl_ops);
 		return ret;
-
-	sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
-	if (IS_ERR(sectbl_ops)) {
-		ret = PTR_ERR(sectbl_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
-cleanup_table:
-	unregister_pernet_subsys(&ip6table_security_net_ops);
+	ret = ip6table_security_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_security_net_ops);
+		kfree(sectbl_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_security_fini(void)
 {
-	xt_hook_unlink(&security_table, sectbl_ops);
 	unregister_pernet_subsys(&ip6table_security_net_ops);
+	kfree(sectbl_ops);
 }
 
 module_init(ip6table_security_init);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c8a0b7da5ff4..d0cd2b9bf844 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -694,12 +694,45 @@ EXPORT_SYMBOL(xt_free_table_info);
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
 {
-	struct xt_table *t;
+	struct xt_table *t, *found = NULL;
 
 	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &net->xt.tables[af], list)
 		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
 			return t;
+
+	if (net == &init_net)
+		goto out;
+
+	/* Table doesn't exist in this netns, re-try init */
+	list_for_each_entry(t, &init_net.xt.tables[af], list) {
+		if (strcmp(t->name, name))
+			continue;
+		if (!try_module_get(t->me))
+			goto out;
+
+		mutex_unlock(&xt[af].mutex);
+		if (t->table_init(net) != 0) {
+			module_put(t->me);
+			return NULL;
+		}
+
+		found = t;
+
+		mutex_lock(&xt[af].mutex);
+		break;
+	}
+
+	if (!found)
+		goto out;
+
+	/* table_init() ran unlocked, so search this netns again under the mutex */
+	list_for_each_entry(t, &net->xt.tables[af], list)
+		if (strcmp(t->name, name) == 0)
+			return t;
+
+	module_put(found->me);
+ out:
 	mutex_unlock(&xt[af].mutex);
 	return NULL;
 }
@@ -1170,20 +1203,20 @@ static const struct file_operations xt_target_ops = {
 #endif /* CONFIG_PROC_FS */
 
 /**
- * xt_hook_link - set up hooks for a new table
+ * xt_hook_ops_alloc - set up hooks for a new table
  * @table:	table with metadata needed to set up hooks
  * @fn:		Hook function
  *
- * This function will take care of creating and registering the necessary
- * Netfilter hooks for XT tables.
+ * This function will create the nf_hook_ops that the x_table needs
+ * to hand to xt_hook_link_net().
  */
-struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+struct nf_hook_ops *
+xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
 {
 	unsigned int hook_mask = table->valid_hooks;
 	uint8_t i, num_hooks = hweight32(hook_mask);
 	uint8_t hooknum;
 	struct nf_hook_ops *ops;
-	int ret;
 
 	ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
 	if (ops == NULL)
@@ -1200,27 +1233,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
 		++i;
 	}
 
-	ret = nf_register_hooks(ops, num_hooks);
-	if (ret < 0) {
-		kfree(ops);
-		return ERR_PTR(ret);
-	}
-
 	return ops;
 }
-EXPORT_SYMBOL_GPL(xt_hook_link);
-
-/**
- * xt_hook_unlink - remove hooks for a table
- * @ops:	nf_hook_ops array as returned by nf_hook_link
- * @hook_mask:	the very same mask that was passed to nf_hook_link
- */
-void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
-{
-	nf_unregister_hooks(ops, hweight32(table->valid_hooks));
-	kfree(ops);
-}
-EXPORT_SYMBOL_GPL(xt_hook_unlink);
+EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
 
 int xt_proto_init(struct net *net, u_int8_t af)
 {
-- 
cgit v1.2.3


From af4610c39589d839551da104f7da342d86f23ea0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:38 +0100
Subject: netfilter: don't call hooks unless needed

With the previous patches in place, a netns nf_hook_list might be empty,
even if e.g. init_net performs filtering.

Thus change nf_hook_thresh to check the hook_list as well before
initializing hook_state and calling nf_hook_slow().

We still make use of static keys; if no netfilter modules are loaded
list is guaranteed to be empty.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 0ad556726181..9230f9aee896 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -141,22 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-
-static inline bool nf_hook_list_active(struct list_head *hook_list,
-				       u_int8_t pf, unsigned int hook)
-{
-	if (__builtin_constant_p(pf) &&
-	    __builtin_constant_p(hook))
-		return static_key_false(&nf_hooks_needed[pf][hook]);
-
-	return !list_empty(hook_list);
-}
-#else
-static inline bool nf_hook_list_active(struct list_head *hook_list,
-				       u_int8_t pf, unsigned int hook)
-{
-	return !list_empty(hook_list);
-}
 #endif
 
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
@@ -177,9 +161,18 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct net *, struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	struct list_head *hook_list = &net->nf.hooks[pf][hook];
+	struct list_head *hook_list;
+
+#ifdef HAVE_JUMP_LABEL
+	if (__builtin_constant_p(pf) &&
+	    __builtin_constant_p(hook) &&
+	    !static_key_false(&nf_hooks_needed[pf][hook]))
+		return 1;
+#endif
+
+	hook_list = &net->nf.hooks[pf][hook];
 
-	if (nf_hook_list_active(hook_list, pf, hook)) {
+	if (!list_empty(hook_list)) {
 		struct nf_hook_state state;
 
 		nf_hook_state_init(&state, hook_list, hook, thresh,
-- 
cgit v1.2.3


From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 1 Mar 2016 19:55:14 +0100
Subject: netfilter: nft_masq: support port range

Complete masquerading support by allowing port range selection.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_masq.h         |  4 ++-
 include/uapi/linux/netfilter/nf_tables.h |  4 +++
 net/ipv4/netfilter/nft_masq_ipv4.c       |  7 ++++-
 net/ipv6/netfilter/nft_masq_ipv6.c       |  7 ++++-
 net/netfilter/nft_masq.c                 | 51 +++++++++++++++++++++++++-------
 5 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h
index e2a518b60e19..a3f3c11b2526 100644
--- a/include/net/netfilter/nft_masq.h
+++ b/include/net/netfilter/nft_masq.h
@@ -2,7 +2,9 @@
 #define _NFT_MASQ_H_
 
 struct nft_masq {
-	u32	flags;
+	u32			flags;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
 };
 
 extern const struct nla_policy nft_masq_policy[];
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b19be0a098c0..eeffde196f80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -951,10 +951,14 @@ enum nft_nat_attributes {
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
  * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32)
+ * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_masq_attributes {
 	NFTA_MASQ_UNSPEC,
 	NFTA_MASQ_FLAGS,
+	NFTA_MASQ_REG_PROTO_MIN,
+	NFTA_MASQ_REG_PROTO_MAX,
 	__NFTA_MASQ_MAX
 };
 #define NFTA_MASQ_MAX		(__NFTA_MASQ_MAX - 1)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
 						    &range, pkt->out);
 }
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
 }
 
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
 #include <net/netfilter/nft_masq.h>
 
 const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
-	[NFTA_MASQ_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_MASQ_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MAX]	= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(nft_masq_policy);
 
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nft_expr *expr,
 		  const struct nlattr * const tb[])
 {
+	u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_proto.all);
 	struct nft_masq *priv = nft_expr_priv(expr);
 	int err;
 
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
 	if (err)
 		return err;
 
-	if (tb[NFTA_MASQ_FLAGS] == NULL)
-		return 0;
-
-	priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
-	if (priv->flags & ~NF_NAT_RANGE_MASK)
-		return -EINVAL;
+	if (tb[NFTA_MASQ_FLAGS]) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+		if (priv->flags & ~NF_NAT_RANGE_MASK)
+			return -EINVAL;
+	}
+
+	if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+		priv->sreg_proto_min =
+			nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+		err = nft_validate_register_load(priv->sreg_proto_min, plen);
+		if (err < 0)
+			return err;
+
+		if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+			priv->sreg_proto_max =
+				nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+			err = nft_validate_register_load(priv->sreg_proto_max,
+							 plen);
+			if (err < 0)
+				return err;
+		} else {
+			priv->sreg_proto_max = priv->sreg_proto_min;
+		}
+	}
 
 	return 0;
 }
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_masq *priv = nft_expr_priv(expr);
 
-	if (priv->flags == 0)
-		return 0;
-
-	if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+	if (priv->flags != 0 &&
+	    nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
 		goto nla_put_failure;
 
+	if (priv->sreg_proto_min) {
+		if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+				      priv->sreg_proto_min) ||
+		    nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+				      priv->sreg_proto_max))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3


From afea03656add70a0e00f5b0039f87288c7af8b9f Mon Sep 17 00:00:00 2001
From: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Date: Mon, 29 Feb 2016 14:27:28 +0100
Subject: stmmac: rework DMA bus setting and introduce new platform AXI
 structure

This patch restructures the DMA bus settings and this is done
by introducing a new platform structure used for programming
the AXI Bus Mode Register inside the DMA module.
This structure can be populated from device-tree as documented in the
binding txt file.

After initializing the DMA, the AXI register can optionally be tuned
by the platform driver.
This patch also reworks some parameters to make the DMA configuration
coherent now that the AXI register is introduced.
For example, the burst_len is managed by using the AXI support
mentioned above, so the snps,burst_len parameter has been removed.
It makes sense to provide the AAL parameter from DT to set Address-Aligned
Beats in Register0 and to review the PBL settings when initializing
the engine.

For the PCI glue: looking back at the history of this setting, it
was added to align a configuration, not to fix a known problem.
No issue has been raised after this patch.
It is safe to use the default burst length instead of
tuning it to the maximum value.

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt   | 54 ++++++++----
 drivers/net/ethernet/stmicro/stmmac/common.h       |  5 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac1000_dma.c    | 99 +++++++++++++++-------
 drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h    | 34 +++++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 12 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c   |  4 +-
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 42 ++++++++-
 include/linux/stmmac.h                             | 17 +++-
 10 files changed, 209 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index e862a922bd3f..6605d19601c2 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -17,7 +17,25 @@ Required properties:
 	The 1st cell is reset pre-delay in micro seconds.
 	The 2nd cell is reset pulse in micro seconds.
 	The 3rd cell is reset post-delay in micro seconds.
+
+Optional properties:
+- resets: Should contain a phandle to the STMMAC reset signal, if any
+- reset-names: Should contain the reset signal name "stmmaceth", if a
+	reset phandle is given
+- max-frame-size: See ethernet.txt file in the same directory
+- clocks: If present, the first clock should be the GMAC main clock and
+  the second clock should be peripheral's register interface clock. Further
+  clocks may be specified in derived bindings.
+- clock-names: One name for each entry in the clocks property, the
+  first one should be "stmmaceth" and the second one should be "pclk".
+- clk_ptp_ref: this is the PTP reference clock; in case of the PTP is
+  available this clock is used for programming the Timestamp Addend Register.
+  If not passed then the system clock will be used and this is fine on some
+  platforms.
+- tx-fifo-depth: See ethernet.txt file in the same directory
+- rx-fifo-depth: See ethernet.txt file in the same directory
 - snps,pbl		Programmable Burst Length
+- snps,aal		Address-Aligned Beats
 - snps,fixed-burst	Program the DMA to use the fixed burst mode
 - snps,mixed-burst	Program the DMA to use the mixed burst mode
 - snps,force_thresh_dma_mode	Force DMA to use the threshold mode for
@@ -29,27 +47,28 @@ Required properties:
 				supported by this device instance
 - snps,perfect-filter-entries:	Number of perfect filter entries supported
 				by this device instance
-
-Optional properties:
-- resets: Should contain a phandle to the STMMAC reset signal, if any
-- reset-names: Should contain the reset signal name "stmmaceth", if a
-	reset phandle is given
-- max-frame-size: See ethernet.txt file in the same directory
-- clocks: If present, the first clock should be the GMAC main clock
-  The optional second clock should be peripheral's register interface clock.
-  The third optional clock should be the ptp reference clock.
-  Further clocks may be specified in derived bindings.
-- clock-names: One name for each entry in the clocks property.
-  The first one should be "stmmaceth".
-  The optional second one should be "pclk".
-  The optional third one should be "clk_ptp_ref".
-- snps,burst_len: The AXI burst lenth value of the AXI BUS MODE register.
-- tx-fifo-depth: See ethernet.txt file in the same directory
-- rx-fifo-depth: See ethernet.txt file in the same directory
+- AXI BUS Mode parameters: below the list of all the parameters to program the
+			   AXI register inside the DMA module:
+	- snps,lpi_en: enable Low Power Interface
+	- snps,xit_frm: unlock on WoL
+	- snps,wr_osr_lmt: max write outstanding req. limit
+	- snps,rd_osr_lmt: max read outstanding req. limit
+	- snps,kbbe: do not cross 1KiB boundary.
+	- snps,axi_all: align address
+	- snps,blen: this is a vector of supported burst length.
+	- snps,fb: fixed-burst
+	- snps,mb: mixed-burst
+	- snps,rb: rebuild INCRx Burst
 - mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
 
 Examples:
 
+	stmmac_axi_setup: stmmac-axi-config {
+		snps,wr_osr_lmt = <0xf>;
+		snps,rd_osr_lmt = <0xf>;
+		snps,blen = <256 128 64 32 0 0 0>;
+	};
+
 	gmac0: ethernet@e0800000 {
 		compatible = "st,spear600-gmac";
 		reg = <0xe0800000 0x8000>;
@@ -65,6 +84,7 @@ Examples:
 		tx-fifo-depth = <16384>;
 		clocks = <&clock>;
 		clock-names = "stmmaceth";
+		snps,axi-config = <&stmmac_axi_setup>;
 		mdio0 {
 			#address-cells = <1>;
 			#size-cells = <0>;
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index bac0e44d7634..586a33624dd2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -27,6 +27,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/netdevice.h>
+#include <linux/stmmac.h>
 #include <linux/phy.h>
 #include <linux/module.h>
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
@@ -378,7 +379,9 @@ struct stmmac_dma_ops {
 	/* DMA core initialization */
 	int (*reset)(void __iomem *ioaddr);
 	void (*init)(void __iomem *ioaddr, int pbl, int fb, int mb,
-		     int burst_len, u32 dma_tx, u32 dma_rx, int atds);
+		     int aal, u32 dma_tx, u32 dma_rx, int atds);
+	/* Configure the AXI Bus Mode Register */
+	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
 	/* Dump DMA registers */
 	void (*dump_regs) (void __iomem *ioaddr);
 	/* Set tx/rx threshold in the csr6 register
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index 9d36ae788429..b0593a4268ee 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -240,7 +240,7 @@ enum rx_tx_priority_ratio {
 #define DMA_BUS_MODE_RPBL_MASK	0x003e0000	/* Rx-Programmable Burst Len */
 #define DMA_BUS_MODE_RPBL_SHIFT	17
 #define DMA_BUS_MODE_USP	0x00800000
-#define DMA_BUS_MODE_PBL	0x01000000
+#define DMA_BUS_MODE_MAXPBL	0x01000000
 #define DMA_BUS_MODE_AAL	0x02000000
 
 /* DMA CRS Control and Status Register Mapping */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 5f0aea56b298..da32d6037e3e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -30,24 +30,76 @@
 #include "dwmac1000.h"
 #include "dwmac_dma.h"
 
+static void dwmac1000_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
+{
+	u32 value = readl(ioaddr + DMA_AXI_BUS_MODE);
+	int i;
+
+	pr_info("dwmac1000: Master AXI performs %s burst length\n",
+		!(value & DMA_AXI_UNDEF) ? "fixed" : "any");
+
+	if (axi->axi_lpi_en)
+		value |= DMA_AXI_EN_LPI;
+	if (axi->axi_xit_frm)
+		value |= DMA_AXI_LPI_XIT_FRM;
+
+	value |= (axi->axi_wr_osr_lmt & DMA_AXI_WR_OSR_LMT_MASK) <<
+		 DMA_AXI_WR_OSR_LMT_SHIFT;
+
+	value |= (axi->axi_rd_osr_lmt & DMA_AXI_RD_OSR_LMT_MASK) <<
+		 DMA_AXI_RD_OSR_LMT_SHIFT;
+
+	/* Depending on the UNDEF bit the Master AXI will perform any burst
+	 * length according to the BLEN programmed (by default all BLEN are
+	 * set). NOTE(review): value is only OR-ed, so set bits never clear.
+	 */
+	for (i = 0; i < AXI_BLEN; i++) {
+		switch (axi->axi_blen[i]) {
+		case 256:
+			value |= DMA_AXI_BLEN256;
+			break;
+		case 128:
+			value |= DMA_AXI_BLEN128;
+			break;
+		case 64:
+			value |= DMA_AXI_BLEN64;
+			break;
+		case 32:
+			value |= DMA_AXI_BLEN32;
+			break;
+		case 16:
+			value |= DMA_AXI_BLEN16;
+			break;
+		case 8:
+			value |= DMA_AXI_BLEN8;
+			break;
+		case 4:
+			value |= DMA_AXI_BLEN4;
+			break;
+		}
+	}
+
+	writel(value, ioaddr + DMA_AXI_BUS_MODE);
+}
+
 static void dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
-			       int burst_len, u32 dma_tx, u32 dma_rx, int atds)
+			       int aal, u32 dma_tx, u32 dma_rx, int atds)
 {
-	u32 value;
+	u32 value = readl(ioaddr + DMA_BUS_MODE);
 
 	/*
-	 * Set the DMA PBL (Programmable Burst Length) mode
-	 * Before stmmac core 3.50 this mode bit was 4xPBL, and
+	 * Set the DMA PBL (Programmable Burst Length) mode.
+	 *
+	 * Note: before stmmac core 3.50 this mode bit was 4xPBL, and
 	 * post 3.5 mode bit acts as 8*PBL.
-	 * For core rev < 3.5, when the core is set for 4xPBL mode, the
-	 * DMA transfers the data in 4, 8, 16, 32, 64 & 128 beats
-	 * depending on pbl value.
-	 * For core rev > 3.5, when the core is set for 8xPBL mode, the
-	 * DMA transfers the data in 8, 16, 32, 64, 128 & 256 beats
-	 * depending on pbl value.
+	 *
+	 * This configuration doesn't take care about the Separate PBL
+	 * so only the bits: 13-8 are programmed with the PBL passed from the
+	 * platform.
 	 */
-	value = DMA_BUS_MODE_PBL | ((pbl << DMA_BUS_MODE_PBL_SHIFT) |
-				    (pbl << DMA_BUS_MODE_RPBL_SHIFT));
+	value |= DMA_BUS_MODE_MAXPBL;
+	value &= ~DMA_BUS_MODE_PBL_MASK;
+	value |= (pbl << DMA_BUS_MODE_PBL_SHIFT);
 
 	/* Set the Fixed burst mode */
 	if (fb)
@@ -60,26 +112,10 @@ static void dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
 	if (atds)
 		value |= DMA_BUS_MODE_ATDS;
 
-	writel(value, ioaddr + DMA_BUS_MODE);
+	if (aal)
+		value |= DMA_BUS_MODE_AAL;
 
-	/* In case of GMAC AXI configuration, program the DMA_AXI_BUS_MODE
-	 * for supported bursts.
-	 *
-	 * Note: This is applicable only for revision GMACv3.61a. For
-	 * older version this register is reserved and shall have no
-	 * effect.
-	 *
-	 * Note:
-	 *  For Fixed Burst Mode: if we directly write 0xFF to this
-	 *  register using the configurations pass from platform code,
-	 *  this would ensure that all bursts supported by core are set
-	 *  and those which are not supported would remain ineffective.
-	 *
-	 *  For Non Fixed Burst Mode: provide the maximum value of the
-	 *  burst length. Any burst equal or below the provided burst
-	 *  length would be allowed to perform.
-	 */
-	writel(burst_len, ioaddr + DMA_AXI_BUS_MODE);
+	writel(value, ioaddr + DMA_BUS_MODE);
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_INTR_DEFAULT_MASK, ioaddr + DMA_INTR_ENA);
@@ -192,6 +228,7 @@ static void dwmac1000_rx_watchdog(void __iomem *ioaddr, u32 riwt)
 const struct stmmac_dma_ops dwmac1000_dma_ops = {
 	.reset = dwmac_dma_reset,
 	.init = dwmac1000_dma_init,
+	.axi = dwmac1000_dma_axi,
 	.dump_regs = dwmac1000_dump_dma_regs,
 	.dma_mode = dwmac1000_dma_operation_mode,
 	.enable_dma_transmission = dwmac_enable_dma_transmission,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
index c40582a938a4..61f54c99a7de 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
@@ -33,7 +33,7 @@
 #include "dwmac_dma.h"
 
 static void dwmac100_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
-			      int burst_len, u32 dma_tx, u32 dma_rx, int atds)
+			      int aal, u32 dma_tx, u32 dma_rx, int atds)
 {
 	/* Enable Application Access by writing to DMA CSR0 */
 	writel(DMA_BUS_MODE_DEFAULT | (pbl << DMA_BUS_MODE_PBL_SHIFT),
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
index 13ca90e23479..726d9d9aaf83 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
@@ -41,8 +41,40 @@
 
 /* Rx watchdog register */
 #define DMA_RX_WATCHDOG		0x00001024
-/* AXI Bus Mode */
+
+/* AXI Master Bus Mode */
 #define DMA_AXI_BUS_MODE	0x00001028
+
+#define DMA_AXI_EN_LPI		BIT(31)
+#define DMA_AXI_LPI_XIT_FRM	BIT(30)
+#define DMA_AXI_WR_OSR_LMT	GENMASK(23, 20)
+#define DMA_AXI_WR_OSR_LMT_SHIFT	20
+#define DMA_AXI_WR_OSR_LMT_MASK	0xf
+#define DMA_AXI_RD_OSR_LMT	GENMASK(19, 16)
+#define DMA_AXI_RD_OSR_LMT_SHIFT	16
+#define DMA_AXI_RD_OSR_LMT_MASK	0xf
+
+#define DMA_AXI_OSR_MAX		0xf
+#define DMA_AXI_MAX_OSR_LIMIT ((DMA_AXI_OSR_MAX << DMA_AXI_WR_OSR_LMT_SHIFT) | \
+			       (DMA_AXI_OSR_MAX << DMA_AXI_RD_OSR_LMT_SHIFT))
+#define	DMA_AXI_1KBBE		BIT(13)
+#define DMA_AXI_AAL		BIT(12)
+#define DMA_AXI_BLEN256		BIT(7)
+#define DMA_AXI_BLEN128		BIT(6)
+#define DMA_AXI_BLEN64		BIT(5)
+#define DMA_AXI_BLEN32		BIT(4)
+#define DMA_AXI_BLEN16		BIT(3)
+#define DMA_AXI_BLEN8		BIT(2)
+#define DMA_AXI_BLEN4		BIT(1)
+#define DMA_BURST_LEN_DEFAULT	(DMA_AXI_BLEN256 | DMA_AXI_BLEN128 | \
+				 DMA_AXI_BLEN64 | DMA_AXI_BLEN32 | \
+				 DMA_AXI_BLEN16 | DMA_AXI_BLEN8 | \
+				 DMA_AXI_BLEN4)
+
+#define DMA_AXI_UNDEF		BIT(0)
+
+#define DMA_AXI_BURST_LEN_MASK	0x000000FE
+
 #define DMA_CUR_TX_BUF_ADDR	0x00001050	/* Current Host Tx Buffer */
 #define DMA_CUR_RX_BUF_ADDR	0x00001054	/* Current Host Rx Buffer */
 #define DMA_HW_FEATURE		0x00001058	/* HW Feature Register */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 13752e933e43..89c26268822e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1635,7 +1635,7 @@ static void stmmac_check_ether_addr(struct stmmac_priv *priv)
  */
 static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 {
-	int pbl = DEFAULT_DMA_PBL, fixed_burst = 0, burst_len = 0;
+	int pbl = DEFAULT_DMA_PBL, fixed_burst = 0, aal = 0;
 	int mixed_burst = 0;
 	int atds = 0;
 	int ret = 0;
@@ -1644,7 +1644,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		pbl = priv->plat->dma_cfg->pbl;
 		fixed_burst = priv->plat->dma_cfg->fixed_burst;
 		mixed_burst = priv->plat->dma_cfg->mixed_burst;
-		burst_len = priv->plat->dma_cfg->burst_len;
+		aal = priv->plat->dma_cfg->aal;
 	}
 
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
@@ -1657,8 +1657,12 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	}
 
 	priv->hw->dma->init(priv->ioaddr, pbl, fixed_burst, mixed_burst,
-			    burst_len, priv->dma_tx_phy,
-			    priv->dma_rx_phy, atds);
+			    aal, priv->dma_tx_phy, priv->dma_rx_phy, atds);
+
+	if ((priv->synopsys_id >= DWMAC_CORE_3_50) &&
+	    (priv->plat->axi && priv->hw->dma->axi))
+		priv->hw->dma->axi(priv->ioaddr, priv->plat->axi);
+
 	return ret;
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index d71a721ea61c..ae4388735b7f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -81,7 +81,7 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat)
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 32;
-	plat->dma_cfg->burst_len = DMA_AXI_BLEN_256;
+	/* TODO: AXI */
 
 	/* Set default value for multicast hash bins */
 	plat->multicast_filter_bins = HASH_TABLE_SIZE;
@@ -115,8 +115,8 @@ static int quark_default_data(struct plat_stmmacenet_data *plat,
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 16;
-	plat->dma_cfg->burst_len = DMA_AXI_BLEN_256;
 	plat->dma_cfg->fixed_burst = 1;
+	/* AXI (TODO) */
 
 	/* Set default value for multicast hash bins */
 	plat->multicast_filter_bins = HASH_TABLE_SIZE;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 6a52fa18cbf2..69ccf486d4fa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -95,6 +95,42 @@ static int dwmac1000_validate_ucast_entries(int ucast_entries)
 	return x;
 }
 
+/**
+ * stmmac_axi_setup - parse DT parameters for programming the AXI register
+ * @pdev: platform device
+ *
+ * Reads the optional "snps,axi-config" node. Returns NULL when the phandle
+ * is absent, ERR_PTR(-ENOMEM) when the allocation fails, otherwise a
+ * zero-allocated stmmac_axi filled in from the properties of that node.
+ */
+static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev)
+{
+	struct device_node *np;
+	struct stmmac_axi *axi;
+
+	np = of_parse_phandle(pdev->dev.of_node, "snps,axi-config", 0);
+	if (!np)
+		return NULL;
+
+	axi = kzalloc(sizeof(*axi), GFP_KERNEL);
+	if (!axi)
+		return ERR_PTR(-ENOMEM);
+
+	axi->axi_lpi_en = of_property_read_bool(np, "snps,lpi_en");
+	axi->axi_xit_frm = of_property_read_bool(np, "snps,xit_frm");
+	axi->axi_kbbe = of_property_read_bool(np, "snps,kbbe");
+	axi->axi_axi_all = of_property_read_bool(np, "snps,axi_all");
+	axi->axi_fb = of_property_read_bool(np, "snps,fb");
+	axi->axi_mb = of_property_read_bool(np, "snps,mb");
+	axi->axi_rb =  of_property_read_bool(np, "snps,rb");
+
+	of_property_read_u32(np, "snps,wr_osr_lmt", &axi->axi_wr_osr_lmt);
+	of_property_read_u32(np, "snps,rd_osr_lmt", &axi->axi_rd_osr_lmt);
+	of_property_read_u32_array(np, "snps,blen", axi->axi_blen, AXI_BLEN);
+
+	return axi;
+}
+
 /**
  * stmmac_probe_config_dt - parse device-tree driver parameters
  * @pdev: platform_device structure
@@ -216,13 +252,11 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		}
 		plat->dma_cfg = dma_cfg;
 		of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
+		dma_cfg->aal = of_property_read_bool(np, "snps,aal");
 		dma_cfg->fixed_burst =
 			of_property_read_bool(np, "snps,fixed-burst");
 		dma_cfg->mixed_burst =
 			of_property_read_bool(np, "snps,mixed-burst");
-		of_property_read_u32(np, "snps,burst_len", &dma_cfg->burst_len);
-		if (dma_cfg->burst_len < 0 || dma_cfg->burst_len > 256)
-			dma_cfg->burst_len = 0;
 	}
 	plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode");
 	if (plat->force_thresh_dma_mode) {
@@ -230,6 +264,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		pr_warn("force_sf_dma_mode is ignored if force_thresh_dma_mode is set.");
 	}
 
+	plat->axi = stmmac_axi_setup(pdev);
+
 	return plat;
 }
 #else
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index eead8ab93c0a..6e53fa8942a4 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -90,7 +90,21 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int fixed_burst;
 	int mixed_burst;
-	int burst_len;
+	bool aal;
+};
+
+#define AXI_BLEN	7
+struct stmmac_axi {
+	bool axi_lpi_en;
+	bool axi_xit_frm;
+	u32 axi_wr_osr_lmt;
+	u32 axi_rd_osr_lmt;
+	bool axi_kbbe;
+	bool axi_axi_all;
+	u32 axi_blen[AXI_BLEN];
+	bool axi_fb;
+	bool axi_mb;
+	bool axi_rb;
 };
 
 struct plat_stmmacenet_data {
@@ -122,5 +136,6 @@ struct plat_stmmacenet_data {
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
+	struct stmmac_axi *axi;
 };
 #endif
-- 
cgit v1.2.3


From 1f27cde313d72d6b44a73ba89c8b2c6a99c628cf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Mar 2016 08:21:43 -0800
Subject: net: sched: use pfifo_fast for non real queues

Some devices declare a high number of TX queues, then set a much
lower real_num_tx_queues.

This causes setups using fq_codel, sfq or fq as the default qdisc to consume
more memory than really needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 6 ++++++
 net/sched/sch_generic.c   | 1 +
 net/sched/sch_mq.c        | 2 +-
 net/sched/sch_mqprio.c    | 3 ++-
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e5bba897d206..46e55f0202a6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -345,6 +345,12 @@ extern struct Qdisc_ops pfifo_fast_ops;
 extern struct Qdisc_ops mq_qdisc_ops;
 extern struct Qdisc_ops noqueue_qdisc_ops;
 extern const struct Qdisc_ops *default_qdisc_ops;
+static inline const struct Qdisc_ops *
+get_default_qdisc_ops(const struct net_device *dev, int ntx)
+{
+	return ntx < dev->real_num_tx_queues ?
+			default_qdisc_ops : &pfifo_fast_ops;
+}
 
 struct Qdisc_class_common {
 	u32			classid;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 16bc83b2842a..f18c35024207 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -567,6 +567,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
+EXPORT_SYMBOL(pfifo_fast_ops);
 
 static struct lock_class_key qdisc_tx_busylock;
 
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 3e82f047caaf..56a77b878eb3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		dev_queue = netdev_get_tx_queue(dev, ntx);
-		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+		qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 02ffb3fbbc20..b8002ce3d010 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -125,7 +125,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
-		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+		qdisc = qdisc_create_dflt(dev_queue,
+					  get_default_qdisc_ops(dev, i),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
 		if (qdisc == NULL) {
-- 
cgit v1.2.3


From 0d12f8a4027d021c9cc942f09f38d28288020c5d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:53:46 +0000
Subject: rxrpc: Keep the skb private record of the Rx header in host byte
 order

Currently, a copy of the Rx packet header is copied into the sk_buff
private data so that we can advance the pointer into the buffer,
potentially discarding the original.  At the moment, this copy is held in
network byte order, but this means we're doing a lot of unnecessary
translations.

The reasons it was done this way are that we need the values in network
byte order occasionally and we can use the copy, slightly modified, as part
of an iov array when sending an ack or an abort packet.

However, it seems more reasonable on review that it would be better kept in
host byte order and that we make up a new header when we want to send
another packet.

To this end, rename the original header struct to rxrpc_wire_header (with
BE fields) and institute a variant called rxrpc_host_header that has host
order fields.  Change the struct in the sk_buff private data into an
rxrpc_host_header and translate the values when filling it in.

This further allows us to keep values held in various structures in host
byte order rather than network byte order and allows removal of some fields
that are byteswapped duplicates.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/rxrpc/packet.h    |   4 +-
 net/rxrpc/af_rxrpc.c      |  20 +++----
 net/rxrpc/ar-accept.c     |  40 ++++++++------
 net/rxrpc/ar-ack.c        | 108 +++++++++++++++++-------------------
 net/rxrpc/ar-call.c       |  68 +++++++++++------------
 net/rxrpc/ar-connection.c |  83 ++++++++++++++--------------
 net/rxrpc/ar-connevent.c  |  73 ++++++++++++------------
 net/rxrpc/ar-input.c      |  95 +++++++++++++++++--------------
 net/rxrpc/ar-internal.h   |  65 +++++++++++++---------
 net/rxrpc/ar-local.c      |  29 ++++++----
 net/rxrpc/ar-output.c     |  54 ++++++++++++------
 net/rxrpc/ar-peer.c       |   2 +-
 net/rxrpc/ar-proc.c       |  10 ++--
 net/rxrpc/ar-recvmsg.c    |  18 +++---
 net/rxrpc/ar-security.c   |   4 +-
 net/rxrpc/ar-skbuff.c     |   4 +-
 net/rxrpc/ar-transport.c  |   1 +
 net/rxrpc/rxkad.c         | 138 +++++++++++++++++++++++-----------------------
 18 files changed, 432 insertions(+), 384 deletions(-)

(limited to 'include')

diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
index 4dce116bfd80..de1e67988ada 100644
--- a/include/rxrpc/packet.h
+++ b/include/rxrpc/packet.h
@@ -22,7 +22,7 @@ typedef __be32	rxrpc_serial_net_t; /* on-the-wire Rx message serial number */
  * on-the-wire Rx packet header
  * - all multibyte fields should be in network byte order
  */
-struct rxrpc_header {
+struct rxrpc_wire_header {
 	__be32		epoch;		/* client boot timestamp */
 
 	__be32		cid;		/* connection and channel ID */
@@ -68,8 +68,6 @@ struct rxrpc_header {
 
 } __packed;
 
-#define __rxrpc_header_off(X) offsetof(struct rxrpc_header,X)
-
 extern const char *rxrpc_pkts[];
 
 /*****************************************************************************/
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 7e2d1057d8bc..7bb5cca0ae32 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -37,7 +37,7 @@ static struct proto rxrpc_proto;
 static const struct proto_ops rxrpc_rpc_ops;
 
 /* local epoch for detecting local-end reset */
-__be32 rxrpc_epoch;
+u32 rxrpc_epoch;
 
 /* current debugging ID */
 atomic_t rxrpc_debug_id;
@@ -125,7 +125,6 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 	struct sock *sk = sock->sk;
 	struct rxrpc_local *local;
 	struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
-	__be16 service_id;
 	int ret;
 
 	_enter("%p,%p,%d", rx, saddr, len);
@@ -152,14 +151,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 
 	rx->local = local;
 	if (srx->srx_service) {
-		service_id = htons(srx->srx_service);
 		write_lock_bh(&local->services_lock);
 		list_for_each_entry(prx, &local->services, listen_link) {
-			if (prx->service_id == service_id)
+			if (prx->srx.srx_service == srx->srx_service)
 				goto service_in_use;
 		}
 
-		rx->service_id = service_id;
 		list_add_tail(&rx->listen_link, &local->services);
 		write_unlock_bh(&local->services_lock);
 
@@ -276,7 +273,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	struct rxrpc_transport *trans;
 	struct rxrpc_call *call;
 	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
-	__be16 service_id;
 
 	_enter(",,%x,%lx", key_serial(key), user_call_ID);
 
@@ -299,16 +295,15 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 		atomic_inc(&trans->usage);
 	}
 
-	service_id = rx->service_id;
-	if (srx)
-		service_id = htons(srx->srx_service);
+	if (!srx)
+		srx = &rx->srx;
 
 	if (!key)
 		key = rx->key;
 	if (key && !key->payload.data[0])
 		key = NULL; /* a no-security key */
 
-	bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
+	bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp);
 	if (IS_ERR(bundle)) {
 		call = ERR_CAST(bundle);
 		goto out;
@@ -425,7 +420,6 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
 	}
 
 	rx->trans = trans;
-	rx->service_id = htons(srx->srx_service);
 	rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
 
 	release_sock(&rx->sk);
@@ -778,7 +772,7 @@ static struct proto rxrpc_proto = {
 	.name		= "RXRPC",
 	.owner		= THIS_MODULE,
 	.obj_size	= sizeof(struct rxrpc_sock),
-	.max_header	= sizeof(struct rxrpc_header),
+	.max_header	= sizeof(struct rxrpc_wire_header),
 };
 
 static const struct net_proto_family rxrpc_family_ops = {
@@ -796,7 +790,7 @@ static int __init af_rxrpc_init(void)
 
 	BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
 
-	rxrpc_epoch = htonl(get_seconds());
+	rxrpc_epoch = get_seconds();
 
 	ret = -ENOMEM;
 	rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 9a49f32e9e1e..73c905416271 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -27,7 +27,7 @@
  * generate a connection-level abort
  */
 static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
-		      struct rxrpc_header *hdr)
+		      struct rxrpc_wire_header *whdr)
 {
 	struct msghdr msg;
 	struct kvec iov[1];
@@ -36,25 +36,21 @@ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
 
 	_enter("%d,,", local->debug_id);
 
+	whdr->type	= RXRPC_PACKET_TYPE_BUSY;
+	whdr->serial	= htonl(1);
+
 	msg.msg_name	= &srx->transport.sin;
 	msg.msg_namelen	= sizeof(srx->transport.sin);
 	msg.msg_control	= NULL;
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->seq	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_BUSY;
-	hdr->flags	= 0;
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
-
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	iov[0].iov_base	= whdr;
+	iov[0].iov_len	= sizeof(*whdr);
 
 	len = iov[0].iov_len;
 
-	hdr->serial = htonl(1);
-	_proto("Tx BUSY %%%u", ntohl(hdr->serial));
+	_proto("Tx BUSY %%1");
 
 	ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
 	if (ret < 0) {
@@ -211,8 +207,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work)
 	struct rxrpc_skb_priv *sp;
 	struct sockaddr_rxrpc srx;
 	struct rxrpc_sock *rx;
+	struct rxrpc_wire_header whdr;
 	struct sk_buff *skb;
-	__be16 service_id;
 	int ret;
 
 	_enter("%d", local->debug_id);
@@ -240,6 +236,19 @@ process_next_packet:
 
 	sp = rxrpc_skb(skb);
 
+	/* Set up a response packet header in case we need it */
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= htonl(sp->hdr.seq);
+	whdr.serial	= 0;
+	whdr.flags	= 0;
+	whdr.type	= 0;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = sp->hdr.securityIndex;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
 	/* determine the remote address */
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -256,10 +265,9 @@ process_next_packet:
 	}
 
 	/* get the socket providing the service */
-	service_id = sp->hdr.serviceId;
 	read_lock_bh(&local->services_lock);
 	list_for_each_entry(rx, &local->services, listen_link) {
-		if (rx->service_id == service_id &&
+		if (rx->srx.srx_service == sp->hdr.serviceId &&
 		    rx->sk.sk_state != RXRPC_CLOSE)
 			goto found_service;
 	}
@@ -267,7 +275,7 @@ process_next_packet:
 	goto invalid_service;
 
 found_service:
-	_debug("found service %hd", ntohs(rx->service_id));
+	_debug("found service %hd", rx->srx.srx_service);
 	if (sk_acceptq_is_full(&rx->sk))
 		goto backlog_full;
 	sk_acceptq_added(&rx->sk);
@@ -296,7 +304,7 @@ found_service:
 backlog_full:
 	read_unlock_bh(&local->services_lock);
 busy:
-	rxrpc_busy(local, &srx, &sp->hdr);
+	rxrpc_busy(local, &srx, &whdr);
 	rxrpc_free_skb(skb);
 	goto process_next_packet;
 
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 9183da740600..20f3f001694e 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -91,7 +91,7 @@ static const s8 rxrpc_ack_priority[] = {
  * propose an ACK be sent
  */
 void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
-			 __be32 serial, bool immediate)
+			 u32 serial, bool immediate)
 {
 	unsigned long expiry;
 	s8 prior = rxrpc_ack_priority[ack_reason];
@@ -99,8 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 	ASSERTCMP(prior, >, 0);
 
 	_enter("{%d},%s,%%%x,%u",
-	       call->debug_id, rxrpc_acks(ack_reason), ntohl(serial),
-	       immediate);
+	       call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
 
 	if (prior < rxrpc_ack_priority[call->ackr_reason]) {
 		if (immediate)
@@ -139,7 +138,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		expiry = rxrpc_requested_ack_delay;
 		if (!expiry)
 			goto cancel_timer;
-		if (!immediate || serial == cpu_to_be32(1)) {
+		if (!immediate || serial == 1) {
 			_debug("run defer timer");
 			goto run_timer;
 		}
@@ -157,7 +156,7 @@ run_timer:
 	return;
 
 cancel_timer:
-	_debug("cancel timer %%%u", ntohl(serial));
+	_debug("cancel timer %%%u", serial);
 	try_to_del_timer_sync(&call->ack_timer);
 	read_lock_bh(&call->state_lock);
 	if (call->state <= RXRPC_CALL_COMPLETE &&
@@ -170,7 +169,7 @@ cancel_timer:
  * propose an ACK be sent, locking the call structure
  */
 void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
-		       __be32 serial, bool immediate)
+		       u32 serial, bool immediate)
 {
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
@@ -214,8 +213,8 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
  */
 static void rxrpc_resend(struct rxrpc_call *call)
 {
+	struct rxrpc_wire_header *whdr;
 	struct rxrpc_skb_priv *sp;
-	struct rxrpc_header *hdr;
 	struct sk_buff *txb;
 	unsigned long *p_txb, resend_at;
 	bool stop;
@@ -247,14 +246,13 @@ static void rxrpc_resend(struct rxrpc_call *call)
 			sp->need_resend = false;
 
 			/* each Tx packet has a new serial number */
-			sp->hdr.serial =
-				htonl(atomic_inc_return(&call->conn->serial));
+			sp->hdr.serial = atomic_inc_return(&call->conn->serial);
 
-			hdr = (struct rxrpc_header *) txb->head;
-			hdr->serial = sp->hdr.serial;
+			whdr = (struct rxrpc_wire_header *)txb->head;
+			whdr->serial = htonl(sp->hdr.serial);
 
 			_proto("Tx DATA %%%u { #%d }",
-			       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+			       sp->hdr.serial, sp->hdr.seq);
 			if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
 				stop = true;
 				sp->resend_at = jiffies + 3;
@@ -428,7 +426,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
 	int tail = call->acks_tail, old_tail;
 	int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
 
-	_enter("{%u,%u},%u", call->acks_hard, win, hard);
+	kenter("{%u,%u},%u", call->acks_hard, win, hard);
 
 	ASSERTCMP(hard - call->acks_hard, <=, win);
 
@@ -478,11 +476,11 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
 		sp = rxrpc_skb(skb);
 
 		_debug("drain OOS packet %d [%d]",
-		       ntohl(sp->hdr.seq), call->rx_first_oos);
+		       sp->hdr.seq, call->rx_first_oos);
 
-		if (ntohl(sp->hdr.seq) != call->rx_first_oos) {
+		if (sp->hdr.seq != call->rx_first_oos) {
 			skb_queue_head(&call->rx_oos_queue, skb);
-			call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq);
+			call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
 			_debug("requeue %p {%u}", skb, call->rx_first_oos);
 		} else {
 			skb->mark = RXRPC_SKB_MARK_DATA;
@@ -496,8 +494,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
 			/* find out what the next packet is */
 			skb = skb_peek(&call->rx_oos_queue);
 			if (skb)
-				call->rx_first_oos =
-					ntohl(rxrpc_skb(skb)->hdr.seq);
+				call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
 			else
 				call->rx_first_oos = 0;
 			_debug("peek %p {%u}", skb, call->rx_first_oos);
@@ -522,7 +519,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
 	u32 seq;
 
 	sp = rxrpc_skb(skb);
-	seq = ntohl(sp->hdr.seq);
+	seq = sp->hdr.seq;
 	_enter(",,{%u}", seq);
 
 	skb->destructor = rxrpc_packet_destructor;
@@ -535,9 +532,8 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
 
 	skb_queue_walk(&call->rx_oos_queue, p) {
 		psp = rxrpc_skb(p);
-		if (ntohl(psp->hdr.seq) > seq) {
-			_debug("insert oos #%u before #%u",
-			       seq, ntohl(psp->hdr.seq));
+		if (psp->hdr.seq > seq) {
+			_debug("insert oos #%u before #%u", seq, psp->hdr.seq);
 			skb_insert(p, skb, &call->rx_oos_queue);
 			goto inserted;
 		}
@@ -586,7 +582,7 @@ static void rxrpc_zap_tx_window(struct rxrpc_call *call)
 
 		skb = (struct sk_buff *) _skb;
 		sp = rxrpc_skb(skb);
-		_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+		_debug("+++ clear Tx %u", sp->hdr.seq);
 		rxrpc_free_skb(skb);
 	}
 
@@ -657,8 +653,7 @@ process_further:
 		/* data packets that wind up here have been received out of
 		 * order, need security processing or are jumbo packets */
 	case RXRPC_PACKET_TYPE_DATA:
-		_proto("OOSQ DATA %%%u { #%u }",
-		       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+		_proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
 		/* secured packets must be verified and possibly decrypted */
 		if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
@@ -676,7 +671,7 @@ process_further:
 		if (!skb_pull(skb, sizeof(ack)))
 			BUG();
 
-		latest = ntohl(sp->hdr.serial);
+		latest = sp->hdr.serial;
 		hard = ntohl(ack.firstPacket);
 		tx = atomic_read(&call->sequence);
 
@@ -881,9 +876,9 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
+	struct rxrpc_wire_header whdr;
 	struct rxrpc_ackpacket ack;
 	struct rxrpc_ackinfo ackinfo;
-	struct rxrpc_header hdr;
 	struct msghdr msg;
 	struct kvec iov[5];
 	enum rxrpc_call_event genbit;
@@ -891,7 +886,7 @@ void rxrpc_process_call(struct work_struct *work)
 	__be32 data, pad;
 	size_t len;
 	int loop, nbit, ioc, ret, mtu;
-	u32 abort_code = RX_PROTOCOL_ERROR;
+	u32 serial, abort_code = RX_PROTOCOL_ERROR;
 	u8 *acks = NULL;
 
 	//printk("\n--------------------\n");
@@ -912,20 +907,20 @@ void rxrpc_process_call(struct work_struct *work)
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= call->conn->epoch;
-	hdr.cid		= call->cid;
-	hdr.callNumber	= call->call_id;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_ACK;
-	hdr.flags	= call->conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = call->conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= call->conn->service_id;
+	whdr.epoch	= htonl(call->conn->epoch);
+	whdr.cid	= htonl(call->cid);
+	whdr.callNumber	= htonl(call->call_id);
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_ACK;
+	whdr.flags	= call->conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = call->conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(call->service_id);
 
 	memset(iov, 0, sizeof(iov));
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 
 	/* deal with events of a final nature */
 	if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
@@ -966,7 +961,7 @@ void rxrpc_process_call(struct work_struct *work)
 	}
 
 	if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
-		hdr.type = RXRPC_PACKET_TYPE_BUSY;
+		whdr.type = RXRPC_PACKET_TYPE_BUSY;
 		genbit = RXRPC_CALL_EV_REJECT_BUSY;
 		goto send_message;
 	}
@@ -977,7 +972,7 @@ void rxrpc_process_call(struct work_struct *work)
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
 				       ECONNABORTED, true) < 0)
 			goto no_mem;
-		hdr.type = RXRPC_PACKET_TYPE_ABORT;
+		whdr.type = RXRPC_PACKET_TYPE_ABORT;
 		data = htonl(call->abort_code);
 		iov[1].iov_base = &data;
 		iov[1].iov_len = sizeof(data);
@@ -996,9 +991,9 @@ void rxrpc_process_call(struct work_struct *work)
 		call->ackr_reason = 0;
 
 		spin_lock_bh(&call->lock);
-		ack.serial = call->ackr_serial;
-		ack.previousPacket = call->ackr_prev_seq;
-		ack.firstPacket = htonl(call->rx_data_eaten + 1);
+		ack.serial	= htonl(call->ackr_serial);
+		ack.previousPacket = htonl(call->ackr_prev_seq);
+		ack.firstPacket	= htonl(call->rx_data_eaten + 1);
 		spin_unlock_bh(&call->lock);
 
 		pad = 0;
@@ -1100,13 +1095,11 @@ void rxrpc_process_call(struct work_struct *work)
 		//hdr.flags	= RXRPC_SLOW_START_OK;
 		ack.bufferSpace	= htons(8);
 		ack.maxSkew	= 0;
-		ack.serial	= 0;
-		ack.reason	= 0;
 
 		spin_lock_bh(&call->lock);
-		ack.reason = call->ackr_reason;
-		ack.serial = call->ackr_serial;
-		ack.previousPacket = call->ackr_prev_seq;
+		ack.reason	= call->ackr_reason;
+		ack.serial	= htonl(call->ackr_serial);
+		ack.previousPacket = htonl(call->ackr_prev_seq);
 		ack.firstPacket = htonl(call->rx_data_eaten + 1);
 
 		ack.nAcks = 0;
@@ -1225,9 +1218,10 @@ send_ACK:
 	ackinfo.rxMTU	= htonl(rxrpc_rx_mtu);
 	ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
 
-	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+	serial = atomic_inc_return(&call->conn->serial);
+	whdr.serial = htonl(serial);
 	_proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
-	       ntohl(hdr.serial),
+	       serial,
 	       ntohs(ack.maxSkew),
 	       ntohl(ack.firstPacket),
 	       ntohl(ack.previousPacket),
@@ -1243,8 +1237,9 @@ send_ACK:
 send_message:
 	_debug("send message");
 
-	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
-	_proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial));
+	serial = atomic_inc_return(&call->conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
 send_message_2:
 
 	len = iov[0].iov_len;
@@ -1327,8 +1322,7 @@ maybe_reschedule:
 	if (call->state >= RXRPC_CALL_COMPLETE &&
 	    !list_empty(&call->accept_link)) {
 		_debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
-		       call, call->events, call->flags,
-		       ntohl(call->conn->cid));
+		       call, call->events, call->flags, call->conn->cid);
 
 		read_lock_bh(&call->state_lock);
 		if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
@@ -1346,7 +1340,7 @@ error:
 	 * this means there's a race between clearing the flag and setting the
 	 * work pending bit and the work item being processed again */
 	if (call->events && !work_pending(&call->processor)) {
-		_debug("jumpstart %x", ntohl(call->conn->cid));
+		_debug("jumpstart %x", call->conn->cid);
 		rxrpc_queue_call(call);
 	}
 
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index 3468a0705ab7..4a499e0100f1 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -64,11 +64,11 @@ static DEFINE_HASHTABLE(rxrpc_call_hash, 10);
  * Hash function for rxrpc_call_hash
  */
 static unsigned long rxrpc_call_hashfunc(
-	u8		clientflag,
-	__be32		cid,
-	__be32		call_id,
-	__be32		epoch,
-	__be16		service_id,
+	u8		in_clientflag,
+	u32		cid,
+	u32		call_id,
+	u32		epoch,
+	u16		service_id,
 	sa_family_t	proto,
 	void		*localptr,
 	unsigned int	addr_size,
@@ -77,7 +77,6 @@ static unsigned long rxrpc_call_hashfunc(
 	const u16 *p;
 	unsigned int i;
 	unsigned long key;
-	u32 hcid = ntohl(cid);
 
 	_enter("");
 
@@ -85,12 +84,12 @@ static unsigned long rxrpc_call_hashfunc(
 	/* We just want to add up the __be32 values, so forcing the
 	 * cast should be okay.
 	 */
-	key += (__force u32)epoch;
-	key += (__force u16)service_id;
-	key += (__force u32)call_id;
-	key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
-	key += hcid & RXRPC_CHANNELMASK;
-	key += clientflag;
+	key += epoch;
+	key += service_id;
+	key += call_id;
+	key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
+	key += cid & RXRPC_CHANNELMASK;
+	key += in_clientflag;
 	key += proto;
 	/* Step through the peer address in 16-bit portions for speed */
 	for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++)
@@ -148,19 +147,16 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call)
  * isn't there.
  */
 struct rxrpc_call *rxrpc_find_call_hash(
-	u8		clientflag,
-	__be32		cid,
-	__be32		call_id,
-	__be32		epoch,
-	__be16		service_id,
+	struct rxrpc_host_header *hdr,
 	void		*localptr,
 	sa_family_t	proto,
-	const u8	*peer_addr)
+	const void	*peer_addr)
 {
 	unsigned long key;
 	unsigned int addr_size = 0;
 	struct rxrpc_call *call = NULL;
 	struct rxrpc_call *ret = NULL;
+	u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED;
 
 	_enter("");
 	switch (proto) {
@@ -174,20 +170,21 @@ struct rxrpc_call *rxrpc_find_call_hash(
 		break;
 	}
 
-	key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch,
-				  service_id, proto, localptr, addr_size,
+	key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber,
+				  hdr->epoch, hdr->serviceId,
+				  proto, localptr, addr_size,
 				  peer_addr);
 	hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) {
 		if (call->hash_key == key &&
-		    call->call_id == call_id &&
-		    call->cid == cid &&
-		    call->in_clientflag == clientflag &&
-		    call->service_id == service_id &&
+		    call->call_id == hdr->callNumber &&
+		    call->cid == hdr->cid &&
+		    call->in_clientflag == in_clientflag &&
+		    call->service_id == hdr->serviceId &&
 		    call->proto == proto &&
 		    call->local == localptr &&
 		    memcmp(call->peer_ip.ipv6_addr, peer_addr,
-			      addr_size) == 0 &&
-		    call->epoch == epoch) {
+			   addr_size) == 0 &&
+		    call->epoch == hdr->epoch) {
 			ret = call;
 			break;
 		}
@@ -414,12 +411,12 @@ found_extant_second:
  */
 struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 				       struct rxrpc_connection *conn,
-				       struct rxrpc_header *hdr,
+				       struct rxrpc_host_header *hdr,
 				       gfp_t gfp)
 {
 	struct rxrpc_call *call, *candidate;
 	struct rb_node **p, *parent;
-	__be32 call_id;
+	u32 call_id;
 
 	_enter(",%d,,%x", conn->debug_id, gfp);
 
@@ -433,7 +430,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 	candidate->conn = conn;
 	candidate->cid = hdr->cid;
 	candidate->call_id = hdr->callNumber;
-	candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK;
+	candidate->channel = hdr->cid & RXRPC_CHANNELMASK;
 	candidate->rx_data_post = 0;
 	candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
 	if (conn->security_ix > 0)
@@ -492,9 +489,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 		/* The tree is sorted in order of the __be32 value without
 		 * turning it into host order.
 		 */
-		if ((__force u32)call_id < (__force u32)call->call_id)
+		if (call_id < call->call_id)
 			p = &(*p)->rb_left;
-		else if ((__force u32)call_id > (__force u32)call->call_id)
+		else if (call_id > call->call_id)
 			p = &(*p)->rb_right;
 		else
 			goto old_call;
@@ -714,8 +711,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
 
 			_debug("- zap %s %%%u #%u",
 			       rxrpc_pkts[sp->hdr.type],
-			       ntohl(sp->hdr.serial),
-			       ntohl(sp->hdr.seq));
+			       sp->hdr.serial, sp->hdr.seq);
 			rxrpc_free_skb(skb);
 			spin_lock_bh(&call->lock);
 		}
@@ -873,9 +869,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call)
 			unsigned long _skb;
 
 			_skb = call->acks_window[call->acks_tail] & ~1;
-			sp = rxrpc_skb((struct sk_buff *) _skb);
-			_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
-			rxrpc_free_skb((struct sk_buff *) _skb);
+			sp = rxrpc_skb((struct sk_buff *)_skb);
+			_debug("+++ clear Tx %u", sp->hdr.seq);
+			rxrpc_free_skb((struct sk_buff *)_skb);
 			call->acks_tail =
 				(call->acks_tail + 1) & (call->acks_winsz - 1);
 		}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 6c71ed1caf16..53df14cb8d25 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -57,10 +57,10 @@ static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
  */
 static inline
 int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
-		     struct key *key, __be16 service_id)
+		     struct key *key, u16 service_id)
 {
 	return (bundle->service_id - service_id) ?:
-		((unsigned long) bundle->key - (unsigned long) key);
+		((unsigned long)bundle->key - (unsigned long)key);
 }
 
 /*
@@ -69,14 +69,14 @@ int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
 struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
 					   struct rxrpc_transport *trans,
 					   struct key *key,
-					   __be16 service_id,
+					   u16 service_id,
 					   gfp_t gfp)
 {
 	struct rxrpc_conn_bundle *bundle, *candidate;
 	struct rb_node *p, *parent, **pp;
 
 	_enter("%p{%x},%x,%hx,",
-	       rx, key_serial(key), trans->debug_id, ntohs(service_id));
+	       rx, key_serial(key), trans->debug_id, service_id);
 
 	if (rx->trans == trans && rx->bundle) {
 		atomic_inc(&rx->bundle->usage);
@@ -213,7 +213,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
 		conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
 		conn->avail_calls = RXRPC_MAXCALLS;
 		conn->size_align = 4;
-		conn->header_size = sizeof(struct rxrpc_header);
+		conn->header_size = sizeof(struct rxrpc_wire_header);
 	}
 
 	_leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
@@ -230,7 +230,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
 	struct rxrpc_connection *xconn;
 	struct rb_node *parent, **p;
 	__be32 epoch;
-	u32 real_conn_id;
+	u32 cid;
 
 	_enter("");
 
@@ -241,7 +241,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
 	conn->trans->conn_idcounter += RXRPC_CID_INC;
 	if (conn->trans->conn_idcounter < RXRPC_CID_INC)
 		conn->trans->conn_idcounter = RXRPC_CID_INC;
-	real_conn_id = conn->trans->conn_idcounter;
+	cid = conn->trans->conn_idcounter;
 
 attempt_insertion:
 	parent = NULL;
@@ -255,9 +255,9 @@ attempt_insertion:
 			p = &(*p)->rb_left;
 		else if (epoch > xconn->epoch)
 			p = &(*p)->rb_right;
-		else if (real_conn_id < xconn->real_conn_id)
+		else if (cid < xconn->cid)
 			p = &(*p)->rb_left;
-		else if (real_conn_id > xconn->real_conn_id)
+		else if (cid > xconn->cid)
 			p = &(*p)->rb_right;
 		else
 			goto id_exists;
@@ -268,20 +268,19 @@ attempt_insertion:
 	rb_link_node(&conn->node, parent, p);
 	rb_insert_color(&conn->node, &conn->trans->client_conns);
 
-	conn->real_conn_id = real_conn_id;
-	conn->cid = htonl(real_conn_id);
+	conn->cid = cid;
 	write_unlock_bh(&conn->trans->conn_lock);
-	_leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+	_leave(" [CID %x]", cid);
 	return;
 
 	/* we found a connection with the proposed ID - walk the tree from that
 	 * point looking for the next unused ID */
 id_exists:
 	for (;;) {
-		real_conn_id += RXRPC_CID_INC;
-		if (real_conn_id < RXRPC_CID_INC) {
-			real_conn_id = RXRPC_CID_INC;
-			conn->trans->conn_idcounter = real_conn_id;
+		cid += RXRPC_CID_INC;
+		if (cid < RXRPC_CID_INC) {
+			cid = RXRPC_CID_INC;
+			conn->trans->conn_idcounter = cid;
 			goto attempt_insertion;
 		}
 
@@ -291,7 +290,7 @@ id_exists:
 
 		xconn = rb_entry(parent, struct rxrpc_connection, node);
 		if (epoch < xconn->epoch ||
-		    real_conn_id < xconn->real_conn_id)
+		    cid < xconn->cid)
 			goto attempt_insertion;
 	}
 }
@@ -334,7 +333,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
  */
 static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
 				   struct rxrpc_transport *trans,
-				   __be16 service_id,
+				   u16 service_id,
 				   struct rxrpc_call *call,
 				   gfp_t gfp)
 {
@@ -404,11 +403,11 @@ found_channel:
 	conn->channels[chan] = call;
 	call->conn = conn;
 	call->channel = chan;
-	call->cid = conn->cid | htonl(chan);
-	call->call_id = htonl(++conn->call_counter);
+	call->cid = conn->cid | chan;
+	call->call_id = ++conn->call_counter;
 
 	_net("CONNECT client on conn %d chan %d as call %x",
-	     conn->debug_id, chan, ntohl(call->call_id));
+	     conn->debug_id, chan, call->call_id);
 
 	spin_unlock(&trans->client_lock);
 
@@ -593,11 +592,11 @@ found_channel:
 	conn->channels[chan] = call;
 	call->conn = conn;
 	call->channel = chan;
-	call->cid = conn->cid | htonl(chan);
-	call->call_id = htonl(++conn->call_counter);
+	call->cid = conn->cid | chan;
+	call->call_id = ++conn->call_counter;
 
 	_net("CONNECT client on conn %d chan %d as call %x",
-	     conn->debug_id, chan, ntohl(call->call_id));
+	     conn->debug_id, chan, call->call_id);
 
 	ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
 	spin_unlock(&trans->client_lock);
@@ -620,21 +619,21 @@ interrupted:
  */
 struct rxrpc_connection *
 rxrpc_incoming_connection(struct rxrpc_transport *trans,
-			  struct rxrpc_header *hdr,
+			  struct rxrpc_host_header *hdr,
 			  gfp_t gfp)
 {
 	struct rxrpc_connection *conn, *candidate = NULL;
 	struct rb_node *p, **pp;
 	const char *new = "old";
 	__be32 epoch;
-	u32 conn_id;
+	u32 cid;
 
 	_enter("");
 
 	ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
 
 	epoch = hdr->epoch;
-	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+	cid = hdr->cid & RXRPC_CIDMASK;
 
 	/* search the connection list first */
 	read_lock_bh(&trans->conn_lock);
@@ -643,15 +642,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 	while (p) {
 		conn = rb_entry(p, struct rxrpc_connection, node);
 
-		_debug("maybe %x", conn->real_conn_id);
+		_debug("maybe %x", conn->cid);
 
 		if (epoch < conn->epoch)
 			p = p->rb_left;
 		else if (epoch > conn->epoch)
 			p = p->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			p = p->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			p = p->rb_right;
 		else
 			goto found_extant_connection;
@@ -668,12 +667,11 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 
 	candidate->trans = trans;
 	candidate->epoch = hdr->epoch;
-	candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK);
+	candidate->cid = hdr->cid & RXRPC_CIDMASK;
 	candidate->service_id = hdr->serviceId;
 	candidate->security_ix = hdr->securityIndex;
 	candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
 	candidate->out_clientflag = 0;
-	candidate->real_conn_id = conn_id;
 	candidate->state = RXRPC_CONN_SERVER;
 	if (candidate->service_id)
 		candidate->state = RXRPC_CONN_SERVER_UNSECURED;
@@ -690,9 +688,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 			pp = &(*pp)->rb_left;
 		else if (epoch > conn->epoch)
 			pp = &(*pp)->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			pp = &(*pp)->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			pp = &(*pp)->rb_right;
 		else
 			goto found_extant_second;
@@ -714,7 +712,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 	new = "new";
 
 success:
-	_net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id);
+	_net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid);
 
 	_leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
 	return conn;
@@ -751,18 +749,17 @@ security_mismatch:
  * packet
  */
 struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
-					       struct rxrpc_header *hdr)
+					       struct rxrpc_host_header *hdr)
 {
 	struct rxrpc_connection *conn;
 	struct rb_node *p;
-	__be32 epoch;
-	u32 conn_id;
+	u32 epoch, cid;
 
-	_enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags);
+	_enter(",{%x,%x}", hdr->cid, hdr->flags);
 
 	read_lock_bh(&trans->conn_lock);
 
-	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+	cid = hdr->cid & RXRPC_CIDMASK;
 	epoch = hdr->epoch;
 
 	if (hdr->flags & RXRPC_CLIENT_INITIATED)
@@ -773,15 +770,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
 	while (p) {
 		conn = rb_entry(p, struct rxrpc_connection, node);
 
-		_debug("maybe %x", conn->real_conn_id);
+		_debug("maybe %x", conn->cid);
 
 		if (epoch < conn->epoch)
 			p = p->rb_left;
 		else if (epoch > conn->epoch)
 			p = p->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			p = p->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			p = p->rb_right;
 		else
 			goto found;
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
index 45e81b7e96ae..1bdaaed8cdc4 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/ar-connevent.c
@@ -60,11 +60,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
 static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 				  u32 error, u32 abort_code)
 {
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[2];
 	__be32 word;
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("%d,,%u,%u", conn->debug_id, error, abort_code);
@@ -89,28 +90,29 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= conn->epoch;
-	hdr.cid		= conn->cid;
-	hdr.callNumber	= 0;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_ABORT;
-	hdr.flags	= conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= conn->service_id;
+	whdr.epoch	= htonl(conn->epoch);
+	whdr.cid	= htonl(conn->cid);
+	whdr.callNumber	= 0;
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_ABORT;
+	whdr.flags	= conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(conn->service_id);
 
 	word = htonl(abort_code);
 
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= &word;
 	iov[1].iov_len	= sizeof(word);
 
 	len = iov[0].iov_len + iov[1].iov_len;
 
-	hdr.serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code);
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx CONN ABORT %%%u { %d }", serial, abort_code);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
 	if (ret < 0) {
@@ -146,8 +148,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 			       u32 *_abort_code)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	__be32 tmp;
-	u32 serial;
+	__be32 wtmp;
+	u32 abort_code;
 	int loop, ret;
 
 	if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
@@ -155,19 +157,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 		return -ECONNABORTED;
 	}
 
-	serial = ntohl(sp->hdr.serial);
-
-	_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial);
+	_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
 
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_ABORT:
-		if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0)
+		if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
 			return -EPROTO;
-		_proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp));
+		abort_code = ntohl(wtmp);
+		_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
 
 		conn->state = RXRPC_CONN_REMOTELY_ABORTED;
 		rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
-				  ntohl(tmp));
+				  abort_code);
 		return -ECONNABORTED;
 
 	case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -335,7 +336,7 @@ void rxrpc_reject_packets(struct work_struct *work)
 		struct sockaddr_in sin;
 	} sa;
 	struct rxrpc_skb_priv *sp;
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct rxrpc_local *local;
 	struct sk_buff *skb;
 	struct msghdr msg;
@@ -348,11 +349,11 @@ void rxrpc_reject_packets(struct work_struct *work)
 
 	_enter("%d", local->debug_id);
 
-	iov[0].iov_base = &hdr;
-	iov[0].iov_len = sizeof(hdr);
+	iov[0].iov_base = &whdr;
+	iov[0].iov_len = sizeof(whdr);
 	iov[1].iov_base = &code;
 	iov[1].iov_len = sizeof(code);
-	size = sizeof(hdr) + sizeof(code);
+	size = sizeof(whdr) + sizeof(code);
 
 	msg.msg_name = &sa;
 	msg.msg_control = NULL;
@@ -370,8 +371,8 @@ void rxrpc_reject_packets(struct work_struct *work)
 		break;
 	}
 
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.type = RXRPC_PACKET_TYPE_ABORT;
+	memset(&whdr, 0, sizeof(whdr));
+	whdr.type = RXRPC_PACKET_TYPE_ABORT;
 
 	while ((skb = skb_dequeue(&local->reject_queue))) {
 		sp = rxrpc_skb(skb);
@@ -381,13 +382,13 @@ void rxrpc_reject_packets(struct work_struct *work)
 			sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
 			code = htonl(skb->priority);
 
-			hdr.epoch = sp->hdr.epoch;
-			hdr.cid = sp->hdr.cid;
-			hdr.callNumber = sp->hdr.callNumber;
-			hdr.serviceId = sp->hdr.serviceId;
-			hdr.flags = sp->hdr.flags;
-			hdr.flags ^= RXRPC_CLIENT_INITIATED;
-			hdr.flags &= RXRPC_CLIENT_INITIATED;
+			whdr.epoch	= htonl(sp->hdr.epoch);
+			whdr.cid	= htonl(sp->hdr.cid);
+			whdr.callNumber	= htonl(sp->hdr.callNumber);
+			whdr.serviceId	= htons(sp->hdr.serviceId);
+			whdr.flags	= sp->hdr.flags;
+			whdr.flags	^= RXRPC_CLIENT_INITIATED;
+			whdr.flags	&= RXRPC_CLIENT_INITIATED;
 
 			kernel_sendmsg(local->socket, &msg, iov, 2, size);
 			break;
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 9185535af5f5..e6396a8c969f 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -310,8 +310,8 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
 void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	__be32 _abort_code;
-	u32 serial, hi_serial, seq, abort_code;
+	__be32 wtmp;
+	u32 hi_serial, abort_code;
 
 	_enter("%p,%p", call, skb);
 
@@ -330,16 +330,15 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* track the latest serial number on this connection for ACK packet
 	 * information */
-	serial = ntohl(sp->hdr.serial);
 	hi_serial = atomic_read(&call->conn->hi_serial);
-	while (serial > hi_serial)
+	while (sp->hdr.serial > hi_serial)
 		hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
-					   serial);
+					   sp->hdr.serial);
 
 	/* request ACK generation for any ACK or DATA packet that requests
 	 * it */
 	if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
-		_proto("ACK Requested on %%%u", serial);
+		_proto("ACK Requested on %%%u", sp->hdr.serial);
 		rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
 	}
 
@@ -347,12 +346,11 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 	case RXRPC_PACKET_TYPE_ABORT:
 		_debug("abort");
 
-		if (skb_copy_bits(skb, 0, &_abort_code,
-				  sizeof(_abort_code)) < 0)
+		if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
 			goto protocol_error;
 
-		abort_code = ntohl(_abort_code);
-		_proto("Rx ABORT %%%u { %x }", serial, abort_code);
+		abort_code = ntohl(wtmp);
+		_proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
 
 		write_lock_bh(&call->state_lock);
 		if (call->state < RXRPC_CALL_COMPLETE) {
@@ -364,7 +362,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		goto free_packet_unlock;
 
 	case RXRPC_PACKET_TYPE_BUSY:
-		_proto("Rx BUSY %%%u", serial);
+		_proto("Rx BUSY %%%u", sp->hdr.serial);
 
 		if (call->conn->out_clientflag)
 			goto protocol_error;
@@ -382,15 +380,13 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		}
 
 	default:
-		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial);
+		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
 		goto protocol_error;
 
 	case RXRPC_PACKET_TYPE_DATA:
-		seq = ntohl(sp->hdr.seq);
+		_proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
-		_proto("Rx DATA %%%u { #%u }", serial, seq);
-
-		if (seq == 0)
+		if (sp->hdr.seq == 0)
 			goto protocol_error;
 
 		call->ackr_prev_seq = sp->hdr.seq;
@@ -398,9 +394,9 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		/* received data implicitly ACKs all of the request packets we
 		 * sent when we're acting as a client */
 		if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
-			rxrpc_assume_implicit_ackall(call, serial);
+			rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
 
-		switch (rxrpc_fast_process_data(call, skb, seq)) {
+		switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
 		case 0:
 			skb = NULL;
 			goto done;
@@ -481,12 +477,12 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
 		if (!pskb_pull(jumbo, sizeof(jhdr)))
 			BUG();
 
-		sp->hdr.seq	= htonl(ntohl(sp->hdr.seq) + 1);
-		sp->hdr.serial	= htonl(ntohl(sp->hdr.serial) + 1);
+		sp->hdr.seq	+= 1;
+		sp->hdr.serial	+= 1;
 		sp->hdr.flags	= jhdr.flags;
 		sp->hdr._rsvd	= jhdr._rsvd;
 
-		_proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1);
+		_proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
 
 		rxrpc_fast_process_packet(call, part);
 		part = NULL;
@@ -607,6 +603,35 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
 	rxrpc_queue_work(&local->event_processor);
 }
 
+/*
+ * Extract the wire header from a packet and translate the byte order.
+ */
+static noinline
+int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
+{
+	struct rxrpc_wire_header whdr;
+
+	/* dig out the RxRPC connection details */
+	if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0)
+		return -EBADMSG;
+	if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr)))
+		BUG();
+
+	memset(sp, 0, sizeof(*sp));
+	sp->hdr.epoch		= ntohl(whdr.epoch);
+	sp->hdr.cid		= ntohl(whdr.cid);
+	sp->hdr.callNumber	= ntohl(whdr.callNumber);
+	sp->hdr.seq		= ntohl(whdr.seq);
+	sp->hdr.serial		= ntohl(whdr.serial);
+	sp->hdr.flags		= whdr.flags;
+	sp->hdr.type		= whdr.type;
+	sp->hdr.userStatus	= whdr.userStatus;
+	sp->hdr.securityIndex	= whdr.securityIndex;
+	sp->hdr._rsvd		= ntohs(whdr._rsvd);
+	sp->hdr.serviceId	= ntohs(whdr.serviceId);
+	return 0;
+}
+
 static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local,
 					       struct sk_buff *skb,
 					       struct rxrpc_skb_priv *sp)
@@ -686,27 +711,22 @@ void rxrpc_data_ready(struct sock *sk)
 
 	UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
 
-	/* the socket buffer we have is owned by UDP, with UDP's data all over
-	 * it, but we really want our own */
+	/* The socket buffer we have is owned by UDP, with UDP's data all over
+	 * it, but we really want our own data there.
+	 */
 	skb_orphan(skb);
 	sp = rxrpc_skb(skb);
-	memset(sp, 0, sizeof(*sp));
 
 	_net("Rx UDP packet from %08x:%04hu",
 	     ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
 
 	/* dig out the RxRPC connection details */
-	if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr,
-			  sizeof(sp->hdr)) < 0)
+	if (rxrpc_extract_header(sp, skb) < 0)
 		goto bad_message;
-	if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr)))
-		BUG();
 
 	_net("Rx RxRPC %s ep=%x call=%x:%x",
 	     sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
-	     ntohl(sp->hdr.epoch),
-	     ntohl(sp->hdr.cid),
-	     ntohl(sp->hdr.callNumber));
+	     sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
 
 	if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
 		_proto("Rx Bad Packet Type %u", sp->hdr.type);
@@ -737,14 +757,9 @@ void rxrpc_data_ready(struct sock *sk)
 		rxrpc_put_connection(conn);
 	} else {
 		struct rxrpc_call *call;
-		u8 in_clientflag = 0;
-
-		if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
-			in_clientflag = RXRPC_CLIENT_INITIATED;
-		call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid,
-					    sp->hdr.callNumber, sp->hdr.epoch,
-					    sp->hdr.serviceId, local, AF_INET,
-					    (u8 *)&ip_hdr(skb)->saddr);
+
+		call = rxrpc_find_call_hash(&sp->hdr, local,
+					    AF_INET, &ip_hdr(skb)->saddr);
 		if (call)
 			rxrpc_post_packet_to_call(call, skb);
 		else
@@ -759,7 +774,7 @@ cant_route_call:
 	_debug("can't route call");
 	if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
 	    sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
-		if (sp->hdr.seq == cpu_to_be32(1)) {
+		if (sp->hdr.seq == 1) {
 			_debug("first packet");
 			skb_queue_tail(&local->accept_queue, skb);
 			rxrpc_queue_work(&local->acceptor);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 3f2940626569..06bf5abd920d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -70,11 +70,30 @@ struct rxrpc_sock {
 #define RXRPC_SECURITY_MAX	RXRPC_SECURITY_ENCRYPT
 	struct sockaddr_rxrpc	srx;		/* local address */
 	sa_family_t		proto;		/* protocol created with */
-	__be16			service_id;	/* service ID of local/remote service */
 };
 
 #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
 
+/*
+ * CPU-byteorder normalised Rx packet header.
+ */
+struct rxrpc_host_header {
+	u32		epoch;		/* client boot timestamp */
+	u32		cid;		/* connection and channel ID */
+	u32		callNumber;	/* call ID (0 for connection-level packets) */
+	u32		seq;		/* sequence number of pkt in call stream */
+	u32		serial;		/* serial number of pkt sent to network */
+	u8		type;		/* packet type */
+	u8		flags;		/* packet flags */
+	u8		userStatus;	/* app-layer defined status */
+	u8		securityIndex;	/* security protocol ID */
+	union {
+		u16	_rsvd;		/* reserved */
+		u16	cksum;		/* kerberos security checksum */
+	};
+	u16		serviceId;	/* service ID */
+} __packed;
+
 /*
  * RxRPC socket buffer private variables
  * - max 48 bytes (struct sk_buff::cb)
@@ -89,7 +108,7 @@ struct rxrpc_skb_priv {
 		bool		need_resend;	/* T if needs resending */
 	};
 
-	struct rxrpc_header	hdr;		/* RxRPC packet header from this packet */
+	struct rxrpc_host_header hdr;		/* RxRPC packet header from this packet */
 };
 
 #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
@@ -230,7 +249,7 @@ struct rxrpc_conn_bundle {
 	atomic_t		usage;
 	int			debug_id;	/* debug ID for printks */
 	unsigned short		num_conns;	/* number of connections in this bundle */
-	__be16			service_id;	/* service ID */
+	u16			service_id;	/* Service ID for this bundle */
 	u8			security_ix;	/* security type */
 };
 
@@ -260,7 +279,6 @@ struct rxrpc_connection {
 	rwlock_t		lock;		/* access lock */
 	spinlock_t		state_lock;	/* state-change lock */
 	atomic_t		usage;
-	u32			real_conn_id;	/* connection ID (host-endian) */
 	enum {					/* current state of connection */
 		RXRPC_CONN_UNUSED,		/* - connection not yet attempted */
 		RXRPC_CONN_CLIENT,		/* - client connection */
@@ -282,11 +300,9 @@ struct rxrpc_connection {
 	u8			security_size;	/* security header size */
 	u32			security_level;	/* security level negotiated */
 	u32			security_nonce;	/* response re-use preventer */
-
-	/* the following are all in net order */
-	__be32			epoch;		/* epoch of this connection */
-	__be32			cid;		/* connection ID */
-	__be16			service_id;	/* service ID */
+	u32			epoch;		/* epoch of this connection */
+	u32			cid;		/* connection ID */
+	u16			service_id;	/* service ID for this connection */
 	u8			security_ix;	/* security type */
 	u8			in_clientflag;	/* RXRPC_CLIENT_INITIATED if we are server */
 	u8			out_clientflag;	/* RXRPC_CLIENT_INITIATED if we are client */
@@ -406,9 +422,9 @@ struct rxrpc_call {
 	rxrpc_seq_t		rx_data_eaten;	/* last data seq ID consumed by recvmsg */
 	rxrpc_seq_t		rx_first_oos;	/* first packet in rx_oos_queue (or 0) */
 	rxrpc_seq_t		ackr_win_top;	/* top of ACK window (rx_data_eaten is bottom) */
-	rxrpc_seq_net_t		ackr_prev_seq;	/* previous sequence number received */
+	rxrpc_seq_t		ackr_prev_seq;	/* previous sequence number received */
 	u8			ackr_reason;	/* reason to ACK */
-	__be32			ackr_serial;	/* serial of packet being ACK'd */
+	rxrpc_serial_t		ackr_serial;	/* serial of packet being ACK'd */
 	atomic_t		ackr_not_idle;	/* number of packets in Rx queue */
 
 	/* received packet records, 1 bit per record */
@@ -420,11 +436,10 @@ struct rxrpc_call {
 	u8			in_clientflag;	/* Copy of conn->in_clientflag for hashing */
 	struct rxrpc_local	*local;		/* Local endpoint. Used for hashing. */
 	sa_family_t		proto;		/* Frame protocol */
-	/* the following should all be in net order */
-	__be32			cid;		/* connection ID + channel index  */
-	__be32			call_id;	/* call ID on connection  */
-	__be32			epoch;		/* epoch of this connection */
-	__be16			service_id;	/* service ID */
+	u32			call_id;	/* call ID on connection  */
+	u32			cid;		/* connection ID plus channel index */
+	u32			epoch;		/* epoch of this connection */
+	u16			service_id;	/* service ID */
 	union {					/* Peer IP address for hashing */
 		__be32	ipv4_addr;
 		__u8	ipv6_addr[16];		/* Anticipates eventual IPv6 support */
@@ -449,7 +464,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
  * af_rxrpc.c
  */
 extern atomic_t rxrpc_n_skbs;
-extern __be32 rxrpc_epoch;
+extern u32 rxrpc_epoch;
 extern atomic_t rxrpc_debug_id;
 extern struct workqueue_struct *rxrpc_workqueue;
 
@@ -470,8 +485,8 @@ extern unsigned rxrpc_rx_window_size;
 extern unsigned rxrpc_rx_mtu;
 extern unsigned rxrpc_rx_jumbo_max;
 
-void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
 void rxrpc_process_call(struct work_struct *);
 
 /*
@@ -483,15 +498,15 @@ extern struct kmem_cache *rxrpc_call_jar;
 extern struct list_head rxrpc_calls;
 extern rwlock_t rxrpc_call_lock;
 
-struct rxrpc_call *rxrpc_find_call_hash(u8,  __be32, __be32, __be32,
-					__be16, void *, sa_family_t, const u8 *);
+struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *,
+					void *, sa_family_t, const void *);
 struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
 					 struct rxrpc_transport *,
 					 struct rxrpc_conn_bundle *,
 					 unsigned long, int, gfp_t);
 struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
 				       struct rxrpc_connection *,
-				       struct rxrpc_header *, gfp_t);
+				       struct rxrpc_host_header *, gfp_t);
 struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
 void rxrpc_release_call(struct rxrpc_call *);
 void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
@@ -507,16 +522,16 @@ extern rwlock_t rxrpc_connection_lock;
 
 struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
 					   struct rxrpc_transport *,
-					   struct key *, __be16, gfp_t);
+					   struct key *, u16, gfp_t);
 void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
 int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
 		       struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
 void rxrpc_put_connection(struct rxrpc_connection *);
 void __exit rxrpc_destroy_all_connections(void);
 struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
-					       struct rxrpc_header *);
+					       struct rxrpc_host_header *);
 extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *,
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *,
 			  gfp_t);
 
 /*
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
index 78483b4602bf..4e1e6db0050b 100644
--- a/net/rxrpc/ar-local.c
+++ b/net/rxrpc/ar-local.c
@@ -323,9 +323,11 @@ void __exit rxrpc_destroy_all_locals(void)
  * Reply to a version request
  */
 static void rxrpc_send_version_request(struct rxrpc_local *local,
-				       struct rxrpc_header *hdr,
+				       struct rxrpc_host_header *hdr,
 				       struct sk_buff *skb)
 {
+	struct rxrpc_wire_header whdr;
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct sockaddr_in sin;
 	struct msghdr msg;
 	struct kvec iov[2];
@@ -344,15 +346,20 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->seq	= 0;
-	hdr->serial	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_VERSION;
-	hdr->flags	= RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
-
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= 0;
+	whdr.serial	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_VERSION;
+	whdr.flags	= RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+	whdr.userStatus	= 0;
+	whdr.securityIndex = 0;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= (char *)rxrpc_version_string;
 	iov[1].iov_len	= sizeof(rxrpc_version_string);
 
@@ -383,7 +390,7 @@ static void rxrpc_process_local_events(struct work_struct *work)
 	while ((skb = skb_dequeue(&local->event_queue))) {
 		struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 
-		kdebug("{%d},{%u}", local->debug_id, sp->hdr.type);
+		_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
 
 		switch (sp->hdr.type) {
 		case RXRPC_PACKET_TYPE_VERSION:
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 9e1527a6d026..353f5c9141ea 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -136,7 +136,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
 	struct rxrpc_call *call;
 	unsigned long user_call_ID = 0;
 	struct key *key;
-	__be16 service_id;
+	u16 service_id;
 	u32 abort_code = 0;
 	int ret;
 
@@ -151,11 +151,11 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
 
 	bundle = NULL;
 	if (trans) {
-		service_id = rx->service_id;
+		service_id = rx->srx.srx_service;
 		if (msg->msg_name) {
 			DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx,
 					 msg->msg_name);
-			service_id = htons(srx->srx_service);
+			service_id = srx->srx_service;
 		}
 		key = rx->key;
 		if (key && !rx->key->payload.data[0])
@@ -348,7 +348,7 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
 
 	/* send the packet with the don't fragment bit set if we currently
 	 * think it's small enough */
-	if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) {
+	if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) {
 		down_read(&trans->local->defrag_sem);
 		/* send the packet by UDP
 		 * - returns -EMSGSIZE if UDP would have to fragment the packet
@@ -480,8 +480,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
 		write_unlock_bh(&call->state_lock);
 	}
 
-	_proto("Tx DATA %%%u { #%u }",
-	       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+	_proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
 	sp->need_resend = false;
 	sp->resend_at = jiffies + rxrpc_resend_timeout;
@@ -512,6 +511,29 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	_leave("");
 }
 
+/*
+ * Convert a host-endian header into a network-endian header.
+ */
+static void rxrpc_insert_header(struct sk_buff *skb)
+{
+	struct rxrpc_wire_header whdr;
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= htonl(sp->hdr.seq);
+	whdr.serial	= htonl(sp->hdr.serial);
+	whdr.type	= sp->hdr.type;
+	whdr.flags	= sp->hdr.flags;
+	whdr.userStatus	= sp->hdr.userStatus;
+	whdr.securityIndex = sp->hdr.securityIndex;
+	whdr._rsvd	= htons(sp->hdr._rsvd);
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
+	memcpy(skb->head, &whdr, sizeof(whdr));
+}
+
 /*
  * send data through a socket
  * - must be called in process context
@@ -650,17 +672,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 
 			seq = atomic_inc_return(&call->sequence);
 
-			sp->hdr.epoch = conn->epoch;
-			sp->hdr.cid = call->cid;
+			sp->hdr.epoch	= conn->epoch;
+			sp->hdr.cid	= call->cid;
 			sp->hdr.callNumber = call->call_id;
-			sp->hdr.seq = htonl(seq);
-			sp->hdr.serial =
-				htonl(atomic_inc_return(&conn->serial));
-			sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+			sp->hdr.seq	= seq;
+			sp->hdr.serial	= atomic_inc_return(&conn->serial);
+			sp->hdr.type	= RXRPC_PACKET_TYPE_DATA;
 			sp->hdr.userStatus = 0;
 			sp->hdr.securityIndex = conn->security_ix;
-			sp->hdr._rsvd = 0;
-			sp->hdr.serviceId = conn->service_id;
+			sp->hdr._rsvd	= 0;
+			sp->hdr.serviceId = call->service_id;
 
 			sp->hdr.flags = conn->out_clientflag;
 			if (msg_data_left(msg) == 0 && !more)
@@ -673,12 +694,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 
 			ret = rxrpc_secure_packet(
 				call, skb, skb->mark,
-				skb->head + sizeof(struct rxrpc_header));
+				skb->head + sizeof(struct rxrpc_wire_header));
 			if (ret < 0)
 				goto out;
 
-			memcpy(skb->head, &sp->hdr,
-			       sizeof(struct rxrpc_header));
+			rxrpc_insert_header(skb);
 			rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
 			skb = NULL;
 		}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index bebaa43484bc..dc089b1976aa 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -92,7 +92,7 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
 			BUG();
 		}
 
-		peer->hdrsize += sizeof(struct rxrpc_header);
+		peer->hdrsize += sizeof(struct rxrpc_wire_header);
 		peer->maxdata = peer->mtu - peer->hdrsize;
 	}
 
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
index 38047f713f2c..525b2ba5a8f4 100644
--- a/net/rxrpc/ar-proc.c
+++ b/net/rxrpc/ar-proc.c
@@ -74,9 +74,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 		   " %-8.8s %08x %lx\n",
 		   lbuff,
 		   rbuff,
-		   ntohs(call->conn->service_id),
-		   ntohl(call->conn->cid),
-		   ntohl(call->call_id),
+		   call->conn->service_id,
+		   call->cid,
+		   call->call_id,
 		   call->conn->in_clientflag ? "Svc" : "Clt",
 		   atomic_read(&call->usage),
 		   rxrpc_call_states[call->state],
@@ -157,8 +157,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
 		   " %s %08x %08x %08x\n",
 		   lbuff,
 		   rbuff,
-		   ntohs(conn->service_id),
-		   ntohl(conn->cid),
+		   conn->service_id,
+		   conn->cid,
 		   conn->call_counter,
 		   conn->in_clientflag ? "Svc" : "Clt",
 		   atomic_read(&conn->usage),
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 70f47033ff2f..64facba24a45 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -158,7 +158,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			goto receive_non_data_message;
 
 		_debug("recvmsg DATA #%u { %d, %d }",
-		       ntohl(sp->hdr.seq), skb->len, sp->offset);
+		       sp->hdr.seq, skb->len, sp->offset);
 
 		if (!continue_call) {
 			/* only set the control data once per recvmsg() */
@@ -169,11 +169,11 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
 		}
 
-		ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
-		ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
-		call->rx_data_recv = ntohl(sp->hdr.seq);
+		ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+		ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+		call->rx_data_recv = sp->hdr.seq;
 
-		ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+		ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
 
 		offset = sp->offset;
 		copy = skb->len - offset;
@@ -364,11 +364,11 @@ void rxrpc_kernel_data_delivered(struct sk_buff *skb)
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_call *call = sp->call;
 
-	ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
-	ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
-	call->rx_data_recv = ntohl(sp->hdr.seq);
+	ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+	ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+	call->rx_data_recv = sp->hdr.seq;
 
-	ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+	ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
 	rxrpc_free_skb(skb);
 }
 
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
index 8334474eb26c..e2f4c49a9246 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/ar-security.c
@@ -171,7 +171,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
 
 	_enter("");
 
-	sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix);
+	sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix);
 
 	sec = rxrpc_security_lookup(conn->security_ix);
 	if (!sec) {
@@ -182,7 +182,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
 	/* find the service */
 	read_lock_bh(&local->services_lock);
 	list_for_each_entry(rx, &local->services, listen_link) {
-		if (rx->service_id == conn->service_id)
+		if (rx->srx.srx_service == conn->service_id)
 			goto found_service;
 	}
 
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
index 81f3c0238b9a..ae9f93f94ed2 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/ar-skbuff.c
@@ -59,7 +59,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
 
 	spin_lock_bh(&call->lock);
 
-	_debug("hard ACK #%u", ntohl(sp->hdr.seq));
+	_debug("hard ACK #%u", sp->hdr.seq);
 
 	for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
 		call->ackr_window[loop] >>= 1;
@@ -67,7 +67,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
 			call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
 	}
 
-	seq = ntohl(sp->hdr.seq);
+	seq = sp->hdr.seq;
 	ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
 	call->rx_data_eaten = seq;
 
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
index 9946467f16b4..5f9b9d462f53 100644
--- a/net/rxrpc/ar-transport.c
+++ b/net/rxrpc/ar-transport.c
@@ -51,6 +51,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
 		spin_lock_init(&trans->client_lock);
 		rwlock_init(&trans->conn_lock);
 		atomic_set(&trans->usage, 1);
+		trans->conn_idcounter = peer->srx.srx_service << 16;
 		trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
 
 		if (peer->srx.transport.family == AF_INET) {
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index d7a9ab5a9d9c..160480221224 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -132,8 +132,8 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 	desc.info = iv.x;
 	desc.flags = 0;
 
-	tmpbuf.x[0] = conn->epoch;
-	tmpbuf.x[1] = conn->cid;
+	tmpbuf.x[0] = htonl(conn->epoch);
+	tmpbuf.x[1] = htonl(conn->cid);
 	tmpbuf.x[2] = 0;
 	tmpbuf.x[3] = htonl(conn->security_ix);
 
@@ -169,8 +169,8 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
 
 	_enter("");
 
-	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
-	data_size |= (u32) check << 16;
+	check = sp->hdr.seq ^ sp->hdr.callNumber;
+	data_size |= (u32)check << 16;
 
 	tmpbuf.hdr.data_size = htonl(data_size);
 	memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
@@ -215,9 +215,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
 
 	_enter("");
 
-	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check = sp->hdr.seq ^ sp->hdr.callNumber;
 
-	rxkhdr.data_size = htonl(data_size | (u32) check << 16);
+	rxkhdr.data_size = htonl(data_size | (u32)check << 16);
 	rxkhdr.checksum = 0;
 
 	/* encrypt from the session key */
@@ -262,14 +262,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	struct {
 		__be32 x[2];
 	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
-	__be32 x;
-	u32 y;
+	u32 x, y;
 	int ret;
 
 	sp = rxrpc_skb(skb);
 
 	_enter("{%d{%x}},{#%u},%zu,",
-	       call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq),
+	       call->debug_id, key_serial(call->conn->key), sp->hdr.seq,
 	       data_size);
 
 	if (!call->conn->cipher)
@@ -286,10 +285,10 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	desc.flags = 0;
 
 	/* calculate the security checksum */
-	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
-	x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
-	tmpbuf.x[0] = sp->hdr.callNumber;
-	tmpbuf.x[1] = x;
+	x = call->channel << (32 - RXRPC_CIDSHIFT);
+	x |= sp->hdr.seq & 0x3fffffff;
+	tmpbuf.x[0] = htonl(sp->hdr.callNumber);
+	tmpbuf.x[1] = htonl(x);
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
@@ -299,7 +298,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	y = (y >> 16) & 0xffff;
 	if (y == 0)
 		y = 1; /* zero checksums are not permitted */
-	sp->hdr.cksum = htons(y);
+	sp->hdr.cksum = y;
 
 	switch (call->conn->security_level) {
 	case RXRPC_SECURITY_PLAIN:
@@ -368,7 +367,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
 	data_size = buf & 0xffff;
 
 	check = buf >> 16;
-	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check ^= sp->hdr.seq ^ sp->hdr.callNumber;
 	check &= 0xffff;
 	if (check != 0) {
 		*_abort_code = RXKADSEALEDINCON;
@@ -453,7 +452,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
 	data_size = buf & 0xffff;
 
 	check = buf >> 16;
-	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check ^= sp->hdr.seq ^ sp->hdr.callNumber;
 	check &= 0xffff;
 	if (check != 0) {
 		*_abort_code = RXKADSEALEDINCON;
@@ -494,16 +493,14 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 	struct {
 		__be32 x[2];
 	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
-	__be32 x;
-	__be16 cksum;
-	u32 y;
+	u16 cksum;
+	u32 x, y;
 	int ret;
 
 	sp = rxrpc_skb(skb);
 
 	_enter("{%d{%x}},{#%u}",
-	       call->debug_id, key_serial(call->conn->key),
-	       ntohl(sp->hdr.seq));
+	       call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
 
 	if (!call->conn->cipher)
 		return 0;
@@ -521,21 +518,20 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 	desc.flags = 0;
 
 	/* validate the security checksum */
-	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
-	x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
-	tmpbuf.x[0] = call->call_id;
-	tmpbuf.x[1] = x;
+	x = call->channel << (32 - RXRPC_CIDSHIFT);
+	x |= sp->hdr.seq & 0x3fffffff;
+	tmpbuf.x[0] = htonl(call->call_id);
+	tmpbuf.x[1] = htonl(x);
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
 	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
 
 	y = ntohl(tmpbuf.x[1]);
-	y = (y >> 16) & 0xffff;
-	if (y == 0)
-		y = 1; /* zero checksums are not permitted */
+	cksum = (y >> 16) & 0xffff;
+	if (cksum == 0)
+		cksum = 1; /* zero checksums are not permitted */
 
-	cksum = htons(y);
 	if (sp->hdr.cksum != cksum) {
 		*_abort_code = RXKADSEALEDINCON;
 		_leave(" = -EPROTO [csum failed]");
@@ -567,10 +563,11 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 {
 	struct rxkad_challenge challenge;
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[2];
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
@@ -592,26 +589,27 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= conn->epoch;
-	hdr.cid		= conn->cid;
-	hdr.callNumber	= 0;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_CHALLENGE;
-	hdr.flags	= conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= conn->service_id;
-
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	whdr.epoch	= htonl(conn->epoch);
+	whdr.cid	= htonl(conn->cid);
+	whdr.callNumber	= 0;
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_CHALLENGE;
+	whdr.flags	= conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(conn->service_id);
+
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= &challenge;
 	iov[1].iov_len	= sizeof(challenge);
 
 	len = iov[0].iov_len + iov[1].iov_len;
 
-	hdr.serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx CHALLENGE %%%u", ntohl(hdr.serial));
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx CHALLENGE %%%u", serial);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
 	if (ret < 0) {
@@ -627,13 +625,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
  * send a Kerberos security response
  */
 static int rxkad_send_response(struct rxrpc_connection *conn,
-			       struct rxrpc_header *hdr,
+			       struct rxrpc_host_header *hdr,
 			       struct rxkad_response *resp,
 			       const struct rxkad_key *s2)
 {
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[3];
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("");
@@ -644,24 +644,26 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->epoch	= conn->epoch;
-	hdr->seq	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_RESPONSE;
-	hdr->flags	= conn->out_clientflag;
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
+	memset(&whdr, 0, sizeof(whdr));
+	whdr.epoch	= htonl(hdr->epoch);
+	whdr.cid	= htonl(hdr->cid);
+	whdr.type	= RXRPC_PACKET_TYPE_RESPONSE;
+	whdr.flags	= conn->out_clientflag;
+	whdr.securityIndex = hdr->securityIndex;
+	whdr.serviceId	= htons(hdr->serviceId);
 
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= resp;
 	iov[1].iov_len	= sizeof(*resp);
-	iov[2].iov_base	= (void *) s2->ticket;
+	iov[2].iov_base	= (void *)s2->ticket;
 	iov[2].iov_len	= s2->ticket_len;
 
 	len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
 
-	hdr->serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx RESPONSE %%%u", ntohl(hdr->serial));
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx RESPONSE %%%u", serial);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
 	if (ret < 0) {
@@ -770,7 +772,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	min_level = ntohl(challenge.min_level);
 
 	_proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
-	       ntohl(sp->hdr.serial), version, nonce, min_level);
+	       sp->hdr.serial, version, nonce, min_level);
 
 	abort_code = RXKADINCONSISTENCY;
 	if (version != RXKAD_VERSION)
@@ -786,17 +788,17 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	memset(&resp, 0, sizeof(resp));
 
 	resp.version = RXKAD_VERSION;
-	resp.encrypted.epoch = conn->epoch;
-	resp.encrypted.cid = conn->cid;
+	resp.encrypted.epoch = htonl(conn->epoch);
+	resp.encrypted.cid = htonl(conn->cid);
 	resp.encrypted.securityIndex = htonl(conn->security_ix);
 	resp.encrypted.call_id[0] =
-		(conn->channels[0] ? conn->channels[0]->call_id : 0);
+		htonl(conn->channels[0] ? conn->channels[0]->call_id : 0);
 	resp.encrypted.call_id[1] =
-		(conn->channels[1] ? conn->channels[1]->call_id : 0);
+		htonl(conn->channels[1] ? conn->channels[1]->call_id : 0);
 	resp.encrypted.call_id[2] =
-		(conn->channels[2] ? conn->channels[2]->call_id : 0);
+		htonl(conn->channels[2] ? conn->channels[2]->call_id : 0);
 	resp.encrypted.call_id[3] =
-		(conn->channels[3] ? conn->channels[3]->call_id : 0);
+		htonl(conn->channels[3] ? conn->channels[3]->call_id : 0);
 	resp.encrypted.inc_nonce = htonl(nonce + 1);
 	resp.encrypted.level = htonl(conn->security_level);
 	resp.kvno = htonl(token->kad->kvno);
@@ -1022,7 +1024,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	kvno = ntohl(response.kvno);
 	sp = rxrpc_skb(skb);
 	_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
-	       ntohl(sp->hdr.serial), version, kvno, ticket_len);
+	       sp->hdr.serial, version, kvno, ticket_len);
 
 	abort_code = RXKADINCONSISTENCY;
 	if (version != RXKAD_VERSION)
@@ -1058,9 +1060,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	rxkad_decrypt_response(conn, &response, &session_key);
 
 	abort_code = RXKADSEALEDINCON;
-	if (response.encrypted.epoch != conn->epoch)
+	if (ntohl(response.encrypted.epoch) != conn->epoch)
 		goto protocol_error_free;
-	if (response.encrypted.cid != conn->cid)
+	if (ntohl(response.encrypted.cid) != conn->cid)
 		goto protocol_error_free;
 	if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
 		goto protocol_error_free;
@@ -1077,7 +1079,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 		goto protocol_error_free;
 
 	abort_code = RXKADOUTOFSEQUENCE;
-	if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1))
+	if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
 		goto protocol_error_free;
 
 	abort_code = RXKADLEVELFAIL;
-- 
cgit v1.2.3


From 351c1e648623b742fe1687636117306adc8b561c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:56:06 +0000
Subject: rxrpc: Be more selective about the types of received packets we
 accept

Currently, received RxRPC packets outside the range 1-13 are rejected.
There are, however, holes in the range that should also be rejected - plus
at least one type we don't yet support - so reject these also.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/rxrpc/packet.h | 11 +++++++++++
 net/rxrpc/ar-input.c   |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
index de1e67988ada..9ebab3a8cf0a 100644
--- a/include/rxrpc/packet.h
+++ b/include/rxrpc/packet.h
@@ -70,6 +70,17 @@ struct rxrpc_wire_header {
 
 extern const char *rxrpc_pkts[];
 
+#define RXRPC_SUPPORTED_PACKET_TYPES (			\
+		(1 << RXRPC_PACKET_TYPE_DATA) |		\
+		(1 << RXRPC_PACKET_TYPE_ACK) |		\
+		(1 << RXRPC_PACKET_TYPE_BUSY) |		\
+		(1 << RXRPC_PACKET_TYPE_ABORT) |	\
+		(1 << RXRPC_PACKET_TYPE_ACKALL) |	\
+		(1 << RXRPC_PACKET_TYPE_CHALLENGE) |	\
+		(1 << RXRPC_PACKET_TYPE_RESPONSE) |	\
+		/*(1 << RXRPC_PACKET_TYPE_DEBUG) | */	\
+		(1 << RXRPC_PACKET_TYPE_VERSION))
+
 /*****************************************************************************/
 /*
  * jumbo packet secondary header
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index e6396a8c969f..63ed75c40e29 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -728,7 +728,8 @@ void rxrpc_data_ready(struct sock *sk)
 	     sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
 	     sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
 
-	if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
+	if (sp->hdr.type >= RXRPC_N_PACKET_TYPES ||
+	    !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) {
 		_proto("Rx Bad Packet Type %u", sp->hdr.type);
 		goto bad_message;
 	}
-- 
cgit v1.2.3


From b5d3755a22e0cc4c369c0985aef0c52c2477c1e7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:16 +0100
Subject: uapi: define DIV_ROUND_UP for userland

DIV_ROUND_UP is defined in linux/kernel.h only for the kernel.
When ethtool.h is included by a userland app, we get the following error:

include/linux/ethtool.h:1218:8: error: variably modified 'queue_mask' at file scope
  __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
        ^

Let's add a common definition in uapi and use it everywhere.

Fixes: ac2c7ad0e5d6 ("net/ethtool: introduce a new ioctl for per queue setting")
CC: Kan Liang <kan.liang@intel.com>
Suggested-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h       | 2 +-
 include/uapi/linux/ethtool.h | 3 ++-
 include/uapi/linux/kernel.h  | 1 +
 include/uapi/linux/mroute6.h | 9 ++-------
 4 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f31638c6e873..ac1923957236 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -64,7 +64,7 @@
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 #define DIV_ROUND_UP_ULL(ll,d) \
 	({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; })
 
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 37fd6dc33de4..9c22249ebf35 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _UAPI_LINUX_ETHTOOL_H
 #define _UAPI_LINUX_ETHTOOL_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
@@ -1215,7 +1216,7 @@ enum ethtool_sfeatures_retval_bits {
 struct ethtool_per_queue_op {
 	__u32	cmd;
 	__u32	sub_command;
-	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	__u32	queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
 	char	data[];
 };
 
diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h
index 321e399457f5..466073f0ce46 100644
--- a/include/uapi/linux/kernel.h
+++ b/include/uapi/linux/kernel.h
@@ -9,5 +9,6 @@
 #define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
+#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
 #endif /* _UAPI_LINUX_KERNEL_H */
diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index ce91215cf7e6..5062fb5751e1 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI__LINUX_MROUTE6_H
 #define _UAPI__LINUX_MROUTE6_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/sockios.h>
 
@@ -46,14 +47,8 @@ typedef unsigned short mifi_t;
 typedef	__u32		if_mask;
 #define NIFBITS (sizeof(if_mask) * 8)        /* bits per mask */
 
-#if !defined(__KERNEL__)
-#if !defined(DIV_ROUND_UP)
-#define	DIV_ROUND_UP(x,y)	(((x) + ((y) - 1)) / (y))
-#endif
-#endif
-
 typedef struct if_set {
-	if_mask ifs_bits[DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
+	if_mask ifs_bits[__KERNEL_DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
 } if_set;
 
 #define IF_SET(n, p)    ((p)->ifs_bits[(n)/NIFBITS] |= (1 << ((n) % NIFBITS)))
-- 
cgit v1.2.3


From 14e2037902d65213842b4e40305ff54a64abbcb6 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:19 +0100
Subject: ethtool.h: define INT_MAX for userland

INT_MAX needs limits.h in userland.
When ethtool.h is included by a userland app, we get the following error:

.../usr/include/linux/ethtool.h: In function 'ethtool_validate_speed':
.../usr/include/linux/ethtool.h:1471:18: error: 'INT_MAX' undeclared (first use in this function)
  return speed <= INT_MAX || speed == SPEED_UNKNOWN
                  ^

Fixes: e02564ee334a ("ethtool: make validate_speed accept all speeds between 0 and INT_MAX")
CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9c22249ebf35..2835b07416b7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -17,6 +17,10 @@
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
+#ifndef __KERNEL__
+#include <limits.h> /* for INT_MAX */
+#endif
+
 /* All structures exposed to userland should be defined such that they
  * have the same layout for 32-bit and 64-bit userland.
  */
-- 
cgit v1.2.3


From f719e3754ee2f7275437e61a6afd520181fdd43b Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 5 Mar 2016 15:03:22 +0200
Subject: ipvs: drop first packet to redirect conntrack

Jiri Bohac reported a problem where an attempt to reschedule an
existing connection to another real server needs a proper redirect
for the conntrack used by the IPVS connection. For example, when an
IPVS connection is created to a NAT-ed real server, we alter the
reply direction of the conntrack. If we later decide to select a
different real server, we cannot alter the conntrack again. And if
we expire the old connection, the new connection is left without
a conntrack.

So, the only way to redirect both the IPVS connection and
Netfilter's conntrack is to drop the SYN packet that hits the
existing connection, wait for the next jiffy to expire the old
connection and its conntrack, and rely on the client's
retransmission to create the new connection as usual.

Jiri Bohac provided a fix that drops all SYNs on rescheduling;
I extended his patch to do such drops only for connections
that use conntrack. Here is the original report from Jiri Bohac:

Since commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server
is dead"), new connections to dead servers are redistributed
immediately to new servers.  The old connection is expired using
ip_vs_conn_expire_now() which sets the connection timer to expire
immediately.

However, before the timer callback, ip_vs_conn_expire(), is run
to clean the connection's conntrack entry, the new redistributed
connection may already be established and its conntrack removed
instead.

Fix this by dropping the first packet of the new connection
instead, like we do when the destination server is not available.
The timer will have deleted the old conntrack entry long before
the first packet of the new connection is retransmitted.

Fixes: dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead")
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 17 +++++++++++++++++
 net/netfilter/ipvs/ip_vs_core.c | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 0816c872b689..a6cc576fd467 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 }
 #endif /* CONFIG_IP_VS_NFCT */
 
+/* Really using conntrack? */
+static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
+					     struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_NFCT
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		return false;
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && !nf_ct_is_untracked(ct))
+		return true;
+#endif
+	return false;
+}
+
 static inline int
 ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
 {
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f57b4dcdb233..4da560005b0e 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1757,15 +1757,34 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
 	cp = pp->conn_in_get(ipvs, af, skb, &iph);
 
 	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
-	if (conn_reuse_mode && !iph.fragoffs &&
-	    is_new_conn(skb, &iph) && cp &&
-	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
-	      unlikely(!atomic_read(&cp->dest->weight))) ||
-	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
-		if (!atomic_read(&cp->n_control))
-			ip_vs_conn_expire_now(cp);
-		__ip_vs_conn_put(cp);
-		cp = NULL;
+	if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+		bool uses_ct = false, resched = false;
+
+		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+		    unlikely(!atomic_read(&cp->dest->weight))) {
+			resched = true;
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+		} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+			if (!atomic_read(&cp->n_control)) {
+				resched = true;
+			} else {
+				/* Do not reschedule controlling connection
+				 * that uses conntrack while it is still
+				 * referenced by controlled connection(s).
+				 */
+				resched = !uses_ct;
+			}
+		}
+
+		if (resched) {
+			if (!atomic_read(&cp->n_control))
+				ip_vs_conn_expire_now(cp);
+			__ip_vs_conn_put(cp);
+			if (uses_ct)
+				return NF_DROP;
+			cp = NULL;
+		}
 	}
 
 	if (unlikely(!cp)) {
-- 
cgit v1.2.3


From 4d7928959832ea41f7f91456b76da19cad01bd09 Mon Sep 17 00:00:00 2001
From: Hante Meuleman <meuleman@broadcom.com>
Date: Wed, 17 Feb 2016 11:27:07 +0100
Subject: brcmfmac: switch to new platform data

Platform data is only available for SDIO. With this patch a new
platform data structure is used, which allows platform data to be
provided for any device and configured per device. This patch only
switches to the new structure and adds support for SDIO devices.

Reviewed-by: Arend Van Spriel <arend@broadcom.com>
Reviewed-by: Franky (Zhenhui) Lin <frankyl@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieterpg@broadcom.com>
Signed-off-by: Hante Meuleman <meuleman@broadcom.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c  |  15 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |   4 +-
 .../wireless/broadcom/brcm80211/brcmfmac/common.c  |  43 ++++-
 .../wireless/broadcom/brcm80211/brcmfmac/common.h  |  35 +---
 .../net/wireless/broadcom/brcm80211/brcmfmac/of.c  |   3 +-
 .../net/wireless/broadcom/brcm80211/brcmfmac/of.h  |   5 +-
 .../wireless/broadcom/brcm80211/brcmfmac/sdio.c    |  77 +++++----
 .../wireless/broadcom/brcm80211/brcmfmac/sdio.h    |   2 +-
 include/linux/platform_data/brcmfmac-sdio.h        | 135 ---------------
 include/linux/platform_data/brcmfmac.h             | 185 +++++++++++++++++++++
 10 files changed, 282 insertions(+), 222 deletions(-)
 delete mode 100644 include/linux/platform_data/brcmfmac-sdio.h
 create mode 100644 include/linux/platform_data/brcmfmac.h

(limited to 'include')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 25cd71229c95..bb4aece9ad2c 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -103,7 +103,7 @@ static void brcmf_sdiod_dummy_irqhandler(struct sdio_func *func)
 
 int brcmf_sdiod_intr_register(struct brcmf_sdio_dev *sdiodev)
 {
-	struct brcmfmac_sdio_platform_data *pdata;
+	struct brcmfmac_sdio_pd *pdata;
 	int ret = 0;
 	u8 data;
 	u32 addr, gpiocontrol;
@@ -173,7 +173,7 @@ int brcmf_sdiod_intr_register(struct brcmf_sdio_dev *sdiodev)
 
 int brcmf_sdiod_intr_unregister(struct brcmf_sdio_dev *sdiodev)
 {
-	struct brcmfmac_sdio_platform_data *pdata;
+	struct brcmfmac_sdio_pd *pdata;
 
 	brcmf_dbg(SDIO, "Entering\n");
 
@@ -1164,17 +1164,6 @@ static int brcmf_ops_sdio_probe(struct sdio_func *func,
 	dev_set_drvdata(&func->dev, bus_if);
 	dev_set_drvdata(&sdiodev->func[1]->dev, bus_if);
 	sdiodev->dev = &sdiodev->func[1]->dev;
-	sdiodev->pdata = brcmf_get_module_param(sdiodev->dev);
-
-#ifdef CONFIG_PM_SLEEP
-	/* wowl can be supported when KEEP_POWER is true and (WAKE_SDIO_IRQ
-	 * is true or when platform data OOB irq is true).
-	 */
-	if ((sdio_get_host_pm_caps(sdiodev->func[1]) & MMC_PM_KEEP_POWER) &&
-	    ((sdio_get_host_pm_caps(sdiodev->func[1]) & MMC_PM_WAKE_SDIO_IRQ) ||
-	     (sdiodev->pdata && sdiodev->pdata->oob_irq_supported)))
-		bus_if->wowl_supported = true;
-#endif
 
 	brcmf_sdiod_change_state(sdiodev, BRCMF_SDIOD_DOWN);
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 5609a79df1c1..5e3acaca7231 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -6459,8 +6459,8 @@ int brcmf_cfg80211_wait_vif_event(struct brcmf_cfg80211_info *cfg,
 static s32 brcmf_translate_country_code(struct brcmf_pub *drvr, char alpha2[2],
 					struct brcmf_fil_country_le *ccreq)
 {
-	struct cc_translate *country_codes;
-	struct cc_entry *cc;
+	struct brcmfmac_pd_cc *country_codes;
+	struct brcmfmac_pd_cc_entry *cc;
 	s32 found_index;
 	int i;
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
index 020901c2e0ca..4bd3225cdea6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
@@ -80,7 +80,7 @@ module_param_named(ignore_probe_fail, brcmf_ignore_probe_fail, int, 0);
 MODULE_PARM_DESC(ignore_probe_fail, "always succeed probe for debugging");
 #endif
 
-static struct brcmfmac_sdio_platform_data *brcmfmac_pdata;
+static struct brcmfmac_platform_data *brcmfmac_pdata;
 struct brcmf_mp_global_t brcmf_mp_global;
 
 int brcmf_c_preinit_dcmds(struct brcmf_if *ifp)
@@ -229,15 +229,46 @@ void __brcmf_dbg(u32 level, const char *func, const char *fmt, ...)
 
 static void brcmf_mp_attach(void)
 {
+	/* If module param firmware path is set then this will always be used,
+	 * if not set then if available use the platform data version. To make
+	 * sure it gets initialized at all, always copy the module param version
+	 */
 	strlcpy(brcmf_mp_global.firmware_path, brcmf_firmware_path,
 		BRCMF_FW_ALTPATH_LEN);
+	if ((brcmfmac_pdata) && (brcmfmac_pdata->fw_alternative_path) &&
+	    (brcmf_mp_global.firmware_path[0] == '\0')) {
+		strlcpy(brcmf_mp_global.firmware_path,
+			brcmfmac_pdata->fw_alternative_path,
+			BRCMF_FW_ALTPATH_LEN);
+	}
 }
 
-struct brcmfmac_sdio_platform_data *brcmf_get_module_param(struct device *dev)
+struct brcmfmac_sdio_pd *brcmf_get_module_param(struct device *dev,
+						enum brcmf_bus_type bus_type,
+						u32 chip, u32 chiprev)
 {
-	if (!brcmfmac_pdata)
-		brcmf_of_probe(dev, &brcmfmac_pdata);
-	return brcmfmac_pdata;
+	struct brcmfmac_sdio_pd *pdata;
+	struct brcmfmac_pd_device *device_pd;
+	int i;
+
+	if (brcmfmac_pdata) {
+		for (i = 0; i < brcmfmac_pdata->device_count; i++) {
+			device_pd = &brcmfmac_pdata->devices[i];
+			if ((device_pd->bus_type == bus_type) &&
+			    (device_pd->id == chip) &&
+			    ((device_pd->rev == chiprev) ||
+			     (device_pd->rev == -1))) {
+				brcmf_dbg(INFO, "Platform data for device found\n");
+				if (device_pd->bus_type == BRCMF_BUSTYPE_SDIO)
+					return &device_pd->bus.sdio;
+				break;
+			}
+		}
+	}
+	pdata = NULL;
+	brcmf_of_probe(dev, &pdata);
+
+	return pdata;
 }
 
 int brcmf_mp_device_attach(struct brcmf_pub *drvr)
@@ -287,7 +318,7 @@ static int brcmf_common_pd_remove(struct platform_device *pdev)
 static struct platform_driver brcmf_pd = {
 	.remove		= brcmf_common_pd_remove,
 	.driver		= {
-		.name	= BRCMFMAC_SDIO_PDATA_NAME,
+		.name	= BRCMFMAC_PDATA_NAME,
 	}
 };
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
index 54a26ede808d..a64e40e8bfda 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
@@ -16,7 +16,7 @@
 #define BRCMFMAC_COMMON_H
 
 #include <linux/platform_device.h>
-#include <linux/platform_data/brcmfmac-sdio.h>
+#include <linux/platform_data/brcmfmac.h>
 #include "fwil_types.h"
 
 extern const u8 ALLFFMAC[ETH_ALEN];
@@ -42,33 +42,6 @@ struct brcmf_mp_global_t {
 
 extern struct brcmf_mp_global_t brcmf_mp_global;
 
-/**
- * struct cc_entry - Struct for translating user space country code (iso3166) to
- *		     firmware country code and revision.
- *
- * @iso3166: iso3166 alpha 2 country code string.
- * @cc: firmware country code string.
- * @rev: firmware country code revision.
- */
-struct cc_entry {
-	char	iso3166[BRCMF_COUNTRY_BUF_SZ];
-	char	cc[BRCMF_COUNTRY_BUF_SZ];
-	s32	rev;
-};
-
-/**
- * struct cc_translate - Struct for translating country codes as set by user
- *			 space to a country code and rev which can be used by
- *			 firmware.
- *
- * @table_size: number of entries in table (> 0)
- * @table: dynamic array of 1 or more elements with translation information.
- */
-struct cc_translate {
-	int	table_size;
-	struct cc_entry table[0];
-};
-
 /**
  * struct brcmf_mp_device - Device module paramaters.
  *
@@ -88,10 +61,12 @@ struct brcmf_mp_device {
 	int	fcmode;
 	bool	roamoff;
 	bool	ignore_probe_fail;
-	struct cc_translate *country_codes;
+	struct brcmfmac_pd_cc *country_codes;
 };
 
-struct brcmfmac_sdio_platform_data *brcmf_get_module_param(struct device *dev);
+struct brcmfmac_sdio_pd *brcmf_get_module_param(struct device *dev,
+						enum brcmf_bus_type bus_type,
+						u32 chip, u32 chiprev);
 int brcmf_mp_device_attach(struct brcmf_pub *drvr);
 void brcmf_mp_device_detach(struct brcmf_pub *drvr);
 #ifdef DEBUG
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.c
index 8201d937b826..ece0b65dd039 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.c
@@ -23,8 +23,7 @@
 #include "common.h"
 #include "of.h"
 
-void
-brcmf_of_probe(struct device *dev, struct brcmfmac_sdio_platform_data **sdio)
+void brcmf_of_probe(struct device *dev, struct brcmfmac_sdio_pd **sdio)
 {
 	struct device_node *np = dev->of_node;
 	int irq;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.h
index 84b474484cea..1ba951f9b542 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/of.h
@@ -15,10 +15,9 @@
  */
 #ifdef CONFIG_OF
 void
-brcmf_of_probe(struct device *dev, struct brcmfmac_sdio_platform_data **sdio);
+brcmf_of_probe(struct device *dev, struct brcmfmac_sdio_pd **sdio);
 #else
-static void brcmf_of_probe(struct device *dev,
-			   struct brcmfmac_sdio_platform_data **sdio)
+static void brcmf_of_probe(struct device *dev, struct brcmfmac_sdio_pd **sdio)
 {
 }
 #endif /* CONFIG_OF */
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index c790fa89db05..6e367041f691 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -33,8 +33,6 @@
 #include <linux/bcma/bcma.h>
 #include <linux/debugfs.h>
 #include <linux/vmalloc.h>
-#include <linux/platform_data/brcmfmac-sdio.h>
-#include <linux/moduleparam.h>
 #include <asm/unaligned.h>
 #include <defs.h>
 #include <brcmu_wifi.h>
@@ -44,6 +42,8 @@
 #include "sdio.h"
 #include "chip.h"
 #include "firmware.h"
+#include "core.h"
+#include "common.h"
 
 #define DCMD_RESP_TIMEOUT	msecs_to_jiffies(2500)
 #define CTL_DONE_TIMEOUT	msecs_to_jiffies(2500)
@@ -3775,26 +3775,28 @@ static const struct brcmf_buscore_ops brcmf_sdio_buscore_ops = {
 static bool
 brcmf_sdio_probe_attach(struct brcmf_sdio *bus)
 {
+	struct brcmf_sdio_dev *sdiodev;
 	u8 clkctl = 0;
 	int err = 0;
 	int reg_addr;
 	u32 reg_val;
 	u32 drivestrength;
 
-	sdio_claim_host(bus->sdiodev->func[1]);
+	sdiodev = bus->sdiodev;
+	sdio_claim_host(sdiodev->func[1]);
 
 	pr_debug("F1 signature read @0x18000000=0x%4x\n",
-		 brcmf_sdiod_regrl(bus->sdiodev, SI_ENUM_BASE, NULL));
+		 brcmf_sdiod_regrl(sdiodev, SI_ENUM_BASE, NULL));
 
 	/*
 	 * Force PLL off until brcmf_chip_attach()
 	 * programs PLL control regs
 	 */
 
-	brcmf_sdiod_regwb(bus->sdiodev, SBSDIO_FUNC1_CHIPCLKCSR,
+	brcmf_sdiod_regwb(sdiodev, SBSDIO_FUNC1_CHIPCLKCSR,
 			  BRCMF_INIT_CLKCTL1, &err);
 	if (!err)
-		clkctl = brcmf_sdiod_regrb(bus->sdiodev,
+		clkctl = brcmf_sdiod_regrb(sdiodev,
 					   SBSDIO_FUNC1_CHIPCLKCSR, &err);
 
 	if (err || ((clkctl & ~SBSDIO_AVBITS) != BRCMF_INIT_CLKCTL1)) {
@@ -3803,50 +3805,77 @@ brcmf_sdio_probe_attach(struct brcmf_sdio *bus)
 		goto fail;
 	}
 
-	bus->ci = brcmf_chip_attach(bus->sdiodev, &brcmf_sdio_buscore_ops);
+	bus->ci = brcmf_chip_attach(sdiodev, &brcmf_sdio_buscore_ops);
 	if (IS_ERR(bus->ci)) {
 		brcmf_err("brcmf_chip_attach failed!\n");
 		bus->ci = NULL;
 		goto fail;
 	}
+	sdiodev->pdata = brcmf_get_module_param(sdiodev->dev,
+						   BRCMF_BUSTYPE_SDIO,
+						   bus->ci->chip,
+						   bus->ci->chiprev);
+	/* platform specific configuration:
+	 *   alignments must be at least 4 bytes for ADMA
+	 */
+	bus->head_align = ALIGNMENT;
+	bus->sgentry_align = ALIGNMENT;
+	if (sdiodev->pdata) {
+		if (sdiodev->pdata->sd_head_align > ALIGNMENT)
+			bus->head_align = sdiodev->pdata->sd_head_align;
+		if (sdiodev->pdata->sd_sgentry_align > ALIGNMENT)
+			bus->sgentry_align = sdiodev->pdata->sd_sgentry_align;
+	}
+	/* allocate scatter-gather table. sg support
+	 * will be disabled upon allocation failure.
+	 */
+	brcmf_sdiod_sgtable_alloc(sdiodev);
+
+#ifdef CONFIG_PM_SLEEP
+	/* wowl can be supported when KEEP_POWER is true and (WAKE_SDIO_IRQ
+	 * is true or when platform data OOB irq is true).
+	 */
+	if ((sdio_get_host_pm_caps(sdiodev->func[1]) & MMC_PM_KEEP_POWER) &&
+	    ((sdio_get_host_pm_caps(sdiodev->func[1]) & MMC_PM_WAKE_SDIO_IRQ) ||
+	     (sdiodev->pdata && sdiodev->pdata->oob_irq_supported)))
+		sdiodev->bus_if->wowl_supported = true;
+#endif
 
 	if (brcmf_sdio_kso_init(bus)) {
 		brcmf_err("error enabling KSO\n");
 		goto fail;
 	}
 
-	if ((bus->sdiodev->pdata) && (bus->sdiodev->pdata->drive_strength))
-		drivestrength = bus->sdiodev->pdata->drive_strength;
+	if ((sdiodev->pdata) && (sdiodev->pdata->drive_strength))
+		drivestrength = sdiodev->pdata->drive_strength;
 	else
 		drivestrength = DEFAULT_SDIO_DRIVE_STRENGTH;
-	brcmf_sdio_drivestrengthinit(bus->sdiodev, bus->ci, drivestrength);
+	brcmf_sdio_drivestrengthinit(sdiodev, bus->ci, drivestrength);
 
 	/* Set card control so an SDIO card reset does a WLAN backplane reset */
-	reg_val = brcmf_sdiod_regrb(bus->sdiodev,
-				    SDIO_CCCR_BRCM_CARDCTRL, &err);
+	reg_val = brcmf_sdiod_regrb(sdiodev, SDIO_CCCR_BRCM_CARDCTRL, &err);
 	if (err)
 		goto fail;
 
 	reg_val |= SDIO_CCCR_BRCM_CARDCTRL_WLANRESET;
 
-	brcmf_sdiod_regwb(bus->sdiodev,
-			  SDIO_CCCR_BRCM_CARDCTRL, reg_val, &err);
+	brcmf_sdiod_regwb(sdiodev, SDIO_CCCR_BRCM_CARDCTRL, reg_val, &err);
 	if (err)
 		goto fail;
 
 	/* set PMUControl so a backplane reset does PMU state reload */
 	reg_addr = CORE_CC_REG(brcmf_chip_get_pmu(bus->ci)->base, pmucontrol);
-	reg_val = brcmf_sdiod_regrl(bus->sdiodev, reg_addr, &err);
+	reg_val = brcmf_sdiod_regrl(sdiodev, reg_addr, &err);
 	if (err)
 		goto fail;
 
 	reg_val |= (BCMA_CC_PMU_CTL_RES_RELOAD << BCMA_CC_PMU_CTL_RES_SHIFT);
 
-	brcmf_sdiod_regwl(bus->sdiodev, reg_addr, reg_val, &err);
+	brcmf_sdiod_regwl(sdiodev, reg_addr, reg_val, &err);
 	if (err)
 		goto fail;
 
-	sdio_release_host(bus->sdiodev->func[1]);
+	sdio_release_host(sdiodev->func[1]);
 
 	brcmu_pktq_init(&bus->txq, (PRIOMASK + 1), TXQLEN);
 
@@ -3867,7 +3896,7 @@ brcmf_sdio_probe_attach(struct brcmf_sdio *bus)
 	return true;
 
 fail:
-	sdio_release_host(bus->sdiodev->func[1]);
+	sdio_release_host(sdiodev->func[1]);
 	return false;
 }
 
@@ -4045,18 +4074,6 @@ struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev)
 	bus->txminmax = BRCMF_TXMINMAX;
 	bus->tx_seq = SDPCM_SEQ_WRAP - 1;
 
-	/* platform specific configuration:
-	 *   alignments must be at least 4 bytes for ADMA
-	 */
-	bus->head_align = ALIGNMENT;
-	bus->sgentry_align = ALIGNMENT;
-	if (sdiodev->pdata) {
-		if (sdiodev->pdata->sd_head_align > ALIGNMENT)
-			bus->head_align = sdiodev->pdata->sd_head_align;
-		if (sdiodev->pdata->sd_sgentry_align > ALIGNMENT)
-			bus->sgentry_align = sdiodev->pdata->sd_sgentry_align;
-	}
-
 	/* single-threaded workqueue */
 	wq = alloc_ordered_workqueue("brcmf_wq/%s", WQ_MEM_RECLAIM,
 				     dev_name(&sdiodev->func[1]->dev));
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.h
index 23f223150cef..50df9cb21af2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.h
@@ -184,7 +184,7 @@ struct brcmf_sdio_dev {
 	struct brcmf_sdio *bus;
 	struct device *dev;
 	struct brcmf_bus *bus_if;
-	struct brcmfmac_sdio_platform_data *pdata;
+	struct brcmfmac_sdio_pd *pdata;
 	bool oob_irq_requested;
 	bool irq_en;			/* irq enable flags */
 	spinlock_t irq_en_lock;
diff --git a/include/linux/platform_data/brcmfmac-sdio.h b/include/linux/platform_data/brcmfmac-sdio.h
deleted file mode 100644
index e75dcbf2b230..000000000000
--- a/include/linux/platform_data/brcmfmac-sdio.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2013 Broadcom Corporation
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#ifndef _LINUX_BRCMFMAC_PLATFORM_H
-#define _LINUX_BRCMFMAC_PLATFORM_H
-
-/*
- * Platform specific driver functions and data. Through the platform specific
- * device data functions can be provided to help the brcmfmac driver to
- * operate with the device in combination with the used platform.
- *
- * Use the platform data in the following (similar) way:
- *
- *
-#include <brcmfmac_platform.h>
-
-
-static void brcmfmac_power_on(void)
-{
-}
-
-static void brcmfmac_power_off(void)
-{
-}
-
-static void brcmfmac_reset(void)
-{
-}
-
-static struct brcmfmac_sdio_platform_data brcmfmac_sdio_pdata = {
-	.power_on		= brcmfmac_power_on,
-	.power_off		= brcmfmac_power_off,
-	.reset			= brcmfmac_reset
-};
-
-static struct platform_device brcmfmac_device = {
-	.name			= BRCMFMAC_SDIO_PDATA_NAME,
-	.id			= PLATFORM_DEVID_NONE,
-	.dev.platform_data	= &brcmfmac_sdio_pdata
-};
-
-void __init brcmfmac_init_pdata(void)
-{
-	brcmfmac_sdio_pdata.oob_irq_supported = true;
-	brcmfmac_sdio_pdata.oob_irq_nr = gpio_to_irq(GPIO_BRCMF_SDIO_OOB);
-	brcmfmac_sdio_pdata.oob_irq_flags = IORESOURCE_IRQ |
-					    IORESOURCE_IRQ_HIGHLEVEL;
-	platform_device_register(&brcmfmac_device);
-}
- *
- *
- * Note: the brcmfmac can be loaded as module or be statically built-in into
- * the kernel. If built-in then do note that it uses module_init (and
- * module_exit) routines which equal device_initcall. So if you intend to
- * create a module with the platform specific data for the brcmfmac and have
- * it built-in to the kernel then use a higher initcall then device_initcall
- * (see init.h). If this is not done then brcmfmac will load without problems
- * but will not pickup the platform data.
- *
- * When the driver does not "detect" platform driver data then it will continue
- * without reporting anything and just assume there is no data needed. Which is
- * probably true for most platforms.
- *
- * Explanation of the platform_data fields:
- *
- * drive_strength: is the preferred drive_strength to be used for the SDIO
- * pins. If 0 then a default value will be used. This is the target drive
- * strength, the exact drive strength which will be used depends on the
- * capabilities of the device.
- *
- * oob_irq_supported: does the board have support for OOB interrupts. SDIO
- * in-band interrupts are relatively slow and for having less overhead on
- * interrupt processing an out of band interrupt can be used. If the HW
- * supports this then enable this by setting this field to true and configure
- * the oob related fields.
- *
- * oob_irq_nr, oob_irq_flags: the OOB interrupt information. The values are
- * used for registering the irq using request_irq function.
- *
- * broken_sg_support: flag for broken sg list support of SDIO host controller.
- * Set this to true if the SDIO host controller has higher align requirement
- * than 32 bytes for each scatterlist item.
- *
- * sd_head_align: alignment requirement for start of data buffer
- *
- * sd_sgentry_align: length alignment requirement for each sg entry
- *
- * power_on: This function is called by the brcmfmac when the module gets
- * loaded. This can be particularly useful for low power devices. The platform
- * spcific routine may for example decide to power up the complete device.
- * If there is no use-case for this function then provide NULL.
- *
- * power_off: This function is called by the brcmfmac when the module gets
- * unloaded. At this point the device can be powered down or otherwise be reset.
- * So if an actual power_off is not supported but reset is then reset the device
- * when this function gets called. This can be particularly useful for low power
- * devices. If there is no use-case for this function (either power-down or
- * reset) then provide NULL.
- *
- * reset: This function can get called if the device communication broke down.
- * This functionality is particularly useful in case of SDIO type devices. It is
- * possible to reset a dongle via sdio data interface, but it requires that
- * this is fully functional. This function is chip/module specific and this
- * function should return only after the complete reset has completed.
- */
-
-#define BRCMFMAC_SDIO_PDATA_NAME	"brcmfmac_sdio"
-
-struct brcmfmac_sdio_platform_data {
-	unsigned int drive_strength;
-	bool oob_irq_supported;
-	unsigned int oob_irq_nr;
-	unsigned long oob_irq_flags;
-	bool broken_sg_support;
-	unsigned short sd_head_align;
-	unsigned short sd_sgentry_align;
-	void (*power_on)(void);
-	void (*power_off)(void);
-	void (*reset)(void);
-};
-
-#endif /* _LINUX_BRCMFMAC_PLATFORM_H */
diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h
new file mode 100644
index 000000000000..1d30bf278231
--- /dev/null
+++ b/include/linux/platform_data/brcmfmac.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2016 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _LINUX_BRCMFMAC_PLATFORM_H
+#define _LINUX_BRCMFMAC_PLATFORM_H
+
+
+#define BRCMFMAC_PDATA_NAME		"brcmfmac"
+
+#define BRCMFMAC_COUNTRY_BUF_SZ		4
+
+
+/*
+ * Platform specific driver functions and data. Through the platform specific
+ * device data functions and data can be provided to help the brcmfmac driver to
+ * operate with the device in combination with the used platform.
+ */
+
+
+/**
+ * Note: the brcmfmac can be loaded as module or be statically built-in into
+ * the kernel. If built-in then do note that it uses module_init (and
+ * module_exit) routines which equal device_initcall. So if you intend to
+ * create a module with the platform specific data for the brcmfmac and have
+ * it built-in to the kernel then use a higher initcall than device_initcall
+ * (see init.h). If this is not done then brcmfmac will load without problems
+ * but will not pickup the platform data.
+ *
+ * When the driver does not "detect" platform driver data then it will continue
+ * without reporting anything and just assume there is no data needed. Which is
+ * probably true for most platforms.
+ */
+
+/**
+ * enum brcmf_bus_type - Bus type identifier. Currently SDIO, USB and PCIE are
+ *			 supported.
+ */
+enum brcmf_bus_type {
+	BRCMF_BUSTYPE_SDIO,
+	BRCMF_BUSTYPE_USB,
+	BRCMF_BUSTYPE_PCIE
+};
+
+
+/**
+ * struct brcmfmac_sdio_pd - SDIO Device specific platform data.
+ *
+ * @txglomsz:		SDIO txglom size. Use 0 if default of driver is to be
+ *			used.
+ * @drive_strength:	is the preferred drive_strength to be used for the SDIO
+ *			pins. If 0 then a default value will be used. This is
+ *			the target drive strength, the exact drive strength
+ *			which will be used depends on the capabilities of the
+ *			device.
+ * @oob_irq_supported:	does the board have support for OOB interrupts. SDIO
+ *			in-band interrupts are relatively slow and for having
+ *			less overhead on interrupt processing an out of band
+ *			interrupt can be used. If the HW supports this then
+ *			enable this by setting this field to true and configure
+ *			the oob related fields.
+ * @oob_irq_nr,
+ * @oob_irq_flags:	the OOB interrupt information. The values are used for
+ *			registering the irq using request_irq function.
+ * @broken_sg_support:	flag for broken sg list support of SDIO host controller.
+ *			Set this to true if the SDIO host controller has higher
+ *			align requirement than 32 bytes for each scatterlist
+ *			item.
+ * @sd_head_align:	alignment requirement for start of data buffer.
+ * @sd_sgentry_align:	length alignment requirement for each sg entry.
+ * @reset:		This function can get called if the device communication
+ *			broke down. This functionality is particularly useful in
+ *			case of SDIO type devices. It is possible to reset a
+ *			dongle via sdio data interface, but it requires that
+ *			this is fully functional. This function is chip/module
+ *			specific and this function should return only after the
+ *			complete reset has completed.
+ */
+struct brcmfmac_sdio_pd {
+	int		txglomsz;
+	unsigned int	drive_strength;
+	bool		oob_irq_supported;
+	unsigned int	oob_irq_nr;
+	unsigned long	oob_irq_flags;
+	bool		broken_sg_support;
+	unsigned short	sd_head_align;
+	unsigned short	sd_sgentry_align;
+	void		(*reset)(void);
+};
+
+/**
+ * struct brcmfmac_pd_cc_entry - Struct for translating user space country code
+ *				 (iso3166) to firmware country code and
+ *				 revision.
+ *
+ * @iso3166:	iso3166 alpha 2 country code string.
+ * @cc:		firmware country code string.
+ * @rev:	firmware country code revision.
+ */
+struct brcmfmac_pd_cc_entry {
+	char	iso3166[BRCMFMAC_COUNTRY_BUF_SZ];
+	char	cc[BRCMFMAC_COUNTRY_BUF_SZ];
+	s32	rev;
+};
+
+/**
+ * struct brcmfmac_pd_cc - Struct for translating country codes as set by user
+ *			   space to a country code and rev which can be used by
+ *			   firmware.
+ *
+ * @table_size:	number of entries in table (> 0)
+ * @table:	array of 1 or more elements with translation information.
+ */
+struct brcmfmac_pd_cc {
+	int				table_size;
+	struct brcmfmac_pd_cc_entry	table[0];
+};
+
+/**
+ * struct brcmfmac_pd_device - Device specific platform data. (id/rev/bus_type)
+ *			       is the unique identifier of the device.
+ *
+ * @id:			ID of the device for which this data is. In case of SDIO
+ *			or PCIE this is the chipid as identified by chip.c. In
+ *			case of USB this is the chipid as identified by the
+ *			device query.
+ * @rev:		chip revision, see id.
+ * @bus_type:		The type of bus. Some chipid/rev exist for different bus
+ *			types. Each bus type has its own set of settings.
+ * @feature_disable:	Bitmask of features to disable (override), See feature.c
+ *			in brcmfmac for details.
+ * @country_codes:	If available, pointer to struct for translating country
+ *			codes.
+ * @bus:		Bus specific (union) device settings. Currently only
+ *			SDIO.
+ */
+struct brcmfmac_pd_device {
+	unsigned int		id;
+	unsigned int		rev;
+	enum brcmf_bus_type	bus_type;
+	unsigned int		feature_disable;
+	struct brcmfmac_pd_cc	*country_codes;
+	union {
+		struct brcmfmac_sdio_pd sdio;
+	} bus;
+};
+
+/**
+ * struct brcmfmac_platform_data - BRCMFMAC specific platform data.
+ *
+ * @power_on:	This function is called by the brcmfmac driver when the module
+ *		gets loaded. This can be particularly useful for low power
+ *		devices. The platform specific routine may for example decide to
+ *		power up the complete device. If there is no use-case for this
+ *		function then provide NULL.
+ * @power_off:	This function is called by the brcmfmac when the module gets
+ *		unloaded. At this point the devices can be powered down or
+ *		otherwise be reset. So if an actual power_off is not supported
+ *		but reset is supported by the devices then reset the devices
+ *		when this function gets called. This can be particularly useful
+ *		for low power devices. If there is no use-case for this
+ *		function then provide NULL.
+ */
+struct brcmfmac_platform_data {
+	void	(*power_on)(void);
+	void	(*power_off)(void);
+	char	*fw_alternative_path;
+	int	device_count;
+	struct brcmfmac_pd_device devices[0];
+};
+
+
+#endif /* _LINUX_BRCMFMAC_PLATFORM_H */
-- 
cgit v1.2.3


From 2e62f9b2a41e4ade1a0bb3c1bbda4defe4c67243 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Fri, 12 Feb 2016 10:15:43 +0100
Subject: bcma: drop unneeded fields from bcma_pflash struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most of info stored in this struct wasn't really used anywhere as we put
all that data in platform data & resource as well.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/driver_mips.c                  | 11 ++++-------
 include/linux/bcma/bcma_driver_chipcommon.h |  3 ---
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/bcma/driver_mips.c b/drivers/bcma/driver_mips.c
index a40a203314db..20c134c016dc 100644
--- a/drivers/bcma/driver_mips.c
+++ b/drivers/bcma/driver_mips.c
@@ -288,18 +288,15 @@ static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore)
 	case BCMA_CC_FLASHT_PARA:
 		bcma_debug(bus, "Found parallel flash\n");
 		pflash->present = true;
-		pflash->window = BCMA_SOC_FLASH2;
-		pflash->window_size = BCMA_SOC_FLASH2_SZ;
 
 		if ((bcma_read32(cc->core, BCMA_CC_FLASH_CFG) &
 		     BCMA_CC_FLASH_CFG_DS) == 0)
-			pflash->buswidth = 1;
+			bcma_pflash_data.width = 1;
 		else
-			pflash->buswidth = 2;
+			bcma_pflash_data.width = 2;
 
-		bcma_pflash_data.width = pflash->buswidth;
-		bcma_pflash_resource.start = pflash->window;
-		bcma_pflash_resource.end = pflash->window + pflash->window_size;
+		bcma_pflash_resource.start = BCMA_SOC_FLASH2;
+		bcma_pflash_resource.end = BCMA_SOC_FLASH2 + BCMA_SOC_FLASH2_SZ;
 
 		break;
 	default:
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 700d0c6f7480..16eaaad9dda5 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -579,9 +579,6 @@ struct bcma_chipcommon_pmu {
 #ifdef CONFIG_BCMA_DRIVER_MIPS
 struct bcma_pflash {
 	bool present;
-	u8 buswidth;
-	u32 window;
-	u32 window_size;
 };
 
 #ifdef CONFIG_BCMA_SFLASH
-- 
cgit v1.2.3


From d6a3b51ada68c2bd3e184f4729ce626a1721cf74 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Fri, 12 Feb 2016 10:15:44 +0100
Subject: bcma: move parallel flash support to separated file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This follows the way of handling other flashes and cleans code a bit. As
next task we will want to move flash code to ChipCommon driver as:
1) Flash controllers are accessible using ChipCommon registers
2) This code isn't MIPS specific
This change prepares bcma for that.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/Kconfig                        |  5 +++
 drivers/bcma/Makefile                       |  1 +
 drivers/bcma/bcma_private.h                 | 18 +++++++++--
 drivers/bcma/driver_chipcommon_pflash.c     | 49 +++++++++++++++++++++++++++++
 drivers/bcma/driver_mips.c                  | 35 +--------------------
 drivers/bcma/main.c                         |  2 +-
 include/linux/bcma/bcma_driver_chipcommon.h |  8 +++--
 7 files changed, 78 insertions(+), 40 deletions(-)
 create mode 100644 drivers/bcma/driver_chipcommon_pflash.c

(limited to 'include')

diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig
index 023d448ed3fa..efdc2ae8441a 100644
--- a/drivers/bcma/Kconfig
+++ b/drivers/bcma/Kconfig
@@ -70,6 +70,11 @@ config BCMA_DRIVER_MIPS
 
 	  If unsure, say N
 
+config BCMA_PFLASH
+	bool
+	depends on BCMA_DRIVER_MIPS
+	default y
+
 config BCMA_SFLASH
 	bool
 	depends on BCMA_DRIVER_MIPS
diff --git a/drivers/bcma/Makefile b/drivers/bcma/Makefile
index f32af9b76bcd..087948a1d20d 100644
--- a/drivers/bcma/Makefile
+++ b/drivers/bcma/Makefile
@@ -1,6 +1,7 @@
 bcma-y					+= main.o scan.o core.o sprom.o
 bcma-y					+= driver_chipcommon.o driver_chipcommon_pmu.o
 bcma-y					+= driver_chipcommon_b.o
+bcma-$(CONFIG_BCMA_PFLASH)		+= driver_chipcommon_pflash.o
 bcma-$(CONFIG_BCMA_SFLASH)		+= driver_chipcommon_sflash.o
 bcma-$(CONFIG_BCMA_NFLASH)		+= driver_chipcommon_nflash.o
 bcma-$(CONFIG_BCMA_DRIVER_PCI)		+= driver_pci.o
diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h
index 7e4ddfb076d3..eda09090cb52 100644
--- a/drivers/bcma/bcma_private.h
+++ b/drivers/bcma/bcma_private.h
@@ -47,9 +47,6 @@ int bcma_sprom_get(struct bcma_bus *bus);
 void bcma_core_chipcommon_early_init(struct bcma_drv_cc *cc);
 void bcma_core_chipcommon_init(struct bcma_drv_cc *cc);
 void bcma_chipco_bcm4331_ext_pa_lines_ctl(struct bcma_drv_cc *cc, bool enable);
-#ifdef CONFIG_BCMA_DRIVER_MIPS
-extern struct platform_device bcma_pflash_dev;
-#endif /* CONFIG_BCMA_DRIVER_MIPS */
 
 /* driver_chipcommon_b.c */
 int bcma_core_chipcommon_b_init(struct bcma_drv_cc_b *ccb);
@@ -61,6 +58,21 @@ void bcma_pmu_init(struct bcma_drv_cc *cc);
 u32 bcma_pmu_get_alp_clock(struct bcma_drv_cc *cc);
 u32 bcma_pmu_get_cpu_clock(struct bcma_drv_cc *cc);
 
+/**************************************************
+ * driver_chipcommon_pflash.c
+ **************************************************/
+
+#ifdef CONFIG_BCMA_PFLASH
+extern struct platform_device bcma_pflash_dev;
+int bcma_pflash_init(struct bcma_drv_cc *cc);
+#else
+static inline int bcma_pflash_init(struct bcma_drv_cc *cc)
+{
+	bcma_err(cc->core->bus, "Parallel flash not supported\n");
+	return 0;
+}
+#endif /* CONFIG_BCMA_PFLASH */
+
 #ifdef CONFIG_BCMA_SFLASH
 /* driver_chipcommon_sflash.c */
 int bcma_sflash_init(struct bcma_drv_cc *cc);
diff --git a/drivers/bcma/driver_chipcommon_pflash.c b/drivers/bcma/driver_chipcommon_pflash.c
new file mode 100644
index 000000000000..3b497c9ee0d4
--- /dev/null
+++ b/drivers/bcma/driver_chipcommon_pflash.c
@@ -0,0 +1,49 @@
+/*
+ * Broadcom specific AMBA
+ * ChipCommon parallel flash
+ *
+ * Licensed under the GNU/GPL. See COPYING for details.
+ */
+
+#include "bcma_private.h"
+
+#include <linux/bcma/bcma.h>
+#include <linux/mtd/physmap.h>
+#include <linux/platform_device.h>
+
+static const char * const part_probes[] = { "bcm47xxpart", NULL };
+
+static struct physmap_flash_data bcma_pflash_data = {
+	.part_probe_types	= part_probes,
+};
+
+static struct resource bcma_pflash_resource = {
+	.name	= "bcma_pflash",
+	.flags  = IORESOURCE_MEM,
+};
+
+struct platform_device bcma_pflash_dev = {
+	.name		= "physmap-flash",
+	.dev		= {
+		.platform_data  = &bcma_pflash_data,
+	},
+	.resource	= &bcma_pflash_resource,
+	.num_resources	= 1,
+};
+
+int bcma_pflash_init(struct bcma_drv_cc *cc)
+{
+	struct bcma_pflash *pflash = &cc->pflash;
+
+	pflash->present = true;
+
+	if (!(bcma_read32(cc->core, BCMA_CC_FLASH_CFG) & BCMA_CC_FLASH_CFG_DS))
+		bcma_pflash_data.width = 1;
+	else
+		bcma_pflash_data.width = 2;
+
+	bcma_pflash_resource.start = BCMA_SOC_FLASH2;
+	bcma_pflash_resource.end = BCMA_SOC_FLASH2 + BCMA_SOC_FLASH2_SZ;
+
+	return 0;
+}
diff --git a/drivers/bcma/driver_mips.c b/drivers/bcma/driver_mips.c
index 20c134c016dc..967b0e85e2cc 100644
--- a/drivers/bcma/driver_mips.c
+++ b/drivers/bcma/driver_mips.c
@@ -14,8 +14,6 @@
 
 #include <linux/bcma/bcma.h>
 
-#include <linux/mtd/physmap.h>
-#include <linux/platform_device.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
 #include <linux/serial_reg.h>
@@ -32,26 +30,6 @@ enum bcma_boot_dev {
 	BCMA_BOOT_DEV_NAND,
 };
 
-static const char * const part_probes[] = { "bcm47xxpart", NULL };
-
-static struct physmap_flash_data bcma_pflash_data = {
-	.part_probe_types	= part_probes,
-};
-
-static struct resource bcma_pflash_resource = {
-	.name	= "bcma_pflash",
-	.flags  = IORESOURCE_MEM,
-};
-
-struct platform_device bcma_pflash_dev = {
-	.name		= "physmap-flash",
-	.dev		= {
-		.platform_data  = &bcma_pflash_data,
-	},
-	.resource	= &bcma_pflash_resource,
-	.num_resources	= 1,
-};
-
 /* The 47162a0 hangs when reading MIPS DMP registers registers */
 static inline bool bcma_core_mips_bcm47162a0_quirk(struct bcma_device *dev)
 {
@@ -276,7 +254,6 @@ static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore)
 {
 	struct bcma_bus *bus = mcore->core->bus;
 	struct bcma_drv_cc *cc = &bus->drv_cc;
-	struct bcma_pflash *pflash = &cc->pflash;
 	enum bcma_boot_dev boot_dev;
 
 	switch (cc->capabilities & BCMA_CC_CAP_FLASHT) {
@@ -287,17 +264,7 @@ static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore)
 		break;
 	case BCMA_CC_FLASHT_PARA:
 		bcma_debug(bus, "Found parallel flash\n");
-		pflash->present = true;
-
-		if ((bcma_read32(cc->core, BCMA_CC_FLASH_CFG) &
-		     BCMA_CC_FLASH_CFG_DS) == 0)
-			bcma_pflash_data.width = 1;
-		else
-			bcma_pflash_data.width = 2;
-
-		bcma_pflash_resource.start = BCMA_SOC_FLASH2;
-		bcma_pflash_resource.end = BCMA_SOC_FLASH2 + BCMA_SOC_FLASH2_SZ;
-
+		bcma_pflash_init(cc);
 		break;
 	default:
 		bcma_err(bus, "Flash type not supported\n");
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index c466f752b067..786be8fed39e 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -350,7 +350,7 @@ static int bcma_register_devices(struct bcma_bus *bus)
 		bcma_register_core(bus, core);
 	}
 
-#ifdef CONFIG_BCMA_DRIVER_MIPS
+#ifdef CONFIG_BCMA_PFLASH
 	if (bus->drv_cc.pflash.present) {
 		err = platform_device_register(&bcma_pflash_dev);
 		if (err)
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 16eaaad9dda5..846513c73606 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -576,10 +576,11 @@ struct bcma_chipcommon_pmu {
 	u32 crystalfreq;	/* The active crystal frequency (in kHz) */
 };
 
-#ifdef CONFIG_BCMA_DRIVER_MIPS
+#ifdef CONFIG_BCMA_PFLASH
 struct bcma_pflash {
 	bool present;
 };
+#endif
 
 #ifdef CONFIG_BCMA_SFLASH
 struct mtd_info;
@@ -603,6 +604,7 @@ struct bcma_nflash {
 };
 #endif
 
+#ifdef CONFIG_BCMA_DRIVER_MIPS
 struct bcma_serial_port {
 	void *regs;
 	unsigned long clockspeed;
@@ -622,8 +624,9 @@ struct bcma_drv_cc {
 	/* Fast Powerup Delay constant */
 	u16 fast_pwrup_delay;
 	struct bcma_chipcommon_pmu pmu;
-#ifdef CONFIG_BCMA_DRIVER_MIPS
+#ifdef CONFIG_BCMA_PFLASH
 	struct bcma_pflash pflash;
+#endif
 #ifdef CONFIG_BCMA_SFLASH
 	struct bcma_sflash sflash;
 #endif
@@ -631,6 +634,7 @@ struct bcma_drv_cc {
 	struct bcma_nflash nflash;
 #endif
 
+#ifdef CONFIG_BCMA_DRIVER_MIPS
 	int nr_serial_ports;
 	struct bcma_serial_port serial_ports[4];
 #endif /* CONFIG_BCMA_DRIVER_MIPS */
-- 
cgit v1.2.3


From 088c86183012495b53ecc1c734909e5712a40b66 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@qlogic.com>
Date: Fri, 4 Mar 2016 12:35:05 -0500
Subject: qed/qede: Add infrastructure support for hardware GRO

This patch mainly adds the structures and API prototype changes
needed for the qede slowpath/fastpath to support hardware GRO
(no functional GRO handling is added yet).

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h    | 14 ++++-
 drivers/net/ethernet/qlogic/qed/qed_l2.c     | 81 +++++++++++++++++++---------
 drivers/net/ethernet/qlogic/qede/qede_main.c | 17 +++---
 include/linux/qed/qed_eth_if.h               | 12 +++--
 4 files changed, 88 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 592e0e6d9b42..236db8a99ec3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -2919,7 +2919,19 @@ struct eth_vport_rx_mode {
 };
 
 struct eth_vport_tpa_param {
-	u64     reserved[2];
+	u8	tpa_ipv4_en_flg;
+	u8	tpa_ipv6_en_flg;
+	u8	tpa_ipv4_tunn_en_flg;
+	u8	tpa_ipv6_tunn_en_flg;
+	u8	tpa_pkt_split_flg;
+	u8	tpa_hdr_data_split_flg;
+	u8	tpa_gro_consistent_flg;
+	u8	tpa_max_aggs_num;
+	u16	tpa_max_size;
+	u16	tpa_min_size_to_start;
+	u16	tpa_min_size_to_cont;
+	u8	max_buff_num;
+	u8	reserved;
 };
 
 struct eth_vport_tx_mode {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 102ddc73b841..3f35c6ca9252 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -132,16 +132,29 @@ struct qed_sp_vport_update_params {
 	struct qed_filter_accept_flags	accept_flags;
 };
 
+enum qed_tpa_mode {
+	QED_TPA_MODE_NONE,
+	QED_TPA_MODE_UNUSED,
+	QED_TPA_MODE_GRO,
+	QED_TPA_MODE_MAX
+};
+
+struct qed_sp_vport_start_params {
+	enum qed_tpa_mode	tpa_mode;
+	bool			remove_inner_vlan;
+	bool			drop_ttl0;
+	u8			max_buffers_per_cqe;
+	u32			concrete_fid;
+	u16			opaque_fid;
+	u8			vport_id;
+	u16			mtu;
+};
+
 #define QED_MAX_SGES_NUM 16
 #define CRC32_POLY 0x1edc6f41
 
 static int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
-			      u32 concrete_fid,
-			      u16 opaque_fid,
-			      u8 vport_id,
-			      u16 mtu,
-			      u8 drop_ttl0_flg,
-			      u8 inner_vlan_removal_en_flg)
+			      struct qed_sp_vport_start_params *p_params)
 {
 	struct vport_start_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent =  NULL;
@@ -150,13 +163,13 @@ static int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 	u16 rx_mode = 0;
 	u8 abs_vport_id = 0;
 
-	rc = qed_fw_vport(p_hwfn, vport_id, &abs_vport_id);
+	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
 	if (rc != 0)
 		return rc;
 
 	memset(&init_data, 0, sizeof(init_data));
 	init_data.cid = qed_spq_get_cid(p_hwfn);
-	init_data.opaque_fid = opaque_fid;
+	init_data.opaque_fid = p_params->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -168,9 +181,9 @@ static int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 	p_ramrod		= &p_ent->ramrod.vport_start;
 	p_ramrod->vport_id	= abs_vport_id;
 
-	p_ramrod->mtu			= cpu_to_le16(mtu);
-	p_ramrod->inner_vlan_removal_en = inner_vlan_removal_en_flg;
-	p_ramrod->drop_ttl0_en		= drop_ttl0_flg;
+	p_ramrod->mtu			= cpu_to_le16(p_params->mtu);
+	p_ramrod->inner_vlan_removal_en	= p_params->remove_inner_vlan;
+	p_ramrod->drop_ttl0_en		= p_params->drop_ttl0;
 
 	SET_FIELD(rx_mode, ETH_VPORT_RX_MODE_UCAST_DROP_ALL, 1);
 	SET_FIELD(rx_mode, ETH_VPORT_RX_MODE_MCAST_DROP_ALL, 1);
@@ -181,9 +194,26 @@ static int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 	memset(&p_ramrod->tpa_param, 0,
 	       sizeof(struct eth_vport_tpa_param));
 
+	p_ramrod->tpa_param.max_buff_num = p_params->max_buffers_per_cqe;
+
+	switch (p_params->tpa_mode) {
+	case QED_TPA_MODE_GRO:
+		p_ramrod->tpa_param.tpa_max_aggs_num = ETH_TPA_MAX_AGGS_NUM;
+		p_ramrod->tpa_param.tpa_max_size = (u16)-1;
+		p_ramrod->tpa_param.tpa_min_size_to_cont = p_params->mtu / 2;
+		p_ramrod->tpa_param.tpa_min_size_to_start = p_params->mtu / 2;
+		p_ramrod->tpa_param.tpa_ipv4_en_flg = 1;
+		p_ramrod->tpa_param.tpa_ipv6_en_flg = 1;
+		p_ramrod->tpa_param.tpa_pkt_split_flg = 1;
+		p_ramrod->tpa_param.tpa_gro_consistent_flg = 1;
+		break;
+	default:
+		break;
+	}
+
 	/* Software Function ID in hwfn (PFs are 0 - 15, VFs are 16 - 135) */
 	p_ramrod->sw_fid = qed_concrete_to_sw_fid(p_hwfn->cdev,
-						  concrete_fid);
+						  p_params->concrete_fid);
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
@@ -1592,24 +1622,25 @@ static void qed_register_eth_ops(struct qed_dev *cdev,
 }
 
 static int qed_start_vport(struct qed_dev *cdev,
-			   u8 vport_id,
-			   u16 mtu,
-			   u8 drop_ttl0_flg,
-			   u8 inner_vlan_removal_en_flg)
+			   struct qed_start_vport_params *params)
 {
 	int rc, i;
 
 	for_each_hwfn(cdev, i) {
+		struct qed_sp_vport_start_params start = { 0 };
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
-		rc = qed_sp_vport_start(p_hwfn,
-					p_hwfn->hw_info.concrete_fid,
-					p_hwfn->hw_info.opaque_fid,
-					vport_id,
-					mtu,
-					drop_ttl0_flg,
-					inner_vlan_removal_en_flg);
-
+		start.tpa_mode = params->gro_enable ? QED_TPA_MODE_GRO :
+							QED_TPA_MODE_NONE;
+		start.remove_inner_vlan = params->remove_inner_vlan;
+		start.drop_ttl0 = params->drop_ttl0;
+		start.opaque_fid = p_hwfn->hw_info.opaque_fid;
+		start.concrete_fid = p_hwfn->hw_info.concrete_fid;
+		start.vport_id = params->vport_id;
+		start.max_buffers_per_cqe = 16;
+		start.mtu = params->mtu;
+
+		rc = qed_sp_vport_start(p_hwfn, &start);
 		if (rc) {
 			DP_ERR(cdev, "Failed to start VPORT\n");
 			return rc;
@@ -1619,7 +1650,7 @@ static int qed_start_vport(struct qed_dev *cdev,
 
 		DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
 			   "Started V-PORT %d with MTU %d\n",
-			   vport_id, mtu);
+			   start.vport_id, start.mtu);
 	}
 
 	qed_reset_vport_stats(cdev);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index ddd9e4aaa500..f75f334af7bd 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2466,11 +2466,12 @@ static int qede_stop_queues(struct qede_dev *edev)
 static int qede_start_queues(struct qede_dev *edev)
 {
 	int rc, tc, i;
-	int vport_id = 0, drop_ttl0_flg = 1, vlan_removal_en = 1;
+	int vlan_removal_en = 1;
 	struct qed_dev *cdev = edev->cdev;
 	struct qed_update_vport_rss_params *rss_params = &edev->rss_params;
 	struct qed_update_vport_params vport_update_params;
 	struct qed_queue_start_common_params q_params;
+	struct qed_start_vport_params start = {0};
 
 	if (!edev->num_rss) {
 		DP_ERR(edev,
@@ -2478,10 +2479,12 @@ static int qede_start_queues(struct qede_dev *edev)
 		return -EINVAL;
 	}
 
-	rc = edev->ops->vport_start(cdev, vport_id,
-				    edev->ndev->mtu,
-				    drop_ttl0_flg,
-				    vlan_removal_en);
+	start.mtu = edev->ndev->mtu;
+	start.vport_id = 0;
+	start.drop_ttl0 = true;
+	start.remove_inner_vlan = vlan_removal_en;
+
+	rc = edev->ops->vport_start(cdev, &start);
 
 	if (rc) {
 		DP_ERR(edev, "Start V-PORT failed %d\n", rc);
@@ -2490,7 +2493,7 @@ static int qede_start_queues(struct qede_dev *edev)
 
 	DP_VERBOSE(edev, NETIF_MSG_IFUP,
 		   "Start vport ramrod passed, vport_id = %d, MTU = %d, vlan_removal_en = %d\n",
-		   vport_id, edev->ndev->mtu + 0xe, vlan_removal_en);
+		   start.vport_id, edev->ndev->mtu + 0xe, vlan_removal_en);
 
 	for_each_rss(i) {
 		struct qede_fastpath *fp = &edev->fp_array[i];
@@ -2555,7 +2558,7 @@ static int qede_start_queues(struct qede_dev *edev)
 
 	/* Prepare and send the vport enable */
 	memset(&vport_update_params, 0, sizeof(vport_update_params));
-	vport_update_params.vport_id = vport_id;
+	vport_update_params.vport_id = start.vport_id;
 	vport_update_params.update_vport_active_flg = 1;
 	vport_update_params.vport_active_flg = 1;
 
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e53b0ca49e41..e1d69834a11f 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -39,6 +39,14 @@ struct qed_update_vport_params {
 	struct qed_update_vport_rss_params rss_params;
 };
 
+struct qed_start_vport_params {
+	bool remove_inner_vlan;
+	bool gro_enable;
+	bool drop_ttl0;
+	u8 vport_id;
+	u16 mtu;
+};
+
 struct qed_stop_rxq_params {
 	u8 rss_id;
 	u8 rx_queue_id;
@@ -118,9 +126,7 @@ struct qed_eth_ops {
 			     void *cookie);
 
 	int (*vport_start)(struct qed_dev *cdev,
-			   u8 vport_id, u16 mtu,
-			   u8 drop_ttl0_flg,
-			   u8 inner_vlan_removal_en_flg);
+			   struct qed_start_vport_params *params);
 
 	int (*vport_stop)(struct qed_dev *cdev,
 			  u8 vport_id);
-- 
cgit v1.2.3


From 8050c0f0274a15841756968857cfb07b3ab809ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:02 +0100
Subject: bpf: allow bpf_csum_diff to feed bpf_l3_csum_replace as well

Commit 7d672345ed29 ("bpf: add generic bpf_csum_diff helper") added a
generic checksum diff helper that can feed bpf_l4_csum_replace() with
a target __wsum diff that is to be applied to the L4 checksum. This
facility is very flexible, can be cascaded, allows for adding, removing,
or diffing data, or for calculating the pseudo header checksum from
scratch, but it can also be reused for working with the IPv4 header
checksum.

Thus, analogous to bpf_l4_csum_replace(), add a case for header field
value of 0 to change the checksum at a given offset through a new helper
csum_replace_by_diff(). Also, in addition to that, this provides an
easy to use interface for feeding precalculated diffs f.e. coming from
a map. It nicely complements bpf_l3_csum_replace() that currently allows
only for csum updates of 2 and 4 byte diffs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h | 5 +++++
 net/core/filter.c      | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 10a16b5bd1c7..abffc64e7300 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum)
 
 #define CSUM_MANGLED_0 ((__force __sum16)0xffff)
 
+static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
+{
+	*sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+}
+
 static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
 {
 	__wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);
diff --git a/net/core/filter.c b/net/core/filter.c
index 69f4ffc0a282..356a251657a5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1447,6 +1447,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		csum_replace_by_diff(ptr, to);
+		break;
 	case 2:
 		csum_replace2(ptr, from, to);
 		break;
-- 
cgit v1.2.3


From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:03 +0100
Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash

When overwriting parts of the packet with bpf_skb_store_bytes() that
were fed previously into skb->hash calculation, we should clear the
current hash with skb_clear_hash(), so that a next skb_get_hash() call
can determine the correct hash related to this skb.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ee2193287cbe..2e3e90309904 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -305,6 +305,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_store_bytes flags. */
 #define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
 
 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
  * First 4 bits are for passing the header field size.
diff --git a/net/core/filter.c b/net/core/filter.c
index 356a251657a5..a1fe246a6147 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
 		return -EINVAL;
 
 	/* bpf verifier guarantees that:
@@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 
 	if (flags & BPF_F_RECOMPUTE_CSUM)
 		skb_postpush_rcsum(skb, ptr, len);
+	if (flags & BPF_F_INVALIDATE_HASH)
+		skb_clear_hash(skb);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:05 +0100
Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key

Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to
feed df flag into tunneling facilities (currently supported on TX by
vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key()
helper.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e3e90309904..21ee6d52016f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -330,6 +330,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
diff --git a/net/core/filter.c b/net/core/filter.c
index ce4e18dd2c89..6c9d15561d04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	u8 compat[sizeof(struct bpf_tunnel_key)];
 	struct ip_tunnel_info *info;
 
-	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+			       BPF_F_DONT_FRAGMENT)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info->mode = IP_TUNNEL_INFO_TX;
 
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	if (flags & BPF_F_DONT_FRAGMENT)
+		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
 	info->key.ttl = from->tunnel_ttl;
-- 
cgit v1.2.3


From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:06 +0100
Subject: bpf: support for access to tunnel options

After eBPF being able to programmatically access/manage tunnel key meta
data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata")
and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6
for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary
helpers to generically access their auxiliary tunnel options.

Geneve and vxlan support this facility. For geneve, TLVs can be pushed,
and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve
case only makes sense, if we can also read/write TLVs into it. In the GBP
case, it provides the flexibility to easily map the group policy ID in
combination with other helpers or maps.

I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(),
for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather
complex by itself, and there may be cases for tunnel key backends where
tunnel options are not always needed. If we would have integrated this
into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with
remaining helper arguments, so keeping compatibility on structs in case of
passing in a flat buffer gets more cumbersome. Separating both also allows
for more flexibility and future extensibility, f.e. options could be fed
directly from a map, etc.

Moreover, change geneve's xmit path to test only for info->options_len
instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's
xmit path and allows for avoiding to specify a protocol flag in the API on
xmit, so it can be protocol agnostic. Having info->options_len is enough
information that is needed. Tested with vxlan and geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  4 +--
 include/uapi/linux/bpf.h | 11 +++++++
 net/core/filter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index bc5da357e16d..36db4cf0579c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
@@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 21ee6d52016f..9221f653fee3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -298,6 +298,17 @@ enum bpf_func_id {
 	 * Return: csum result
 	 */
 	BPF_FUNC_csum_diff,
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_opt(skb, opt, size)
+	 * retrieve or populate tunnel options metadata
+	 * @skb: pointer to skb
+	 * @opt: pointer to raw tunnel option data
+	 * @size: size of @opt
+	 * Return: 0 on success for set, option size for get
+	 */
+	BPF_FUNC_skb_get_tunnel_opt,
+	BPF_FUNC_skb_set_tunnel_opt,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6c9d15561d04..012a10c2da94 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *to = (u8 *) (long) r2;
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info ||
+		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+		return -ENOENT;
+	if (unlikely(size < info->options_len))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_get(to, info);
+
+	return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+	.func		= bpf_skb_get_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
 static struct metadata_dst __percpu *md_dst;
 
 static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+#define BPF_TUNLEN_MAX	255
+
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *from = (u8 *) (long) r2;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+		return -EINVAL;
+	if (unlikely(size > BPF_TUNLEN_MAX))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_set(info, from, size);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+	.func		= bpf_skb_set_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		/* race is not possible, since it's called from
-		 * verifier that is holding verifier mutex
+		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
+					  options_len) != 1);
+
+		/* Race is not possible, since it's called from verifier
+		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
 	}
-	return &bpf_skb_set_tunnel_key_proto;
+
+	switch (which) {
+	case BPF_FUNC_skb_set_tunnel_key:
+		return &bpf_skb_set_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return &bpf_skb_set_tunnel_opt_proto;
+	default:
+		return NULL;
+	}
 }
 
 static const struct bpf_func_proto *
@@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
-		return bpf_get_skb_set_tunnel_key_proto();
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
 	case BPF_FUNC_redirect:
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
-- 
cgit v1.2.3


From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:07 +0100
Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit

The assumptions from commits 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used are unfortunately not always valid.

While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.

Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.

Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.

Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  6 ++----
 drivers/net/vxlan.c      | 24 ++++++++++++------------
 include/net/ip_tunnels.h | 15 +++++++++++++++
 net/core/filter.c        |  2 +-
 net/ipv4/ip_gre.c        | 10 ++++++----
 5 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 36db4cf0579c..6a0cbbe03e5d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -775,10 +775,10 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 				       struct flowi4 *fl4,
 				       struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct dst_cache *dst_cache;
 	struct rtable *rt = NULL;
-	bool use_cache = true;
 	__u8 tos;
 
 	memset(fl4, 0, sizeof(*fl4));
@@ -804,7 +804,6 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 		dst_cache = &geneve->dst_cache;
 	}
 
-	use_cache = use_cache && !skb->mark;
 	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
 		if (rt)
@@ -832,11 +831,11 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 					   struct flowi6 *fl6,
 					   struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct geneve_sock *gs6 = geneve->sock6;
 	struct dst_entry *dst = NULL;
 	struct dst_cache *dst_cache;
-	bool use_cache = true;
 	__u8 prio;
 
 	memset(fl6, 0, sizeof(*fl6));
@@ -862,7 +861,6 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		dst_cache = &geneve->dst_cache;
 	}
 
-	use_cache = use_cache && !skb->mark;
 	if (use_cache) {
 		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
 		if (dst)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index fc998a3bd234..7294a459b13c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1756,17 +1756,15 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 				      struct sk_buff *skb, int oif, u8 tos,
 				      __be32 daddr, __be32 *saddr,
 				      struct dst_cache *dst_cache,
-				      struct ip_tunnel_info *info)
+				      const struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct rtable *rt = NULL;
-	bool use_cache = false;
 	struct flowi4 fl4;
 
-	/* when the ip_tunnel_info is availble, the tos used for lookup is
-	 * packet independent, so we can use the cache
-	 */
-	if (!skb->mark && (!tos || info)) {
-		use_cache = true;
+	if (tos && !info)
+		use_cache = false;
+	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, saddr);
 		if (rt)
 			return rt;
@@ -1794,13 +1792,15 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif,
 					  const struct in6_addr *daddr,
 					  struct in6_addr *saddr,
-					  struct dst_cache *dst_cache)
+					  struct dst_cache *dst_cache,
+					  const struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct dst_entry *ndst;
 	struct flowi6 fl6;
 	int err;
 
-	if (!skb->mark) {
+	if (use_cache) {
 		ndst = dst_cache_get_ip6(dst_cache, saddr);
 		if (ndst)
 			return ndst;
@@ -1820,7 +1820,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 		return ERR_PTR(err);
 
 	*saddr = fl6.saddr;
-	if (!skb->mark)
+	if (use_cache)
 		dst_cache_set_ip6(dst_cache, ndst, saddr);
 	return ndst;
 }
@@ -2018,7 +2018,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0,
 					&dst->sin6.sin6_addr, &saddr,
-					dst_cache);
+					dst_cache, info);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
@@ -2387,7 +2387,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0,
 					&info->key.u.ipv6.dst,
-					&info->key.u.ipv6.src, NULL);
+					&info->key.u.ipv6.src, NULL, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
 		dst_release(ndst);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5f28b606633e..e1395d70fb48 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -140,6 +140,7 @@ struct ip_tunnel {
 #define TUNNEL_CRIT_OPT		__cpu_to_be16(0x0400)
 #define TUNNEL_GENEVE_OPT	__cpu_to_be16(0x0800)
 #define TUNNEL_VXLAN_OPT	__cpu_to_be16(0x1000)
+#define TUNNEL_NOCACHE		__cpu_to_be16(0x2000)
 
 #define TUNNEL_OPTIONS_PRESENT	(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
 
@@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 		       0, sizeof(*key) - IP_TUNNEL_KEY_SIZE);
 }
 
+static inline bool
+ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
+			   const struct ip_tunnel_info *info)
+{
+	if (skb->mark)
+		return false;
+	if (!info)
+		return true;
+	if (info->key.tun_flags & TUNNEL_NOCACHE)
+		return false;
+
+	return true;
+}
+
 static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
 					       *tun_info)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 012a10c2da94..a66dc03c261f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1870,7 +1870,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info = &md->u.tun_info;
 	info->mode = IP_TUNNEL_INFO_TX;
 
-	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
 	if (flags & BPF_F_DONT_FRAGMENT)
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 202437d6087b..31936d387cfd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
+	struct rtable *rt = NULL;
 	struct flowi4 fl;
-	struct rtable *rt;
 	int min_headroom;
 	int tunnel_hlen;
 	__be16 df, flags;
+	bool use_cache;
 	int err;
 
 	tun_info = skb_tunnel_info(skb);
@@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) :
-			 NULL;
+	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+	if (use_cache)
+		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
 	if (!rt) {
 		rt = gre_get_rt(skb, dev, &fl, key);
 		if (IS_ERR(rt))
 				goto err_free_skb;
-		if (!skb->mark)
+		if (use_cache)
 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 					  fl.saddr);
 	}
-- 
cgit v1.2.3


From 9a03cd8f38efb83c13fbe62aff50eea4efff93da Mon Sep 17 00:00:00 2001
From: Michal Kubeček <mkubecek@suse.cz>
Date: Tue, 8 Mar 2016 14:44:35 +0100
Subject: ipv6: per netns fib6 walkers

The IPv6 FIB data structures are separated per network namespace but
there is still only one global walkers list and one global walker list
lock. This means changes in one namespace unnecessarily interfere with
walkers in other namespaces.

Replace the global list with per-netns lists (and give each its own
lock).

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h |  2 ++
 net/ipv6/ip6_fib.c       | 68 +++++++++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index c0368db6df54..f0109b973648 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -58,7 +58,9 @@ struct netns_ipv6 {
 	struct timer_list       ip6_fib_timer;
 	struct hlist_head       *fib_table_hash;
 	struct fib6_table       *fib6_main_tbl;
+	struct list_head	fib6_walkers;
 	struct dst_ops		ip6_dst_ops;
+	rwlock_t		fib6_walker_lock;
 	unsigned int		 ip6_rt_gc_expire;
 	unsigned long		 ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d7c715accac9..883f2836beab 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -55,8 +55,6 @@ struct fib6_cleaner {
 	void *arg;
 };
 
-static DEFINE_RWLOCK(fib6_walker_lock);
-
 #ifdef CONFIG_IPV6_SUBTREES
 #define FWS_INIT FWS_S
 #else
@@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock);
 static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk(struct net *net, struct fib6_walker *w);
 static int fib6_walk_continue(struct fib6_walker *w);
 
 /*
@@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w);
 
 static void fib6_gc_timer_cb(unsigned long arg);
 
-static LIST_HEAD(fib6_walkers);
-#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+#define FOR_WALKERS(net, w) \
+	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
 
-static void fib6_walker_link(struct fib6_walker *w)
+static void fib6_walker_link(struct net *net, struct fib6_walker *w)
 {
-	write_lock_bh(&fib6_walker_lock);
-	list_add(&w->lh, &fib6_walkers);
-	write_unlock_bh(&fib6_walker_lock);
+	write_lock_bh(&net->ipv6.fib6_walker_lock);
+	list_add(&w->lh, &net->ipv6.fib6_walkers);
+	write_unlock_bh(&net->ipv6.fib6_walker_lock);
 }
 
-static void fib6_walker_unlink(struct fib6_walker *w)
+static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
 {
-	write_lock_bh(&fib6_walker_lock);
+	write_lock_bh(&net->ipv6.fib6_walker_lock);
 	list_del(&w->lh);
-	write_unlock_bh(&fib6_walker_lock);
+	write_unlock_bh(&net->ipv6.fib6_walker_lock);
 }
 
 static int fib6_new_sernum(struct net *net)
@@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w)
 
 static void fib6_dump_end(struct netlink_callback *cb)
 {
+	struct net *net = sock_net(cb->skb->sk);
 	struct fib6_walker *w = (void *)cb->args[2];
 
 	if (w) {
 		if (cb->args[4]) {
 			cb->args[4] = 0;
-			fib6_walker_unlink(w);
+			fib6_walker_unlink(net, w);
 		}
 		cb->args[2] = 0;
 		kfree(w);
@@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb)
 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct fib6_walker *w;
 	int res;
 
@@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 		w->skip = 0;
 
 		read_lock_bh(&table->tb6_lock);
-		res = fib6_walk(w);
+		res = fib6_walk(net, w);
 		read_unlock_bh(&table->tb6_lock);
 		if (res > 0) {
 			cb->args[4] = 1;
@@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 		res = fib6_walk_continue(w);
 		read_unlock_bh(&table->tb6_lock);
 		if (res <= 0) {
-			fib6_walker_unlink(w);
+			fib6_walker_unlink(net, w);
 			cb->args[4] = 0;
 		}
 	}
@@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 		}
 #endif
 
-		read_lock(&fib6_walker_lock);
-		FOR_WALKERS(w) {
+		read_lock(&net->ipv6.fib6_walker_lock);
+		FOR_WALKERS(net, w) {
 			if (!child) {
 				if (w->root == fn) {
 					w->root = w->node = NULL;
@@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 				}
 			}
 		}
-		read_unlock(&fib6_walker_lock);
+		read_unlock(&net->ipv6.fib6_walker_lock);
 
 		node_free(fn);
 		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
@@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	}
 
 	/* Adjust walkers */
-	read_lock(&fib6_walker_lock);
-	FOR_WALKERS(w) {
+	read_lock(&net->ipv6.fib6_walker_lock);
+	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
 			w->leaf = rt->dst.rt6_next;
@@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 				w->state = FWS_U;
 		}
 	}
-	read_unlock(&fib6_walker_lock);
+	read_unlock(&net->ipv6.fib6_walker_lock);
 
 	rt->dst.rt6_next = NULL;
 
@@ -1588,17 +1588,17 @@ skip:
 	}
 }
 
-static int fib6_walk(struct fib6_walker *w)
+static int fib6_walk(struct net *net, struct fib6_walker *w)
 {
 	int res;
 
 	w->state = FWS_INIT;
 	w->node = w->root;
 
-	fib6_walker_link(w);
+	fib6_walker_link(net, w);
 	res = fib6_walk_continue(w);
 	if (res <= 0)
-		fib6_walker_unlink(w);
+		fib6_walker_unlink(net, w);
 	return res;
 }
 
@@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.arg = arg;
 	c.net = net;
 
-	fib6_walk(&c.w);
+	fib6_walk(net, &c.w);
 }
 
 static void __fib6_clean_all(struct net *net,
@@ -1816,6 +1816,8 @@ static int __net_init fib6_net_init(struct net *net)
 {
 	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 
+	rwlock_init(&net->ipv6.fib6_walker_lock);
+	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
 
 	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
@@ -1976,7 +1978,8 @@ static int ipv6_route_yield(struct fib6_walker *w)
 	return 0;
 }
 
-static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
+				      struct net *net)
 {
 	memset(&iter->w, 0, sizeof(iter->w));
 	iter->w.func = ipv6_route_yield;
@@ -1986,7 +1989,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
 	iter->w.args = iter;
 	iter->sernum = iter->w.root->fn_sernum;
 	INIT_LIST_HEAD(&iter->w.lh);
-	fib6_walker_link(&iter->w);
+	fib6_walker_link(net, &iter->w);
 }
 
 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
@@ -2047,16 +2050,16 @@ iter_table:
 			++*pos;
 		return iter->w.leaf;
 	} else if (r < 0) {
-		fib6_walker_unlink(&iter->w);
+		fib6_walker_unlink(net, &iter->w);
 		return NULL;
 	}
-	fib6_walker_unlink(&iter->w);
+	fib6_walker_unlink(net, &iter->w);
 
 	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
 	if (!iter->tbl)
 		return NULL;
 
-	ipv6_route_seq_setup_walk(iter);
+	ipv6_route_seq_setup_walk(iter, net);
 	goto iter_table;
 }
 
@@ -2071,7 +2074,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
 	iter->skip = *pos;
 
 	if (iter->tbl) {
-		ipv6_route_seq_setup_walk(iter);
+		ipv6_route_seq_setup_walk(iter, net);
 		return ipv6_route_seq_next(seq, NULL, pos);
 	} else {
 		return NULL;
@@ -2087,10 +2090,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
 	__releases(RCU_BH)
 {
+	struct net *net = seq_file_net(seq);
 	struct ipv6_route_iter *iter = seq->private;
 
 	if (ipv6_route_iter_active(iter))
-		fib6_walker_unlink(&iter->w);
+		fib6_walker_unlink(net, &iter->w);
 
 	rcu_read_unlock_bh();
 }
-- 
cgit v1.2.3


From 3dc94f93be161ec4203673de9a34b7362d8985b5 Mon Sep 17 00:00:00 2001
From: Michal Kubeček <mkubecek@suse.cz>
Date: Tue, 8 Mar 2016 14:44:45 +0100
Subject: ipv6: per netns FIB garbage collection

One of our customers observed issues with FIB6 garbage collectors
running in different network namespaces blocking each other, resulting
in soft lockups (fib6_run_gc() initiated from timer runs always in
forced mode).

Now that FIB6 walkers are separated per namespace, there is no more need
for instances of fib6_run_gc() in different namespaces blocking each
other. There is still a call to icmp6_dst_gc() which operates on shared
data but this function is protected by its own shared lock.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 net/ipv6/ip6_fib.c       | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index f0109b973648..10d0848f5b8a 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -61,6 +61,7 @@ struct netns_ipv6 {
 	struct list_head	fib6_walkers;
 	struct dst_ops		ip6_dst_ops;
 	rwlock_t		fib6_walker_lock;
+	spinlock_t		fib6_gc_lock;
 	unsigned int		 ip6_rt_gc_expire;
 	unsigned long		 ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 883f2836beab..ea071fad67a0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1776,16 +1776,14 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 	return 0;
 }
 
-static DEFINE_SPINLOCK(fib6_gc_lock);
-
 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 {
 	struct fib6_gc_args gc_args;
 	unsigned long now;
 
 	if (force) {
-		spin_lock_bh(&fib6_gc_lock);
-	} else if (!spin_trylock_bh(&fib6_gc_lock)) {
+		spin_lock_bh(&net->ipv6.fib6_gc_lock);
+	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
 		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
 		return;
 	}
@@ -1804,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 					+ net->ipv6.sysctl.ip6_rt_gc_interval));
 	else
 		del_timer(&net->ipv6.ip6_fib_timer);
-	spin_unlock_bh(&fib6_gc_lock);
+	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
 }
 
 static void fib6_gc_timer_cb(unsigned long arg)
@@ -1816,6 +1814,7 @@ static int __net_init fib6_net_init(struct net *net)
 {
 	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 
+	spin_lock_init(&net->ipv6.fib6_gc_lock);
 	rwlock_init(&net->ipv6.fib6_walker_lock);
 	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
-- 
cgit v1.2.3


From b121d1e74d1f24654bdc3165d3db1ca149501356 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Mar 2016 21:57:13 -0800
Subject: bpf: prevent kprobe+bpf deadlocks

if kprobe is placed within update or delete hash map helpers
that hold bucket spin lock and triggered bpf program is trying to
grab the spinlock for the same bucket on the same cpu, it will
deadlock.
Fix it by extending existing recursion prevention mechanism.

Note, map_lookup and other tracing helpers don't have this problem,
since they don't hold any locks and don't modify global data.
bpf_trace_printk has its own recursive check and ok as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  3 +++
 kernel/bpf/syscall.c     | 13 +++++++++++++
 kernel/trace/bpf_trace.c |  2 --
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 51e498e5470e..4b070827200d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
 #include <linux/file.h>
+#include <linux/percpu.h>
 
 struct bpf_map;
 
@@ -163,6 +164,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 #ifdef CONFIG_BPF_SYSCALL
+DECLARE_PER_CPU(int, bpf_prog_active);
+
 void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c95a753c2007..dc99f6a000f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 
+DEFINE_PER_CPU(int, bpf_prog_active);
+
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
 static LIST_HEAD(bpf_map_types);
@@ -347,6 +349,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
@@ -356,6 +363,8 @@ static int map_update_elem(union bpf_attr *attr)
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		rcu_read_unlock();
 	}
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 
 free_value:
 	kfree(value);
@@ -394,9 +403,13 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
 	err = map->ops->map_delete_elem(map, key);
 	rcu_read_unlock();
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 
 free_key:
 	kfree(key);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4b8caa392b86..3e4ffb3ace5f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,8 +13,6 @@
 #include <linux/ctype.h>
 #include "trace.h"
 
-static DEFINE_PER_CPU(int, bpf_prog_active);
-
 /**
  * trace_call_bpf - invoke BPF program
  * @prog: BPF program
-- 
cgit v1.2.3


From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Mar 2016 21:57:15 -0800
Subject: bpf: pre-allocate hash map elements

If kprobe is placed on spin_unlock then calling kmalloc/kfree from
bpf programs is not safe, since the following deadlock is possible:
kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe->
bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock)
and deadlocks.

The following solutions were considered and some implemented, but
eventually discarded
- kmem_cache_create for every map
- add recursion check to slow-path of slub
- use reserved memory in bpf_map_update for in_irq or in preempt_disabled
- kmalloc via irq_work

At the end pre-allocation of all map elements turned out to be the simplest
solution and since the user is charged upfront for all the memory, such
pre-allocation doesn't affect the user space visible behavior.

Since it's impossible to tell whether kprobe is triggered in a safe
location from kmalloc point of view, use pre-allocation by default
and introduce new BPF_F_NO_PREALLOC flag.

While testing of per-cpu hash maps it was discovered
that alloc_percpu(GFP_ATOMIC) has odd corner cases and often
fails to allocate memory even when 90% of it is free.
The pre-allocation of per-cpu hash elements solves this problem as well.

Turned out that bpf_map_update() quickly followed by
bpf_map_lookup()+bpf_map_delete() is very common pattern used
in many of iovisor/bcc/tools, so there is additional benefit of
pre-allocation, since such use cases are much faster.

Since all hash map elements are now pre-allocated we can remove
atomic increment of htab->count and save few more cycles.

Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid
large malloc/free done by users who don't have sufficient limits.

Pre-allocation is done with vmalloc and alloc/free is done
via percpu_freelist. Here are performance numbers for different
pre-allocation algorithms that were implemented, but discarded
in favor of percpu_freelist:

1 cpu:
pcpu_ida	2.1M
pcpu_ida nolock	2.3M
bt		2.4M
kmalloc		1.8M
hlist+spinlock	2.3M
pcpu_freelist	2.6M

4 cpu:
pcpu_ida	1.5M
pcpu_ida nolock	1.8M
bt w/smp_align	1.7M
bt no/smp_align	1.1M
kmalloc		0.7M
hlist+spinlock	0.2M
pcpu_freelist	2.0M

8 cpu:
pcpu_ida	0.7M
bt w/smp_align	0.8M
kmalloc		0.4M
pcpu_freelist	1.5M

32 cpu:
kmalloc		0.13M
pcpu_freelist	0.49M

pcpu_ida nolock is a modified percpu_ida algorithm without
percpu_ida_cpu locks and without cross-cpu tag stealing.
It's faster than existing percpu_ida, but not as fast as pcpu_freelist.

bt is a variant of block/blk-mq-tag.c simplified and customized
for bpf use case. bt w/smp_align is using cache line for every 'long'
(similar to blk-mq-tag). bt no/smp_align allocates 'long'
bitmasks continuously to save memory. It's comparable to percpu_ida
and in some cases faster, but slower than percpu_freelist

hlist+spinlock is the simplest free list with single spinlock.
As expected, it has very bad scaling in SMP.

kmalloc is existing implementation which is still available via
BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and
in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist,
but saves memory, so in cases where map->max_entries can be large
and number of map update/delete per second is low, it may make
sense to use it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   2 +
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/hashtab.c     | 240 +++++++++++++++++++++++++++++++++--------------
 kernel/bpf/syscall.c     |  15 ++-
 4 files changed, 186 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4b070827200d..efd1d4ca95c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@ struct bpf_map {
 	u32 key_size;
 	u32 value_size;
 	u32 max_entries;
+	u32 map_flags;
 	u32 pages;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
@@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
+int bpf_map_precharge_memlock(u32 pages);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9221f653fee3..0e30b19012a5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,12 +101,15 @@ enum bpf_prog_type {
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
 
+#define BPF_F_NO_PREALLOC	(1U << 0)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
 		__u32	key_size;	/* size of key in bytes */
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
+		__u32	map_flags;	/* prealloc or not */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a68e95133fcd..fff3650d52fc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -13,6 +14,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
+#include "percpu_freelist.h"
 
 struct bucket {
 	struct hlist_head head;
@@ -22,6 +24,8 @@ struct bucket {
 struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
+	void *elems;
+	struct pcpu_freelist freelist;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -29,15 +33,86 @@ struct bpf_htab {
 
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
-	struct hlist_node hash_node;
-	struct rcu_head rcu;
 	union {
-		u32 hash;
-		u32 key_size;
+		struct hlist_node hash_node;
+		struct bpf_htab *htab;
+		struct pcpu_freelist_node fnode;
 	};
+	struct rcu_head rcu;
+	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+	int i;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto free_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		void __percpu *pptr;
+
+		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+					 htab->map.key_size);
+		free_percpu(pptr);
+	}
+free_elems:
+	vfree(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+	int err = -ENOMEM, i;
+
+	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+	if (!htab->elems)
+		return -ENOMEM;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto skip_percpu_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		u32 size = round_up(htab->map.value_size, 8);
+		void __percpu *pptr;
+
+		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+		if (!pptr)
+			goto free_elems;
+		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+				  pptr);
+	}
+
+skip_percpu_elems:
+	err = pcpu_freelist_init(&htab->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
+			       htab->map.max_entries);
+	return 0;
+
+free_elems:
+	htab_free_elems(htab);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	int err, i;
 	u64 cost;
 
+	if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+		/* reserved bits should not be used */
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
+	htab->map.map_flags = attr->map_flags;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		htab->elem_size += sizeof(void *);
 	else
-		htab->elem_size += htab->map.value_size;
+		htab->elem_size += round_up(htab->map.value_size, 8);
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
 				      GFP_USER | __GFP_NOWARN);
@@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	atomic_set(&htab->count, 0);
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+		err = prealloc_elems_and_freelist(htab);
+		if (err)
+			goto free_buckets;
+	}
 
 	return &htab->map;
 
+free_buckets:
+	kvfree(htab->buckets);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
@@ -249,42 +340,42 @@ find_first_elem:
 		}
 	}
 
-	/* itereated over all buckets and all elements */
+	/* iterated over all buckets and all elements */
 	return -ENOENT;
 }
 
-
-static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
-				     void __percpu *pptr)
-{
-	*(void __percpu **)(l->key + key_size) = pptr;
-}
-
-static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
-{
-	return *(void __percpu **)(l->key + key_size);
-}
-
-static void htab_percpu_elem_free(struct htab_elem *l)
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
-	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
+
 }
 
-static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+static void htab_elem_free_rcu(struct rcu_head *head)
 {
 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+	struct bpf_htab *htab = l->htab;
 
-	htab_percpu_elem_free(l);
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+	 * we're calling kfree, otherwise deadlock is possible if kprobes
+	 * are placed somewhere inside of slub
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	htab_elem_free(htab, l);
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 }
 
-static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (percpu) {
-		l->key_size = key_size;
-		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
-		kfree_rcu(l, rcu);
+		atomic_dec(&htab->count);
+		l->htab = htab;
+		call_rcu(&l->rcu, htab_elem_free_rcu);
 	}
 }
 
@@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 bool percpu, bool onallcpus)
 {
 	u32 size = htab->map.value_size;
+	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
 
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return NULL;
+	if (prealloc) {
+		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
+		if (!l_new)
+			return ERR_PTR(-E2BIG);
+	} else {
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
 		/* round up value_size to 8 bytes */
 		size = round_up(size, 8);
 
-		/* alloc_percpu zero-fills */
-		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
-		if (!pptr) {
-			kfree(l_new);
-			return NULL;
+		if (prealloc) {
+			pptr = htab_elem_get_ptr(l_new, key_size);
+		} else {
+			/* alloc_percpu zero-fills */
+			pptr = __alloc_percpu_gfp(size, 8,
+						  GFP_ATOMIC | __GFP_NOWARN);
+			if (!pptr) {
+				kfree(l_new);
+				return ERR_PTR(-ENOMEM);
+			}
 		}
 
 		if (!onallcpus) {
@@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				off += size;
 			}
 		}
-		htab_elem_set_ptr(l_new, key_size, pptr);
+		if (!prealloc)
+			htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
 	}
@@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 		       u64 map_flags)
 {
-	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		return -E2BIG;
-
 	if (l_old && map_flags == BPF_NOEXIST)
 		/* elem already exists */
 		return -EEXIST;
@@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	hash = htab_map_hash(key, key_size);
 
-	/* allocate new element outside of the lock, since
-	 * we're most likley going to insert it
-	 */
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
-	if (!l_new)
-		return -ENOMEM;
-
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
@@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	if (IS_ERR(l_new)) {
+		/* all pre-allocated elements are in use or memory exhausted */
+		ret = PTR_ERR(l_new);
+		goto err;
+	}
+
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
-	} else {
-		atomic_inc(&htab->count);
+		free_htab_elem(htab, l_old);
 	}
-	raw_spin_unlock_irqrestore(&b->lock, flags);
-	return 0;
+	ret = 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-	kfree(l_new);
 	return ret;
 }
 
@@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
 					hash, true, onallcpus);
-		if (!l_new) {
-			ret = -ENOMEM;
+		if (IS_ERR(l_new)) {
+			ret = PTR_ERR(l_new);
 			goto err;
 		}
 		hlist_add_head_rcu(&l_new->hash_node, head);
-		atomic_inc(&htab->count);
 	}
 	ret = 0;
 err:
@@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
-		atomic_dec(&htab->count);
-		free_htab_elem(l, percpu, key_size);
+		free_htab_elem(htab, l);
 		ret = 0;
 	}
 
@@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
-			atomic_dec(&htab->count);
-			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
-				l->key_size = htab->map.key_size;
-				htab_percpu_elem_free(l);
-			} else {
-				kfree(l);
-			}
+			htab_elem_free(htab, l);
 		}
 	}
 }
-
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void htab_map_free(struct bpf_map *map)
 {
@@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
-	/* some of kfree_rcu() callbacks for elements of this map may not have
-	 * executed. It's ok. Proceed to free residual elements and map itself
+	/* some of free_htab_elem() callbacks for elements of this map may
+	 * not have executed. Wait for them.
 	 */
-	delete_all_elements(htab);
+	rcu_barrier();
+	if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+		delete_all_elements(htab);
+	} else {
+		htab_free_elems(htab);
+		pcpu_freelist_destroy(&htab->freelist);
+	}
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dc99f6a000f5..cbd94b2144ff 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+int bpf_map_precharge_memlock(u32 pages)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit, cur;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	cur = atomic_long_read(&user->locked_vm);
+	free_uid(user);
+	if (cur + pages > memlock_limit)
+		return -EPERM;
+	return 0;
+}
+
 static int bpf_map_charge_memlock(struct bpf_map *map)
 {
 	struct user_struct *user = get_current_user();
@@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
-- 
cgit v1.2.3


From 557c0c6e7df8e14a46bd7560d193fa5bbc00a858 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Mar 2016 21:57:17 -0800
Subject: bpf: convert stackmap to pre-allocation

It was observed that calling bpf_get_stackid() from a kprobe inside
slub or from spin_unlock causes similar deadlock as with hashmap,
therefore convert stackmap to use pre-allocated memory.

The call_rcu is no longer feasible mechanism, since delayed freeing
causes bpf_get_stackid() to fail unpredictably when number of actual
stacks is significantly less than user requested max_entries.
Since elements are no longer freed into slub, we can push elements into
freelist immediately and let them be recycled.
However, a very unlikely race between user space map_lookup() and
program-side recycling is possible:
     cpu0                          cpu1
     ----                          ----
user does lookup(stackidX)
starts copying ips into buffer
                                   delete(stackidX)
                                   calls bpf_get_stackid()
				   which recycles the element and
                                   overwrites with new stack trace

To avoid user space seeing a partial stack trace consisting of two
merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket);
to preserve consistent stack trace delivery to user space.
Now we can move memset(,0) of left-over element value from critical
path of bpf_get_stackid() into slow-path of user space lookup.
Also disallow lookup() from bpf program, since it's useless and
program shouldn't be messing with collected stack trace.

Note that similar race between user space lookup and kernel side updates
is also present in hashmap, but it's not a new race. bpf programs were
always allowed to modify hash and array map elements while user space
is copying them.

Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/stackmap.c | 86 ++++++++++++++++++++++++++++++++++++++++-----------
 kernel/bpf/syscall.c  |  2 ++
 3 files changed, 71 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index efd1d4ca95c6..21ee41b92e8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 			   u64 flags);
 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 			    u64 flags);
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f0a02c344358..499d9e933f8e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -10,9 +10,10 @@
 #include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include "percpu_freelist.h"
 
 struct stack_map_bucket {
-	struct rcu_head rcu;
+	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
 	u64 ip[];
@@ -20,10 +21,34 @@ struct stack_map_bucket {
 
 struct bpf_stack_map {
 	struct bpf_map map;
+	void *elems;
+	struct pcpu_freelist freelist;
 	u32 n_buckets;
-	struct stack_map_bucket __rcu *buckets[];
+	struct stack_map_bucket *buckets[];
 };
 
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+	int err;
+
+	smap->elems = vzalloc(elem_size * smap->map.max_entries);
+	if (!smap->elems)
+		return -ENOMEM;
+
+	err = pcpu_freelist_init(&smap->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+			       smap->map.max_entries);
+	return 0;
+
+free_elems:
+	vfree(smap->elems);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
@@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	err = bpf_map_precharge_memlock(smap->map.pages);
+	if (err)
+		goto free_smap;
+
 	err = get_callchain_buffers();
 	if (err)
 		goto free_smap;
 
+	err = prealloc_elems_and_freelist(smap);
+	if (err)
+		goto put_buffers;
+
 	return &smap->map;
 
+put_buffers:
+	put_callchain_buffers();
 free_smap:
 	kvfree(smap);
 	return ERR_PTR(err);
@@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	ips = trace->ip + skip + init_nr;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
-	bucket = rcu_dereference(smap->buckets[id]);
+	bucket = READ_ONCE(smap->buckets[id]);
 
 	if (bucket && bucket->hash == hash) {
 		if (flags & BPF_F_FAST_STACK_CMP)
@@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	if (bucket && !(flags & BPF_F_REUSE_STACKID))
 		return -EEXIST;
 
-	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
-			     GFP_ATOMIC | __GFP_NOWARN);
+	new_bucket = (struct stack_map_bucket *)
+		pcpu_freelist_pop(&smap->freelist);
 	if (unlikely(!new_bucket))
 		return -ENOMEM;
 
 	memcpy(new_bucket->ip, ips, trace_len);
-	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
 	old_bucket = xchg(&smap->buckets[id], new_bucket);
 	if (old_bucket)
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 	return id;
 }
 
@@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-/* Called from syscall or from eBPF program */
+/* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct stack_map_bucket *bucket;
-	u32 id = *(u32 *)key;
+	struct stack_map_bucket *bucket, *old_bucket;
+	u32 id = *(u32 *)key, trace_len;
 
 	if (unlikely(id >= smap->n_buckets))
-		return NULL;
-	bucket = rcu_dereference(smap->buckets[id]);
-	return bucket ? bucket->ip : NULL;
+		return -ENOENT;
+
+	bucket = xchg(&smap->buckets[id], NULL);
+	if (!bucket)
+		return -ENOENT;
+
+	trace_len = bucket->nr * sizeof(u64);
+	memcpy(value, bucket->ip, trace_len);
+	memset(value + trace_len, 0, map->value_size - trace_len);
+
+	old_bucket = xchg(&smap->buckets[id], bucket);
+	if (old_bucket)
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+	return 0;
 }
 
 static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 
 	old_bucket = xchg(&smap->buckets[id], NULL);
 	if (old_bucket) {
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 static void stack_map_free(struct bpf_map *map)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	int i;
 
+	/* wait for bpf programs to complete before freeing stack map */
 	synchronize_rcu();
 
-	for (i = 0; i < smap->n_buckets; i++)
-		if (smap->buckets[i])
-			kfree_rcu(smap->buckets[i], rcu);
+	vfree(smap->elems);
+	pcpu_freelist_destroy(&smap->freelist);
 	kvfree(smap);
 	put_callchain_buffers();
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cbd94b2144ff..2978d0d08869 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
-- 
cgit v1.2.3


From e28e87ed474c5a0b378c66fb85efc8e487f4f63f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 8 Mar 2016 23:36:03 +0100
Subject: ip_tunnel, bpf: ip_tunnel_info_opts_{get, set} depends on CONFIG_INET

Helpers like ip_tunnel_info_opts_{get,set}() are only available if
CONFIG_INET is set, thus add an empty definition into the header for
the !CONFIG_INET case, where already other empty inline helpers are
defined.

This avoids ifdef kludge inside filter.c, but also vxlan and geneve
themselves where this facility can only be used with, depend on INET
being set. For the !INET case TUNNEL_OPTIONS_PRESENT would never be
set in flags.

Fixes: 14ca0751c96f ("bpf: support for access to tunnel options")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index e1395d70fb48..0acd80fadb32 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -369,6 +369,17 @@ static inline void ip_tunnel_unneed_metadata(void)
 {
 }
 
+static inline void ip_tunnel_info_opts_get(void *to,
+					   const struct ip_tunnel_info *info)
+{
+}
+
+static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
+					   const void *from, int len)
+{
+	info->options_len = 0;
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
-- 
cgit v1.2.3


From ff3c44e675054533403909ecb76e78c1d4efbd26 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:00 -0800
Subject: rcu: Add list_next_or_null_rcu

This is a convenience function that returns the next entry in an RCU
list or NULL if at the end of the list.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 14ec1652daf4..17d4f849c65e 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -318,6 +318,27 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
 	likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
 })
 
+/**
+ * list_next_or_null_rcu - get the next element from a list
+ * @head:	the head for the list.
+ * @ptr:        the list head to take the next element from.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_head within the struct.
+ *
+ * Note that if the ptr is at the end of the list, NULL is returned.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_next_or_null_rcu(head, ptr, type, member) \
+({ \
+	struct list_head *__head = (head); \
+	struct list_head *__ptr = (ptr); \
+	struct list_head *__next = READ_ONCE(__ptr->next); \
+	likely(__next != __head) ? list_entry_rcu(__next, type, \
+						  member) : NULL; \
+})
+
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
  * @pos:	the type * to use as a loop cursor.
-- 
cgit v1.2.3


From f4a00aacdb5f6784d46e8c999b6bb52ece4b306b Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:01 -0800
Subject: net: Make sock_alloc exportable

Export it for cases where we want to create sockets by hand.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 1 +
 net/socket.c        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 0b4ac7da583a..49175e4ced11 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -215,6 +215,7 @@ int __sock_create(struct net *net, int family, int type, int proto,
 int sock_create(int family, int type, int proto, struct socket **res);
 int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res);
 int sock_create_lite(int family, int type, int proto, struct socket **res);
+struct socket *sock_alloc(void);
 void sock_release(struct socket *sock);
 int sock_sendmsg(struct socket *sock, struct msghdr *msg);
 int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/socket.c b/net/socket.c
index c044d1e8508c..38a78d4d50f5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = {
  *	NULL is returned.
  */
 
-static struct socket *sock_alloc(void)
+struct socket *sock_alloc(void)
 {
 	struct inode *inode;
 	struct socket *sock;
@@ -554,6 +554,7 @@ static struct socket *sock_alloc(void)
 	this_cpu_add(sockets_in_use, 1);
 	return sock;
 }
+EXPORT_SYMBOL(sock_alloc);
 
 /**
  *	sock_release	-	close a socket
-- 
cgit v1.2.3


From f092276d85b82504e8a07498f4e9e0c51f06745c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:03 -0800
Subject: net: Add MSG_BATCH flag

Add a new msg flag called MSG_BATCH. This flag is used in sendmsg to
indicate that more messages will follow (i.e. a batch of messages is
being sent). This is similar to MSG_MORE except that the following
messages are not merged into one packet, they are sent individually.
sendmmsg is updated so that each contained message except for the
last one is marked as MSG_BATCH.

MSG_BATCH is a performance optimization in cases where a socket
implementation can benefit by transmitting packets in a batch.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 1 +
 net/socket.c           | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5bf59c8493b7..d834af22a460 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -274,6 +274,7 @@ struct ucred {
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
+#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
diff --git a/net/socket.c b/net/socket.c
index 0dd4dd818f41..886649c88d8f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2008,6 +2008,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	struct compat_mmsghdr __user *compat_entry;
 	struct msghdr msg_sys;
 	struct used_address used_address;
+	unsigned int oflags = flags;
 
 	if (vlen > UIO_MAXIOV)
 		vlen = UIO_MAXIOV;
@@ -2022,8 +2023,12 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	entry = mmsg;
 	compat_entry = (struct compat_mmsghdr __user *)mmsg;
 	err = 0;
+	flags |= MSG_BATCH;
 
 	while (datagrams < vlen) {
+		if (datagrams == vlen - 1)
+			flags = oflags;
+
 		if (MSG_CMSG_COMPAT & flags) {
 			err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
 					     &msg_sys, flags, &used_address, MSG_EOR);
-- 
cgit v1.2.3


From 473bd239b808a8af5241ce9996a16d283d88ddff Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:05 -0800
Subject: tcp: Add tcp_inq to get available receive bytes on socket

Create a common kernel function to get the number of bytes available
on a TCP socket. This is based on code in INQ getsockopt and we now call
the function for that getsockopt.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 24 ++++++++++++++++++++++++
 net/ipv4/tcp.c    | 15 +--------------
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e90db8546806..0302636af98c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1816,4 +1816,28 @@ static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
 	skb->truesize = 2;
 }
 
+static inline int tcp_inq(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int answ;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		answ = 0;
+	} else if (sock_flag(sk, SOCK_URGINLINE) ||
+		   !tp->urg_data ||
+		   before(tp->urg_seq, tp->copied_seq) ||
+		   !before(tp->urg_seq, tp->rcv_nxt)) {
+
+		answ = tp->rcv_nxt - tp->copied_seq;
+
+		/* Subtract 1, if FIN was received */
+		if (answ && sock_flag(sk, SOCK_DONE))
+			answ--;
+	} else {
+		answ = tp->urg_seq - tp->copied_seq;
+	}
+
+	return answ;
+}
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f9faadb42485..a265f00b9df9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			return -EINVAL;
 
 		slow = lock_sock_fast(sk);
-		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
-			answ = 0;
-		else if (sock_flag(sk, SOCK_URGINLINE) ||
-			 !tp->urg_data ||
-			 before(tp->urg_seq, tp->copied_seq) ||
-			 !before(tp->urg_seq, tp->rcv_nxt)) {
-
-			answ = tp->rcv_nxt - tp->copied_seq;
-
-			/* Subtract 1, if FIN was received */
-			if (answ && sock_flag(sk, SOCK_DONE))
-				answ--;
-		} else
-			answ = tp->urg_seq - tp->copied_seq;
+		answ = tcp_inq(sk);
 		unlock_sock_fast(sk, slow);
 		break;
 	case SIOCATMARK:
-- 
cgit v1.2.3


From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:06 -0800
Subject: kcm: Kernel Connection Multiplexor module

This module implements the Kernel Connection Multiplexor.

Kernel Connection Multiplexor (KCM) is a facility that provides a
message based interface over TCP for generic application protocols.
With KCM an application can efficiently send and receive application
protocol messages over TCP using datagram sockets.

For more information see the included Documentation/networking/kcm.txt

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h   |    6 +-
 include/net/kcm.h        |  125 +++
 include/uapi/linux/kcm.h |   40 +
 net/Kconfig              |    1 +
 net/Makefile             |    1 +
 net/kcm/Kconfig          |   10 +
 net/kcm/Makefile         |    3 +
 net/kcm/kcmsock.c        | 2016 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2201 insertions(+), 1 deletion(-)
 create mode 100644 include/net/kcm.h
 create mode 100644 include/uapi/linux/kcm.h
 create mode 100644 net/kcm/Kconfig
 create mode 100644 net/kcm/Makefile
 create mode 100644 net/kcm/kcmsock.c

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index d834af22a460..73bf6c6a833b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred {
 #define AF_ALG		38	/* Algorithm sockets		*/
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
-#define AF_MAX		41	/* For now.. */
+#define AF_KCM		41	/* Kernel Connection Multiplexor*/
+
+#define AF_MAX		42	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -246,6 +248,7 @@ struct ucred {
 #define PF_ALG		AF_ALG
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
+#define PF_KCM		AF_KCM
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -323,6 +326,7 @@ struct ucred {
 #define SOL_CAIF	278
 #define SOL_ALG		279
 #define SOL_NFC		280
+#define SOL_KCM		281
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/net/kcm.h b/include/net/kcm.h
new file mode 100644
index 000000000000..1bcae39070ec
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __NET_KCM_H_
+#define __NET_KCM_H_
+
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/kcm.h>
+
+extern unsigned int kcm_net_id;
+
+struct kcm_tx_msg {
+	unsigned int sent;
+	unsigned int fragidx;
+	unsigned int frag_offset;
+	unsigned int msg_flags;
+	struct sk_buff *frag_skb;
+	struct sk_buff *last_skb;
+};
+
+struct kcm_rx_msg {
+	int full_len;
+	int accum_len;
+	int offset;
+};
+
+/* Socket structure for KCM client sockets */
+struct kcm_sock {
+	struct sock sk;
+	struct kcm_mux *mux;
+	struct list_head kcm_sock_list;
+	int index;
+	u32 done : 1;
+	struct work_struct done_work;
+
+	/* Transmit */
+	struct kcm_psock *tx_psock;
+	struct work_struct tx_work;
+	struct list_head wait_psock_list;
+	struct sk_buff *seq_skb;
+
+	/* Don't use bit fields here, these are set under different locks */
+	bool tx_wait;
+	bool tx_wait_more;
+
+	/* Receive */
+	struct kcm_psock *rx_psock;
+	struct list_head wait_rx_list; /* KCMs waiting for receiving */
+	bool rx_wait;
+	u32 rx_disabled : 1;
+};
+
+struct bpf_prog;
+
+/* Structure for an attached lower socket */
+struct kcm_psock {
+	struct sock *sk;
+	struct kcm_mux *mux;
+	int index;
+
+	u32 tx_stopped : 1;
+	u32 rx_stopped : 1;
+	u32 done : 1;
+	u32 unattaching : 1;
+
+	void (*save_state_change)(struct sock *sk);
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+
+	struct list_head psock_list;
+
+	/* Receive */
+	struct sk_buff *rx_skb_head;
+	struct sk_buff **rx_skb_nextp;
+	struct sk_buff *ready_rx_msg;
+	struct list_head psock_ready_list;
+	struct work_struct rx_work;
+	struct delayed_work rx_delayed_work;
+	struct bpf_prog *bpf_prog;
+	struct kcm_sock *rx_kcm;
+
+	/* Transmit */
+	struct kcm_sock *tx_kcm;
+	struct list_head psock_avail_list;
+};
+
+/* Per net MUX list */
+struct kcm_net {
+	struct mutex mutex;
+	struct list_head mux_list;
+	int count;
+};
+
+/* Structure for a MUX */
+struct kcm_mux {
+	struct list_head kcm_mux_list;
+	struct rcu_head rcu;
+	struct kcm_net *knet;
+
+	struct list_head kcm_socks;	/* All KCM sockets on MUX */
+	int kcm_socks_cnt;		/* Total KCM socket count for MUX */
+	struct list_head psocks;	/* List of all psocks on MUX */
+	int psocks_cnt;		/* Total attached sockets */
+
+	/* Receive */
+	spinlock_t rx_lock ____cacheline_aligned_in_smp;
+	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
+	struct list_head psocks_ready;	/* List of psocks with a msg ready */
+	struct sk_buff_head rx_hold_queue;
+
+	/* Transmit */
+	spinlock_t  lock ____cacheline_aligned_in_smp;	/* TX and mux locking */
+	struct list_head psocks_avail;	/* List of available psocks */
+	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
+};
+
+#endif /* __NET_KCM_H_ */
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h
new file mode 100644
index 000000000000..a5a530940b99
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * User API to clone KCM sockets and attach transport socket to a KCM
+ * multiplexor.
+ */
+
+#ifndef KCM_KERNEL_H
+#define KCM_KERNEL_H
+
+struct kcm_attach {
+	int fd;
+	int bpf_fd;
+};
+
+struct kcm_unattach {
+	int fd;
+};
+
+struct kcm_clone {
+	int fd;
+};
+
+#define SIOCKCMATTACH	(SIOCPROTOPRIVATE + 0)
+#define SIOCKCMUNATTACH	(SIOCPROTOPRIVATE + 1)
+#define SIOCKCMCLONE	(SIOCPROTOPRIVATE + 2)
+
+#define KCMPROTO_CONNECTED	0
+
+/* Socket options */
+#define KCM_RECV_DISABLE	1
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
 source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA)		+= irda/
 obj-$(CONFIG_BT)		+= bluetooth/
 obj-$(CONFIG_SUNRPC)		+= sunrpc/
 obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
+obj-$(CONFIG_AF_KCM)		+= kcm/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_L2TP)		+= l2tp/
 obj-$(CONFIG_DECNET)		+= decnet/
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+	tristate "KCM sockets"
+	depends on INET
+	select BPF_SYSCALL
+	---help---
+	  KCM (Kernel Connection Multiplexor) sockets provide a method
+	  for multiplexing messages of a message based application
+	  protocol over kernel connections (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..cb525f7c5a13
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+	return (struct kcm_sock *)sk;
+}
+
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_tx_msg *)skb->cb;
+}
+
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_rx_msg *)((void *)skb->cb +
+				     offsetof(struct qdisc_skb_cb, data));
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+	csk->sk_err = EPIPE;
+	csk->sk_error_report(csk);
+}
+
+/* Callback lock held */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+			       struct sk_buff *skb)
+{
+	struct sock *csk = psock->sk;
+
+	/* Unrecoverable error in receive */
+
+	if (psock->rx_stopped)
+		return;
+
+	psock->rx_stopped = 1;
+
+	/* Report an error on the lower socket */
+	report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+			       bool wakeup_kcm)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Unrecoverable error in transmit */
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_stopped) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	psock->tx_stopped = 1;
+
+	if (!psock->tx_kcm) {
+		/* Take off psocks_avail list */
+		list_del(&psock->psock_avail_list);
+	} else if (wakeup_kcm) {
+		/* In this case psock is being aborted while outside of
+		 * write_msgs and psock is reserved. Schedule tx_work
+		 * to handle the failure there. Need to commit tx_stopped
+		 * before queuing work.
+		 */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	/* Report error on lower socket */
+	report_csk_error(csk, err);
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct sk_buff *skb;
+
+	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+		return;
+
+	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Assuming buffer limit has been reached */
+			skb_queue_head(&mux->rx_hold_queue, skb);
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+	}
+
+	while (!list_empty(&mux->psocks_ready)) {
+		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+					 psock_ready_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+			/* Assuming buffer limit has been reached */
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+
+		/* Consumed the ready message on the psock. Schedule rx_work to
+		 * get more messages.
+		 */
+		list_del(&psock->psock_ready_list);
+		psock->ready_rx_msg = NULL;
+
+		/* Commit clearing of ready_rx_msg for queuing work */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->rx_work);
+	}
+
+	/* Buffer limit is okay now, add to ready list */
+	list_add_tail(&kcm->wait_rx_list,
+		      &kcm->mux->kcm_rx_waiters);
+	kcm->rx_wait = true;
+}
+
+static void kcm_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct kcm_mux *mux = kcm->mux;
+	unsigned int len = skb->truesize;
+
+	sk_mem_uncharge(sk, len);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+
+	/* For reading rx_wait and rx_psock without holding lock */
+	smp_mb__after_atomic();
+
+	if (!kcm->rx_wait && !kcm->rx_psock &&
+	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+		spin_lock_bh(&mux->rx_lock);
+		kcm_rcv_ready(kcm);
+		spin_unlock_bh(&mux->rx_lock);
+	}
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = kcm_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+
+	skb_queue_tail(list, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk);
+
+	return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called when a kcm socket is receive disabled.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+	struct sk_buff *skb;
+	struct kcm_sock *kcm;
+
+	while ((skb = __skb_dequeue(head))) {
+		/* Reset destructor to avoid calling kcm_rcv_ready */
+		skb->destructor = sock_rfree;
+		skb_orphan(skb);
+try_again:
+		if (list_empty(&mux->kcm_rx_waiters)) {
+			skb_queue_tail(&mux->rx_hold_queue, skb);
+			continue;
+		}
+
+		kcm = list_first_entry(&mux->kcm_rx_waiters,
+				       struct kcm_sock, wait_rx_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Should mean socket buffer full */
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+
+			/* Commit rx_wait to read in kcm_rfree */
+			smp_wmb();
+
+			goto try_again;
+		}
+	}
+}
+
+/* Lower sock lock held */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+				       struct sk_buff *head)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	WARN_ON(psock->ready_rx_msg);
+
+	if (psock->rx_kcm)
+		return psock->rx_kcm;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	if (psock->rx_kcm) {
+		spin_unlock_bh(&mux->rx_lock);
+		return psock->rx_kcm;
+	}
+
+	if (list_empty(&mux->kcm_rx_waiters)) {
+		psock->ready_rx_msg = head;
+		list_add_tail(&psock->psock_ready_list,
+			      &mux->psocks_ready);
+		spin_unlock_bh(&mux->rx_lock);
+		return NULL;
+	}
+
+	kcm = list_first_entry(&mux->kcm_rx_waiters,
+			       struct kcm_sock, wait_rx_list);
+	list_del(&kcm->wait_rx_list);
+	kcm->rx_wait = false;
+
+	psock->rx_kcm = kcm;
+	kcm->rx_psock = psock;
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+static void kcm_done_work(struct work_struct *w)
+{
+	kcm_done(container_of(w, struct kcm_sock, done_work));
+}
+
+/* Lower sock held */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+			     bool rcv_ready)
+{
+	struct kcm_sock *kcm = psock->rx_kcm;
+	struct kcm_mux *mux = psock->mux;
+
+	if (!kcm)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	psock->rx_kcm = NULL;
+	kcm->rx_psock = NULL;
+
+	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+	 * kcm_rfree
+	 */
+	smp_mb();
+
+	if (unlikely(kcm->done)) {
+		spin_unlock_bh(&mux->rx_lock);
+
+		/* Need to run kcm_done in a task since we need to acquire
+		 * callback locks which may already be held here.
+		 */
+		INIT_WORK(&kcm->done_work, kcm_done_work);
+		schedule_work(&kcm->done_work);
+		return;
+	}
+
+	if (unlikely(kcm->rx_disabled)) {
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+		/* Check for degenerative race with rx_wait that all
+		 * data was dequeued (accounted for in kcm_rfree).
+		 */
+		kcm_rcv_ready(kcm);
+	}
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Macro to invoke filter function. */
+#define KCM_RUN_FILTER(prog, ctx) \
+	(*prog->bpf_func)(ctx, prog->insnsi)
+
+/* Receive actor passed to tcp_read_sock() for a psock (desc->arg.data).
+ * Clones the received data onto the message being assembled in
+ * psock->rx_skb_head, runs the psock's BPF program to learn the message
+ * length, and hands each completed message to a kcm socket obtained from
+ * reserve_rx_kcm(). Returns the number of bytes consumed from orig_skb;
+ * failures are reported through desc->error.
+ *
+ * Lower socket lock held
+ */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+			unsigned int orig_offset, size_t orig_len)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+	struct kcm_rx_msg *rxm;
+	struct kcm_sock *kcm;
+	struct sk_buff *head, *skb;
+	size_t eaten = 0, cand_len;
+	ssize_t extra;
+	int err;
+	bool cloned_orig = false;
+
+	if (psock->ready_rx_msg)
+		return 0;
+
+	head = psock->rx_skb_head;
+	if (head) {
+		/* Message already in progress */
+
+		if (unlikely(orig_offset)) {
+			/* Getting data with a non-zero offset when a message is
+			 * in progress is not expected. If it does happen, we
+			 * need to clone and pull since we can't deal with
+			 * offsets in the skbs for a message except in the head.
+			 */
+			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+			if (!orig_skb) {
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			if (!pskb_pull(orig_skb, orig_offset)) {
+				kfree_skb(orig_skb);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			cloned_orig = true;
+			orig_offset = 0;
+		}
+
+		if (!psock->rx_skb_nextp) {
+			/* We are going to append to the frags_list of head.
+			 * Need to unshare the frag_list.
+			 *
+			 * NOTE(review): the error returns in this branch do
+			 * not free orig_skb when it was cloned above
+			 * (cloned_orig) - confirm whether that leaks.
+			 */
+			err = skb_unclone(head, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				return 0;
+			}
+
+			if (unlikely(skb_shinfo(head)->frag_list)) {
+				/* We can't append to an sk_buff that already
+				 * has a frag_list. We create a new head, point
+				 * the frag_list of that to the old head, and
+				 * then are able to use the old head->next for
+				 * appending to the message.
+				 */
+				if (WARN_ON(head->next)) {
+					desc->error = -EINVAL;
+					return 0;
+				}
+
+				skb = alloc_skb(0, GFP_ATOMIC);
+				if (!skb) {
+					desc->error = -ENOMEM;
+					return 0;
+				}
+				skb->len = head->len;
+				skb->data_len = head->len;
+				skb->truesize = head->truesize;
+				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
+				psock->rx_skb_nextp = &head->next;
+				skb_shinfo(skb)->frag_list = head;
+				psock->rx_skb_head = skb;
+				head = skb;
+			} else {
+				psock->rx_skb_nextp =
+				    &skb_shinfo(head)->frag_list;
+			}
+		}
+	}
+
+	while (eaten < orig_len) {
+		/* Always clone since we will consume something */
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
+		if (!skb) {
+			desc->error = -ENOMEM;
+			break;
+		}
+
+		cand_len = orig_len - eaten;
+
+		head = psock->rx_skb_head;
+		if (!head) {
+			head = skb;
+			psock->rx_skb_head = head;
+			/* Will set rx_skb_nextp on next packet if needed */
+			psock->rx_skb_nextp = NULL;
+			rxm = kcm_rx_msg(head);
+			memset(rxm, 0, sizeof(*rxm));
+			rxm->offset = orig_offset + eaten;
+		} else {
+			/* Unclone since we may be appending to an skb that we
+			 * already share a frag_list with.
+			 *
+			 * NOTE(review): on failure the fresh clone in skb is
+			 * neither linked into the message nor freed before
+			 * the break - confirm whether that leaks.
+			 */
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				break;
+			}
+
+			rxm = kcm_rx_msg(head);
+			*psock->rx_skb_nextp = skb;
+			psock->rx_skb_nextp = &skb->next;
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+
+		if (!rxm->full_len) {
+			ssize_t len;
+
+			len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+			if (!len) {
+				/* Need more header to determine length */
+				rxm->accum_len += cand_len;
+				eaten += cand_len;
+				WARN_ON(eaten != orig_len);
+				break;
+			} else if (len <= (ssize_t)head->len -
+					  skb->len - rxm->offset) {
+				/* Length must be into new skb (and also
+				 * greater than zero)
+				 */
+				desc->error = -EPROTO;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EPROTO, head);
+				break;
+			}
+
+			rxm->full_len = len;
+		}
+
+		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+		if (extra < 0) {
+			/* Message not complete yet. */
+			rxm->accum_len += cand_len;
+			eaten += cand_len;
+			WARN_ON(eaten != orig_len);
+			break;
+		}
+
+		/* Positive extra indicates more bytes than needed for the
+		 * message
+		 */
+
+		WARN_ON(extra > cand_len);
+
+		eaten += (cand_len - extra);
+
+		/* Hurray, we have a new message! */
+		psock->rx_skb_head = NULL;
+
+try_queue:
+		kcm = reserve_rx_kcm(psock, head);
+		if (!kcm) {
+			/* Unable to reserve a KCM, message is held in psock. */
+			break;
+		}
+
+		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+			/* Should mean socket buffer full */
+			unreserve_rx_kcm(psock, false);
+			goto try_queue;
+		}
+	}
+
+	if (cloned_orig)
+		kfree_skb(orig_skb);
+
+	return eaten;
+}
+
+/* Feed pending data of the lower TCP socket through kcm_tcp_recv() and then
+ * drop any rx kcm reservation. Returns the error recorded in the read
+ * descriptor. Called with lock held on lower socket.
+ */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+	read_descriptor_t rd;
+
+	rd.count = 1; /* give more than one skb per call */
+	rd.error = 0;
+	rd.arg.data = psock;
+
+	/* sk should be locked here, so okay to do tcp_read_sock */
+	tcp_read_sock(psock->sk, &rd, kcm_tcp_recv);
+
+	unreserve_rx_kcm(psock, true);
+
+	return rd.error;
+}
+
+/* sk_data_ready callback installed on the lower TCP socket. Reads from the
+ * socket unless the psock is gone, stopped, or already holds a ready
+ * message; an -ENOMEM result reschedules the read through delayed work.
+ * Lower sock lock held.
+ */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+	struct kcm_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (likely(psock && !psock->rx_stopped) && !psock->ready_rx_msg) {
+		int err = psock_tcp_read_sock(psock);
+
+		if (err == -ENOMEM)
+			queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* Deferred receive processing for @psock: read from the lower socket in
+ * process context. Does nothing if the lower socket no longer points at
+ * this psock, if rx has been stopped, or if a ready message is still held.
+ * An -ENOMEM result requeues the delayed work to retry.
+ */
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+
+	/* We need the read lock to synchronize with psock_tcp_data_ready. We
+	 * need the socket lock for calling tcp_read_sock.
+	 */
+	lock_sock(csk);
+	read_lock_bh(&csk->sk_callback_lock);
+
+	if (unlikely(csk->sk_user_data != psock))
+		goto out;
+
+	if (unlikely(psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	/* psock_tcp_read_sock() builds its own read descriptor */
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&csk->sk_callback_lock);
+	release_sock(csk);
+}
+
+/* Work handler for psock->rx_work. */
+static void psock_rx_work(struct work_struct *w)
+{
+	struct kcm_psock *psock = container_of(w, struct kcm_psock, rx_work);
+
+	do_psock_rx_work(psock);
+}
+
+/* Work handler for psock->rx_delayed_work. */
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+	struct kcm_psock *psock;
+
+	psock = container_of(w, struct kcm_psock, rx_delayed_work.work);
+	do_psock_rx_work(psock);
+}
+
+/* sk_state_change callback installed on the lower TCP socket.
+ *
+ * TCP only does a POLLIN for a half close. Do a POLLHUP here
+ * since application will normally not poll with POLLIN
+ * on the TCP sockets.
+ */
+static void psock_tcp_state_change(struct sock *sk)
+{
+	report_csk_error(sk, EPIPE);
+}
+
+/* sk_write_space callback installed on the lower TCP socket: if a kcm has
+ * this psock reserved for transmit, kick that kcm's tx work.
+ */
+static void psock_tcp_write_space(struct sock *sk)
+{
+	struct kcm_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (likely(psock)) {
+		struct kcm_mux *mux = psock->mux;
+		struct kcm_sock *waiter;
+
+		spin_lock_bh(&mux->lock);
+
+		/* A reserved psock means someone is waiting to send on it. */
+		waiter = psock->tx_kcm;
+		if (waiter)
+			queue_work(kcm_wq, &waiter->tx_work);
+
+		spin_unlock_bh(&mux->lock);
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* Return the psock reserved for transmit by @kcm, taking one from
+ * mux->psocks_avail if none is held yet. When no psock is available the kcm
+ * is put on mux->kcm_tx_waiters (tx_wait set) and NULL is returned. A held
+ * psock that has tx_stopped set is unreserved first and a new one is sought.
+ *
+ * kcm sock is locked.
+ */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+
+	psock = kcm->tx_psock;
+
+	smp_rmb(); /* Must read tx_psock before tx_wait */
+
+	if (psock) {
+		WARN_ON(kcm->tx_wait);
+		if (unlikely(psock->tx_stopped))
+			unreserve_psock(kcm);
+		else
+			return kcm->tx_psock;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check again under lock to see if a psock was reserved for this
+	 * kcm in the meantime (psock_now_avail sets kcm->tx_psock with the
+	 * mux lock held).
+	 */
+	psock = kcm->tx_psock;
+	if (unlikely(psock)) {
+		WARN_ON(kcm->tx_wait);
+		spin_unlock_bh(&mux->lock);
+		return kcm->tx_psock;
+	}
+
+	if (!list_empty(&mux->psocks_avail)) {
+		psock = list_first_entry(&mux->psocks_avail,
+					 struct kcm_psock,
+					 psock_avail_list);
+		list_del(&psock->psock_avail_list);
+		if (kcm->tx_wait) {
+			list_del(&kcm->wait_psock_list);
+			kcm->tx_wait = false;
+		}
+		kcm->tx_psock = psock;
+		psock->tx_kcm = kcm;
+	} else if (!kcm->tx_wait) {
+		list_add_tail(&kcm->wait_psock_list,
+			      &mux->kcm_tx_waiters);
+		kcm->tx_wait = true;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	return psock;
+}
+
+/* Make @psock usable for transmit: hand it directly to the first kcm
+ * waiting for a psock, or park it on the mux's available list when nobody
+ * is waiting. mux lock held.
+ */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *waiter;
+
+	if (!list_empty(&mux->kcm_tx_waiters)) {
+		waiter = list_first_entry(&mux->kcm_tx_waiters,
+					  struct kcm_sock,
+					  wait_psock_list);
+		list_del(&waiter->wait_psock_list);
+		waiter->tx_wait = false;
+		psock->tx_kcm = waiter;
+
+		/* Commit before changing tx_psock since that is read in
+		 * reserve_psock before queuing work.
+		 */
+		smp_mb();
+
+		waiter->tx_psock = psock;
+		queue_work(kcm_wq, &waiter->tx_work);
+		return;
+	}
+
+	list_add_tail(&psock->psock_avail_list, &mux->psocks_avail);
+}
+
+/* Give up the psock that @kcm holds reserved for transmit. A psock with
+ * tx_stopped set is not returned to the available list, and is freed here
+ * if it was also marked done; otherwise the psock is passed on through
+ * psock_now_avail().
+ *
+ * kcm sock is locked.
+ */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	psock = kcm->tx_psock;
+
+	if (WARN_ON(!psock)) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	smp_rmb(); /* Read tx_psock before tx_wait */
+
+	WARN_ON(kcm->tx_wait);
+
+	kcm->tx_psock = NULL;
+	psock->tx_kcm = NULL;
+
+	if (unlikely(psock->tx_stopped)) {
+		if (psock->done) {
+			/* Deferred free
+			 *
+			 * NOTE(review): psock->sk is dereferenced after
+			 * sock_put(); presumably the file reference dropped
+			 * by fput() still pins the sock - confirm.
+			 */
+			list_del(&psock->psock_list);
+			mux->psocks_cnt--;
+			sock_put(psock->sk);
+			fput(psock->sk->sk_socket->file);
+			kmem_cache_free(kcm_psockp, psock);
+		}
+
+		/* Don't put back on available list */
+
+		spin_unlock_bh(&mux->lock);
+
+		return;
+	}
+
+	psock_now_avail(psock);
+
+	spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket.  Called with kcm sock lock
+ * held.  Return bytes actually sent or error.
+ *
+ * Messages are taken from sk_write_queue one at a time and pushed page
+ * fragment by page fragment to the reserved psock's socket with
+ * kernel_sendpage(). On -EAGAIN the position within the message is saved
+ * in its kcm_tx_msg so a later call can resume; on any other failure the
+ * psock is aborted and the message is retried from the start on another
+ * psock. When the queue is drained the psock is unreserved.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+	struct sock *sk = &kcm->sk;
+	struct kcm_psock *psock;
+	struct sk_buff *skb, *head;
+	struct kcm_tx_msg *txm;
+	unsigned short fragidx, frag_offset;
+	unsigned int sent, total_sent = 0;
+	int ret = 0;
+
+	kcm->tx_wait_more = false;
+	psock = kcm->tx_psock;
+	if (unlikely(psock && psock->tx_stopped)) {
+		/* A reserved psock was aborted asynchronously. Unreserve
+		 * it and we'll retry the message.
+		 */
+		unreserve_psock(kcm);
+		if (skb_queue_empty(&sk->sk_write_queue))
+			return 0;
+
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+	} else if (skb_queue_empty(&sk->sk_write_queue)) {
+		return 0;
+	}
+
+	head = skb_peek(&sk->sk_write_queue);
+	txm = kcm_tx_msg(head);
+
+	if (txm->sent) {
+		/* Send of first skbuff in queue already in progress */
+		if (WARN_ON(!psock)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		sent = txm->sent;
+		frag_offset = txm->frag_offset;
+		fragidx = txm->fragidx;
+		skb = txm->frag_skb;
+
+		goto do_frag;
+	}
+
+try_again:
+	psock = reserve_psock(kcm);
+	if (!psock)
+		goto out;
+
+	do {
+		skb = head;
+		txm = kcm_tx_msg(head);
+		sent = 0;
+
+do_frag_list:
+		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+		     fragidx++) {
+			skb_frag_t *frag;
+
+			frag_offset = 0;
+do_frag:
+			frag = &skb_shinfo(skb)->frags[fragidx];
+			if (WARN_ON(!frag->size)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = kernel_sendpage(psock->sk->sk_socket,
+					      frag->page.p,
+					      frag->page_offset + frag_offset,
+					      frag->size - frag_offset,
+					      MSG_DONTWAIT);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					/* Save state to try again when there's
+					 * write space on the socket
+					 */
+					txm->sent = sent;
+					txm->frag_offset = frag_offset;
+					txm->fragidx = fragidx;
+					txm->frag_skb = skb;
+
+					ret = 0;
+					goto out;
+				}
+
+				/* Hard failure in sending message, abort this
+				 * psock since it has lost framing
+				 * synchronization and retry sending the
+				 * message from the beginning.
+				 */
+				kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+						   true);
+				unreserve_psock(kcm);
+
+				txm->sent = 0;
+				ret = 0;
+
+				goto try_again;
+			}
+
+			sent += ret;
+			frag_offset += ret;
+			if (frag_offset < frag->size) {
+				/* Not finished with this frag */
+				goto do_frag;
+			}
+		}
+
+		if (skb == head) {
+			if (skb_has_frag_list(skb)) {
+				skb = skb_shinfo(skb)->frag_list;
+				goto do_frag_list;
+			}
+		} else if (skb->next) {
+			skb = skb->next;
+			goto do_frag_list;
+		}
+
+		/* Successfully sent the whole packet, account for it. */
+		skb_dequeue(&sk->sk_write_queue);
+		kfree_skb(head);
+		sk->sk_wmem_queued -= sent;
+		total_sent += sent;
+	} while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+	if (!head) {
+		/* Done with all queued messages. */
+		WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+		unreserve_psock(kcm);
+	}
+
+	/* Check if write space is available */
+	sk->sk_write_space(sk);
+
+	return total_sent ? : ret;
+}
+
+/* Work handler for kcm->tx_work: push queued messages with the kcm socket
+ * locked. A hard write failure is reported on the kcm socket; otherwise a
+ * pending SOCK_NOSPACE indication is cleared and writers are woken.
+ */
+static void kcm_tx_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+	struct sock *sk = &kcm->sk;
+	int rc;
+
+	lock_sock(sk);
+
+	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+	 * aborts
+	 */
+	rc = kcm_write_msgs(kcm);
+	if (rc < 0) {
+		/* Hard failure in write, report error on KCM socket */
+		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", rc);
+		report_csk_error(&kcm->sk, -rc);
+	} else if (likely(sk->sk_socket) &&
+		   test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Primarily for SOCK_SEQPACKET sockets */
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_space(sk);
+	}
+
+	release_sock(sk);
+}
+
+/* Flush queued messages if transmission was being held back for batching. */
+static void kcm_push(struct kcm_sock *kcm)
+{
+	if (!kcm->tx_wait_more)
+		return;
+
+	kcm_write_msgs(kcm);
+}
+/* sendmsg handler for KCM sockets. Copies the data of @msg into page
+ * fragments of an skb chain (a head skb plus further skbs hung off its
+ * frag_list). When the message is complete (for SOCK_DGRAM: MSG_MORE not
+ * set; otherwise: MSG_EOR set) it is queued on sk_write_queue and, unless
+ * MSG_BATCH asks to hold it back, transmission is attempted through
+ * kcm_write_msgs(). An incomplete message is kept in kcm->seq_skb and is
+ * continued by the next call. Returns the number of bytes copied or a
+ * negative error.
+ */
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	size_t copy, copied = 0;
+	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	int eor = (sock->type == SOCK_DGRAM) ?
+		  !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+	int err = -EPIPE;
+
+	lock_sock(sk);
+
+	/* Per tcp_sendmsg this should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		goto start;
+	}
+
+	/* Call the sk_stream functions to manage the sndbuf mem. */
+	if (!sk_stream_memory_free(sk)) {
+		kcm_push(kcm);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	/* New message, alloc head skb */
+	head = alloc_skb(0, sk->sk_allocation);
+	while (!head) {
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+
+		head = alloc_skb(0, sk->sk_allocation);
+	}
+
+	skb = head;
+
+	/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+	 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+	 */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+	while (msg_data_left(msg)) {
+		bool merge = true;
+		int i = skb_shinfo(skb)->nr_frags;
+		struct page_frag *pfrag = sk_page_frag(sk);
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			goto wait_for_memory;
+
+		if (!skb_can_coalesce(skb, i, pfrag->page,
+				      pfrag->offset)) {
+			if (i == MAX_SKB_FRAGS) {
+				struct sk_buff *tskb;
+
+				tskb = alloc_skb(0, sk->sk_allocation);
+				if (!tskb)
+					goto wait_for_memory;
+
+				if (head == skb)
+					skb_shinfo(head)->frag_list = tskb;
+				else
+					skb->next = tskb;
+
+				skb = tskb;
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				continue;
+			}
+			merge = false;
+		}
+
+		copy = min_t(int, msg_data_left(msg),
+			     pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+
+		err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+					       pfrag->page,
+					       pfrag->offset,
+					       copy);
+		if (err)
+			goto out_error;
+
+		/* Update the skb. */
+		if (merge) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+		} else {
+			skb_fill_page_desc(skb, i, pfrag->page,
+					   pfrag->offset, copy);
+			get_page(pfrag->page);
+		}
+
+		pfrag->offset += copy;
+		copied += copy;
+		if (head != skb) {
+			head->len += copy;
+			head->data_len += copy;
+		}
+
+		continue;
+
+wait_for_memory:
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+
+		if (msg->msg_flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+partial_message:
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	release_sock(sk);
+	return copied;
+
+out_error:
+	kcm_push(kcm);
+
+	if (copied && sock->type == SOCK_SEQPACKET) {
+		/* Wrote some bytes before encountering an
+		 * error, return partial success.
+		 */
+		goto partial_message;
+	}
+
+	if (head != kcm->seq_skb)
+		kfree_skb(head);
+
+	err = sk_stream_error(sk, msg->msg_flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
+/* Wait until the receive queue of @sk has a message and return it without
+ * dequeuing. Returns NULL when nothing can be returned: a pending socket
+ * error (stored in *err), SOCK_DONE set, a non-blocking call or exhausted
+ * timeout (-EAGAIN), or a pending signal.
+ */
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	for (;;) {
+		struct sk_buff *first = skb_peek(&sk->sk_receive_queue);
+
+		if (first)
+			return first;
+
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			break;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			break;
+		}
+
+		sk_wait_data(sk, &timeo, NULL);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+/* recvmsg handler for KCM sockets. Copies up to @len bytes of the first
+ * queued message to @msg. Unless MSG_PEEK is set, a fully read message is
+ * dequeued with MSG_EOR; a partially read one is dequeued with MSG_TRUNC on
+ * SOCK_DGRAM sockets and otherwise advanced so that the next call continues
+ * with it. Returns bytes copied, or the error if nothing was copied.
+ */
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+		       size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_rx_msg *rxm;
+	struct sk_buff *skb;
+	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	int copied = 0;
+	int err = 0;
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto unlock;
+
+	/* Okay, have a message on the receive queue */
+	rxm = kcm_rx_msg(skb);
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+	if (err < 0)
+		goto unlock;
+
+	copied = len;
+	if (unlikely(flags & MSG_PEEK))
+		goto unlock;
+
+	if (copied < rxm->full_len && sock->type != SOCK_DGRAM) {
+		/* Partial read: keep the message and move past what was read */
+		rxm->offset += copied;
+		rxm->full_len -= copied;
+		goto unlock;
+	}
+
+	if (copied < rxm->full_len) {
+		/* Truncated message */
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Finished with message */
+	msg->msg_flags |= MSG_EOR;
+	skb_unlink(skb, &sk->sk_receive_queue);
+	kfree_skb(skb);
+
+unlock:
+	release_sock(sk);
+
+	return copied ? : err;
+}
+
+/* Stop delivering received messages to @kcm. Unless a psock currently has
+ * this kcm reserved for rx, the kcm is taken off the rx waiter list and its
+ * queued messages are requeued. kcm sock lock held.
+ */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 1;
+
+	/* If a psock is reserved we'll do cleanup in unreserve */
+	if (kcm->rx_psock)
+		goto unlock;
+
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+
+	requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+
+unlock:
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Re-enable receive on @kcm and announce it as ready under the mux rx lock.
+ * No-op when receive is not disabled. kcm sock lock held.
+ */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux;
+
+	if (!kcm->rx_disabled)
+		return;
+
+	mux = kcm->mux;
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm->rx_disabled = 0;
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* setsockopt handler for SOL_KCM. The only option handled is
+ * KCM_RECV_DISABLE, which takes an int: non-zero disables receive on this
+ * kcm socket, zero enables it.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or unknown option,
+ * -EINVAL if @optlen is smaller than an int, and -EFAULT if the value
+ * cannot be read from user space.
+ */
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, valbool;
+	int err = 0;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	/* A bad user pointer is a fault, not an invalid argument */
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		lock_sock(&kcm->sk);
+		if (valbool)
+			kcm_recv_disable(kcm);
+		else
+			kcm_recv_enable(kcm);
+		release_sock(&kcm->sk);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+	return err;
+}
+
+/* getsockopt handler for SOL_KCM. The only option handled is
+ * KCM_RECV_DISABLE, which reports kcm->rx_disabled as an int. At most
+ * sizeof(int) bytes are copied out and the length actually used is written
+ * back through @optlen.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or unknown option,
+ * -EINVAL for a negative length, and -EFAULT on a user-space access fault.
+ */
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, len;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/* Reject a negative length before the unsigned clamp below, which
+	 * would otherwise turn it into sizeof(int).
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		val = kcm->rx_disabled;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Set up a new kcm socket and attach it to @mux: it is inserted into the
+ * mux's kcm list at the first index not yet in use, its tx work is
+ * initialized, and it is announced as ready to receive.
+ */
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+	struct list_head *insert_after = &mux->kcm_socks;
+	struct kcm_sock *cur;
+	int idx = 0;
+
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * POLLHUP
+	 */
+	kcm->sk.sk_state = TCP_ESTABLISHED;
+
+	kcm->mux = mux;
+
+	/* Add to mux's kcm sockets list, after the last entry of the
+	 * contiguous run of indices starting at zero.
+	 */
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(cur, &mux->kcm_socks, kcm_sock_list) {
+		if (cur->index != idx)
+			break;
+		insert_after = &cur->kcm_sock_list;
+		idx++;
+	}
+
+	kcm->index = idx;
+	list_add(&kcm->kcm_sock_list, insert_after);
+	mux->kcm_socks_cnt++;
+
+	spin_unlock_bh(&mux->lock);
+
+	INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Attach the lower socket @csock to the mux of the kcm socket @sock, using
+ * @prog to delineate messages. A psock is allocated for the lower socket,
+ * the lower socket's callbacks are redirected to the psock handlers, and
+ * the psock is added to the mux and made available for transmit.
+ *
+ * Returns 0 on success, -EINVAL if @csock is not an IPv4/IPv6 TCP socket,
+ * -ENOMEM if the psock cannot be allocated, and -EALREADY if the lower
+ * socket's sk_user_data is already in use. On failure no reference on
+ * @csock or @prog is consumed.
+ */
+static int kcm_attach(struct socket *sock, struct socket *csock,
+		      struct bpf_prog *prog)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *csk;
+	struct kcm_psock *psock = NULL, *tpsock;
+	struct list_head *head;
+	int index = 0;
+
+	if (csock->ops->family != PF_INET &&
+	    csock->ops->family != PF_INET6)
+		return -EINVAL;
+
+	csk = csock->sk;
+	if (!csk)
+		return -EINVAL;
+
+	/* Only support TCP for now */
+	if (csk->sk_protocol != IPPROTO_TCP)
+		return -EINVAL;
+
+	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+	if (!psock)
+		return -ENOMEM;
+
+	psock->mux = mux;
+	psock->sk = csk;
+	psock->bpf_prog = prog;
+	INIT_WORK(&psock->rx_work, psock_rx_work);
+	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+	write_lock_bh(&csk->sk_callback_lock);
+
+	/* Refuse a lower socket whose sk_user_data is already taken (for
+	 * instance by an earlier attach). Overwriting it would save our own
+	 * handlers as the "original" callbacks and leave the previous owner
+	 * with a dangling socket. Checked under the callback lock so two
+	 * concurrent attaches cannot both pass.
+	 */
+	if (csk->sk_user_data) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		kmem_cache_free(kcm_psockp, psock);
+		return -EALREADY;
+	}
+
+	sock_hold(csk);
+
+	psock->save_data_ready = csk->sk_data_ready;
+	psock->save_write_space = csk->sk_write_space;
+	psock->save_state_change = csk->sk_state_change;
+	csk->sk_user_data = psock;
+	csk->sk_data_ready = psock_tcp_data_ready;
+	csk->sk_write_space = psock_tcp_write_space;
+	csk->sk_state_change = psock_tcp_state_change;
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	/* Finished initialization, now add the psock to the MUX. */
+	spin_lock_bh(&mux->lock);
+	head = &mux->psocks;
+	list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+		if (tpsock->index != index)
+			break;
+		head = &tpsock->psock_list;
+		index++;
+	}
+
+	list_add(&psock->psock_list, head);
+	psock->index = index;
+
+	mux->psocks_cnt++;
+	psock_now_avail(psock);
+	spin_unlock_bh(&mux->lock);
+
+	/* Schedule RX work in case there are already bytes queued */
+	queue_work(kcm_wq, &psock->rx_work);
+
+	return 0;
+}
+
+/* SIOCKCMATTACH backend: look up the socket and BPF program named by the
+ * file descriptors in @info and attach them to @sock. On success the
+ * references on the socket's file and on the program are kept; on failure
+ * both are dropped.
+ */
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+	struct bpf_prog *prog;
+	struct socket *csock;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	prog = bpf_prog_get(info->bpf_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto put_sock;
+	}
+
+	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		err = -EINVAL;
+		goto put_prog;
+	}
+
+	err = kcm_attach(sock, csock, prog);
+	if (err)
+		goto put_prog;
+
+	/* Keep reference on file also */
+
+	return 0;
+
+put_prog:
+	bpf_prog_put(prog);
+put_sock:
+	fput(csock->file);
+	return err;
+}
+/* Detach @psock from its lower socket and from the mux. The lower socket's
+ * saved callbacks are restored, a held ready rx message is dropped, rx work
+ * is cancelled, the BPF program reference is released and a partially
+ * assembled rx message is freed. If a kcm still has the psock reserved for
+ * transmit, the psock is aborted and marked done so the kcm path frees it
+ * later; otherwise it is removed from the mux lists and freed here.
+ */
+static void kcm_unattach(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Stop getting callbacks from TCP socket. After this there should
+	 * be no way to reserve a kcm for this psock.
+	 */
+	write_lock_bh(&csk->sk_callback_lock);
+	csk->sk_user_data = NULL;
+	csk->sk_data_ready = psock->save_data_ready;
+	csk->sk_write_space = psock->save_write_space;
+	csk->sk_state_change = psock->save_state_change;
+	psock->rx_stopped = 1;
+
+	if (WARN_ON(psock->rx_kcm)) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		return;
+	}
+
+	spin_lock_bh(&mux->rx_lock);
+
+	/* Stop receiver activities. After this point psock should not be
+	 * able to get onto ready list either through callbacks or work.
+	 */
+	if (psock->ready_rx_msg) {
+		list_del(&psock->psock_ready_list);
+		kfree_skb(psock->ready_rx_msg);
+		psock->ready_rx_msg = NULL;
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	cancel_work_sync(&psock->rx_work);
+	cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+	bpf_prog_put(psock->bpf_prog);
+
+	kfree_skb(psock->rx_skb_head);
+	psock->rx_skb_head = NULL;
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_kcm) {
+		/* psock was reserved.  Just mark it finished and we will clean
+		 * up in the kcm paths, we need kcm lock which can not be
+		 * acquired here.
+		 */
+		spin_unlock_bh(&mux->lock);
+
+		/* We are unattaching a socket that is reserved. Abort the
+		 * socket since we may be out of sync in sending on it. We need
+		 * to do this without the mux lock.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+
+		spin_lock_bh(&mux->lock);
+		if (!psock->tx_kcm) {
+			/* psock now unreserved in window mux was unlocked */
+			goto no_reserved;
+		}
+		psock->done = 1;
+
+		/* Commit done before queuing work to process it */
+		smp_mb();
+
+		/* Queue tx work to make sure psock->done is handled */
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+		spin_unlock_bh(&mux->lock);
+	} else {
+no_reserved:
+		if (!psock->tx_stopped)
+			list_del(&psock->psock_avail_list);
+		list_del(&psock->psock_list);
+		mux->psocks_cnt--;
+		spin_unlock_bh(&mux->lock);
+
+		sock_put(csk);
+		fput(csk->sk_socket->file);
+		kmem_cache_free(kcm_psockp, psock);
+	}
+}
+
+/* SIOCKCMUNATTACH backend: find the psock on this kcm's mux whose lower
+ * socket is the one named by info->fd and unattach it.
+ *
+ * Returns 0 on success, -ENOENT if the descriptor is not a socket or no
+ * psock matches, -EINVAL if the socket has no sock, and -EALREADY if the
+ * matching psock is already being unattached.
+ */
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock, *found = NULL;
+	struct socket *csock;
+	struct sock *csk;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	csk = csock->sk;
+	if (!csk) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		if (psock->sk == csk) {
+			/* Found the matching psock */
+			found = psock;
+			break;
+		}
+	}
+
+	if (!found) {
+		err = -ENOENT;
+	} else if (found->unattaching || WARN_ON(found->done)) {
+		err = -EALREADY;
+		found = NULL;
+	} else {
+		found->unattaching = 1;
+		err = 0;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	/* The actual unattach runs without the mux lock held */
+	if (found)
+		kcm_unattach(found);
+
+out:
+	fput(csock->file);
+	return err;
+}
+
+/* Protocol descriptor for KCM socks; each sock is a struct kcm_sock. */
+static struct proto kcm_proto = {
+	.obj_size	= sizeof(struct kcm_sock),
+	.owner		= THIS_MODULE,
+	.name		= "KCM",
+};
+
+/* Clone a kcm socket: create a new socket of the same type and ops as
+ * @osock, attach it to the same mux, and install it on a new file
+ * descriptor. On success the new socket is stored in *@newsockp, the new
+ * descriptor in info->fd, and 0 is returned; otherwise a negative error is
+ * returned and nothing is left allocated.
+ */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+		     struct socket **newsockp)
+{
+	struct socket *newsock;
+	struct sock *newsk;
+	struct file *newfile;
+	int err, newfd;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out;
+
+	newsock->type = osock->type;
+	newsock->ops = osock->ops;
+
+	__module_get(newsock->ops->owner);
+
+	newfd = get_unused_fd_flags(0);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		goto out_fd_fail;
+	}
+
+	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		goto out_sock_alloc_fail;
+	}
+
+	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+			 &kcm_proto, true);
+	if (!newsk) {
+		err = -ENOMEM;
+		goto out_sk_alloc_fail;
+	}
+
+	sock_init_data(newsock, newsk);
+	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+	fd_install(newfd, newfile);
+	*newsockp = newsock;
+	info->fd = newfd;
+
+	return 0;
+
+out_sk_alloc_fail:
+	/* newsock is attached to newfile at this point, so dropping the
+	 * file reference releases the socket through the file's release
+	 * path. Do not fall through to sock_release(), which would release
+	 * it a second time.
+	 */
+	fput(newfile);
+	put_unused_fd(newfd);
+	return err;
+out_sock_alloc_fail:
+	put_unused_fd(newfd);
+out_fd_fail:
+	sock_release(newsock);
+out:
+	return err;
+}
+
+/* ioctl handler for KCM sockets. Supports SIOCKCMATTACH, SIOCKCMUNATTACH
+ * and SIOCKCMCLONE, each taking a pointer to its argument structure in
+ * @arg. Returns -EFAULT if the argument cannot be copied from or to user
+ * space, -ENOIOCTLCMD for any other command, and otherwise the result of
+ * the requested operation.
+ */
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCKCMATTACH: {
+		struct kcm_attach info;
+
+		/* Bail out right away: acting on a partially copied info
+		 * would use uninitialized file descriptors.
+		 */
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_attach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMUNATTACH: {
+		struct kcm_unattach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_unattach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMCLONE: {
+		struct kcm_clone info;
+		struct socket *newsock = NULL;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_clone(sock, &info, &newsock);
+
+		if (!err) {
+			/* NOTE(review): kcm_clone() has already installed
+			 * the new descriptor when this copy fails, so the
+			 * socket released here is presumably still reachable
+			 * through info.fd - confirm and rework if so.
+			 */
+			if (copy_to_user((void __user *)arg, &info,
+					 sizeof(info))) {
+				err = -EFAULT;
+				sock_release(newsock);
+			}
+		}
+
+		break;
+	}
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+
+/* RCU callback: return the mux embedding @rcu to its slab cache. */
+static void free_mux(struct rcu_head *rcu)
+{
+	kmem_cache_free(kcm_muxp, container_of(rcu, struct kcm_mux, rcu));
+}
+
+/* Tear down @mux: unattach every psock still on it, purge the rx hold
+ * queue, unlink the mux from its per-net list and free it after an RCU
+ * grace period. Bails out with a warning if psocks remain afterwards.
+ */
+static void release_mux(struct kcm_mux *mux)
+{
+	struct kcm_psock *psock, *next;
+	struct kcm_net *knet = mux->knet;
+
+	/* Release psocks */
+	list_for_each_entry_safe(psock, next, &mux->psocks, psock_list) {
+		if (WARN_ON(psock->unattaching))
+			continue;
+
+		kcm_unattach(psock);
+	}
+
+	if (WARN_ON(mux->psocks_cnt))
+		return;
+
+	__skb_queue_purge(&mux->rx_hold_queue);
+
+	mutex_lock(&knet->mutex);
+	knet->count--;
+	list_del_rcu(&mux->kcm_mux_list);
+	mutex_unlock(&knet->mutex);
+
+	call_rcu(&mux->rcu, free_mux);
+}
+
+/* Final teardown of a kcm socket. If a psock still has the kcm reserved
+ * for rx, the kcm is only marked done and the rest is left to
+ * unreserve_rx_kcm(). Otherwise pending receive messages are requeued, the
+ * kcm is removed from the mux (releasing the mux if it was the last kcm),
+ * and the sock reference is dropped.
+ */
+static void kcm_done(struct kcm_sock *kcm)
+{
+	struct sock *sk = &kcm->sk;
+	struct kcm_mux *mux = kcm->mux;
+	int remaining;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	if (kcm->rx_psock) {
+		/* Cleanup in unreserve_rx_kcm */
+		WARN_ON(kcm->done);
+		kcm->rx_disabled = 1;
+		kcm->done = 1;
+		spin_unlock_bh(&mux->rx_lock);
+		return;
+	}
+
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+
+	/* Move any pending receive messages to other kcm sockets */
+	requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	if (WARN_ON(sk_rmem_alloc_get(sk)))
+		return;
+
+	/* Detach from MUX */
+	spin_lock_bh(&mux->lock);
+	list_del(&kcm->kcm_sock_list);
+	remaining = --mux->kcm_socks_cnt;
+	spin_unlock_bh(&mux->lock);
+
+	/* We are done with the mux once no kcm socket is left on it. */
+	if (remaining == 0)
+		release_mux(mux);
+
+	WARN_ON(kcm->rx_wait);
+
+	sock_put(&kcm->sk);
+}
+
+/* Called by kcm_release to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ */
+static int kcm_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+
+	if (!sk)
+		return 0;
+
+	kcm = kcm_sk(sk);
+	mux = kcm->mux;
+
+	sock_orphan(sk);
+	kfree_skb(kcm->seq_skb);
+
+	lock_sock(sk);
+	/* Purge queue under lock to avoid race condition with tx_work trying
+	 * to act when queue is nonempty. If tx_work runs after this point
+	 * it will just return.
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+	release_sock(sk);
+
+	spin_lock_bh(&mux->lock);
+	if (kcm->tx_wait) {
+		/* Take off tx_wait list; after this point there should be no
+		 * way that a psock will be assigned to this kcm.
+		 */
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+	}
+	spin_unlock_bh(&mux->lock);
+
+	/* Cancel work. After this point there should be no outside references
+	 * to the kcm socket.
+	 */
+	cancel_work_sync(&kcm->tx_work);
+
+	lock_sock(sk);
+	psock = kcm->tx_psock;
+	if (psock) {
+		/* A psock was reserved, so we need to kill it since it
+		 * may already have some bytes queued from a message. We
+		 * need to do this after removing kcm from tx_wait list.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+		unreserve_psock(kcm);
+	}
+	release_sock(sk);
+
+	WARN_ON(kcm->tx_wait);
+	WARN_ON(kcm->tx_psock);
+
+	sock->sk = NULL;
+
+	kcm_done(kcm);
+
+	return 0;
+}
+
+static const struct proto_ops kcm_ops = {
+	.family =	PF_KCM,
+	.owner =	THIS_MODULE,
+	.release =	kcm_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		datagram_poll,
+	.ioctl =	kcm_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	kcm_setsockopt,
+	.getsockopt =	kcm_getsockopt,
+	.sendmsg =	kcm_sendmsg,
+	.recvmsg =	kcm_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Create proto operation for kcm sockets */
+static int kcm_create(struct net *net, struct socket *sock,
+		      int protocol, int kern)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct sock *sk;
+	struct kcm_mux *mux;
+
+	switch (sock->type) {
+	case SOCK_DGRAM:
+	case SOCK_SEQPACKET:
+		sock->ops = &kcm_ops;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	if (protocol != KCMPROTO_CONNECTED)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	/* Allocate a kcm mux, shared between KCM sockets */
+	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+	if (!mux) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&mux->lock);
+	spin_lock_init(&mux->rx_lock);
+	INIT_LIST_HEAD(&mux->kcm_socks);
+	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+	INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+	INIT_LIST_HEAD(&mux->psocks);
+	INIT_LIST_HEAD(&mux->psocks_ready);
+	INIT_LIST_HEAD(&mux->psocks_avail);
+
+	mux->knet = knet;
+
+	/* Add new MUX to list */
+	mutex_lock(&knet->mutex);
+	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+	knet->count++;
+	mutex_unlock(&knet->mutex);
+
+	skb_queue_head_init(&mux->rx_hold_queue);
+
+	/* Init KCM socket */
+	sock_init_data(sock, sk);
+	init_kcm_sock(kcm_sk(sk), mux);
+
+	return 0;
+}
+
+static struct net_proto_family kcm_family_ops = {
+	.family = PF_KCM,
+	.create = kcm_create,
+	.owner  = THIS_MODULE,
+};
+
+static __net_init int kcm_init_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	INIT_LIST_HEAD_RCU(&knet->mux_list);
+	mutex_init(&knet->mutex);
+
+	return 0;
+}
+
+static __net_exit void kcm_exit_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	/* All KCM sockets should be closed at this point, which should mean
+	 * that all multiplexors and psocks have been destroyed.
+	 */
+	WARN_ON(!list_empty(&knet->mux_list));
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_init_net,
+	.exit = kcm_exit_net,
+	.id   = &kcm_net_id,
+	.size = sizeof(struct kcm_net),
+};
+
+static int __init kcm_init(void)
+{
+	int err = -ENOMEM;
+
+	kcm_muxp = kmem_cache_create("kcm_mux_cache",
+				     sizeof(struct kcm_mux), 0,
+				     SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_muxp)
+		goto fail;
+
+	kcm_psockp = kmem_cache_create("kcm_psock_cache",
+				       sizeof(struct kcm_psock), 0,
+					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_psockp)
+		goto fail;
+
+	kcm_wq = create_singlethread_workqueue("kkcmd");
+	if (!kcm_wq)
+		goto fail;
+
+	err = proto_register(&kcm_proto, 1);
+	if (err)
+		goto fail;
+
+	err = sock_register(&kcm_family_ops);
+	if (err)
+		goto sock_register_fail;
+
+	err = register_pernet_device(&kcm_net_ops);
+	if (err)
+		goto net_ops_fail;
+
+	return 0;
+
+net_ops_fail:
+	sock_unregister(PF_KCM);
+
+sock_register_fail:
+	proto_unregister(&kcm_proto);
+
+fail:
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+
+	if (kcm_wq)
+		destroy_workqueue(kcm_wq);
+
+	return err;
+}
+
+static void __exit kcm_exit(void)
+{
+	unregister_pernet_device(&kcm_net_ops);
+	sock_unregister(PF_KCM);
+	proto_unregister(&kcm_proto);
+	destroy_workqueue(kcm_wq);
+
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
-- 
cgit v1.2.3


From cd6e111bf5be5c70aef96a86d791ee7be0c0e137 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:07 -0800
Subject: kcm: Add statistics and proc interfaces

This patch adds various counters for KCM. These include counters for
messages and bytes received or sent, as well as counters for number of
attached/unattached TCP sockets and other error or edge events.

The statistics are exposed via a proc interface. /proc/net/kcm provides
statistics per KCM socket and per psock (attached TCP sockets).
/proc/net/kcm_stats provides aggregate statistics.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  94 ++++++++++++
 net/kcm/Makefile  |   2 +-
 net/kcm/kcmproc.c | 422 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/kcm/kcmsock.c |  80 +++++++++++
 4 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 net/kcm/kcmproc.c

(limited to 'include')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 1bcae39070ec..39c7abe98552 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -17,6 +17,42 @@
 
 extern unsigned int kcm_net_id;
 
+#define KCM_STATS_ADD(stat, count) ((stat) += (count))
+#define KCM_STATS_INCR(stat) ((stat)++)
+
+struct kcm_psock_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+	unsigned int rx_aborts;
+	unsigned int rx_mem_fail;
+	unsigned int rx_need_more_hdr;
+	unsigned int rx_bad_hdr_len;
+	unsigned long long reserved;
+	unsigned long long unreserved;
+	unsigned int tx_aborts;
+};
+
+struct kcm_mux_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+	unsigned int rx_ready_drops;
+	unsigned int tx_retries;
+	unsigned int psock_attach;
+	unsigned int psock_unattach_rsvd;
+	unsigned int psock_unattach;
+};
+
+struct kcm_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+};
+
 struct kcm_tx_msg {
 	unsigned int sent;
 	unsigned int fragidx;
@@ -41,6 +77,8 @@ struct kcm_sock {
 	u32 done : 1;
 	struct work_struct done_work;
 
+	struct kcm_stats stats;
+
 	/* Transmit */
 	struct kcm_psock *tx_psock;
 	struct work_struct tx_work;
@@ -77,6 +115,8 @@ struct kcm_psock {
 
 	struct list_head psock_list;
 
+	struct kcm_psock_stats stats;
+
 	/* Receive */
 	struct sk_buff *rx_skb_head;
 	struct sk_buff **rx_skb_nextp;
@@ -86,15 +126,21 @@ struct kcm_psock {
 	struct delayed_work rx_delayed_work;
 	struct bpf_prog *bpf_prog;
 	struct kcm_sock *rx_kcm;
+	unsigned long long saved_rx_bytes;
+	unsigned long long saved_rx_msgs;
 
 	/* Transmit */
 	struct kcm_sock *tx_kcm;
 	struct list_head psock_avail_list;
+	unsigned long long saved_tx_bytes;
+	unsigned long long saved_tx_msgs;
 };
 
 /* Per net MUX list */
 struct kcm_net {
 	struct mutex mutex;
+	struct kcm_psock_stats aggregate_psock_stats;
+	struct kcm_mux_stats aggregate_mux_stats;
 	struct list_head mux_list;
 	int count;
 };
@@ -110,6 +156,9 @@ struct kcm_mux {
 	struct list_head psocks;	/* List of all psocks on MUX */
 	int psocks_cnt;		/* Total attached sockets */
 
+	struct kcm_mux_stats stats;
+	struct kcm_psock_stats aggregate_psock_stats;
+
 	/* Receive */
 	spinlock_t rx_lock ____cacheline_aligned_in_smp;
 	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
@@ -122,4 +171,49 @@ struct kcm_mux {
 	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
 };
 
+#ifdef CONFIG_PROC_FS
+int kcm_proc_init(void);
+void kcm_proc_exit(void);
+#else /* !CONFIG_PROC_FS: no-op stubs so callers need no #ifdef */
+static inline int kcm_proc_init(void) { return 0; }
+static inline void kcm_proc_exit(void) { }
+#endif /* CONFIG_PROC_FS */
+
+static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
+					 struct kcm_psock_stats *agg_stats)
+{
+	/* Save psock statistics in the mux when psock is being unattached. */
+
+#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat)
+	SAVE_PSOCK_STATS(rx_msgs);
+	SAVE_PSOCK_STATS(rx_bytes);
+	SAVE_PSOCK_STATS(rx_aborts);
+	SAVE_PSOCK_STATS(rx_mem_fail);
+	SAVE_PSOCK_STATS(rx_need_more_hdr);
+	SAVE_PSOCK_STATS(rx_bad_hdr_len);
+	SAVE_PSOCK_STATS(tx_msgs);
+	SAVE_PSOCK_STATS(tx_bytes);
+	SAVE_PSOCK_STATS(reserved);
+	SAVE_PSOCK_STATS(unreserved);
+	SAVE_PSOCK_STATS(tx_aborts);
+#undef SAVE_PSOCK_STATS
+}
+
+static inline void aggregate_mux_stats(struct kcm_mux_stats *stats,
+				       struct kcm_mux_stats *agg_stats)
+{
+	/* Accumulate every counter of @stats into @agg_stats. */
+#define SAVE_MUX_STATS(_stat) (agg_stats->_stat += stats->_stat)
+	SAVE_MUX_STATS(rx_msgs);
+	SAVE_MUX_STATS(rx_bytes);
+	SAVE_MUX_STATS(tx_msgs);
+	SAVE_MUX_STATS(tx_bytes);
+	SAVE_MUX_STATS(rx_ready_drops);
+	SAVE_MUX_STATS(tx_retries);
+	SAVE_MUX_STATS(psock_attach);
+	SAVE_MUX_STATS(psock_unattach_rsvd);
+	SAVE_MUX_STATS(psock_unattach);
+#undef SAVE_MUX_STATS
+}
+
 #endif /* __NET_KCM_H_ */
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
index cb525f7c5a13..71256133e677 100644
--- a/net/kcm/Makefile
+++ b/net/kcm/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_AF_KCM) += kcm.o
 
-kcm-y := kcmsock.o
+kcm-y := kcmsock.o kcmproc.o
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
new file mode 100644
index 000000000000..5eb9809c0f59
--- /dev/null
+++ b/net/kcm/kcmproc.c
@@ -0,0 +1,422 @@
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/rculist.h>
+#include <linux/seq_file.h>
+#include <linux/socket.h>
+#include <net/inet_sock.h>
+#include <net/kcm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_PROC_FS
+struct kcm_seq_muxinfo {
+	char				*name;
+	const struct file_operations	*seq_fops;
+	const struct seq_operations	seq_ops;
+};
+
+static struct kcm_mux *kcm_get_first(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	return list_first_or_null_rcu(&knet->mux_list,
+				      struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_next(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+
+	return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list,
+				     struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct kcm_mux *m;
+
+	list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) {
+		if (!pos)
+			return m;
+		--pos;
+	}
+	return NULL;
+}
+
+static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	void *p;
+
+	if (v == SEQ_START_TOKEN)
+		p = kcm_get_first(seq);
+	else
+		p = kcm_get_next(v);
+	++*pos;
+	return p;
+}
+
+static void *kcm_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+	else
+		return kcm_get_idx(seq, *pos - 1);
+}
+
+static void kcm_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+struct kcm_proc_mux_state {
+	struct seq_net_private p;
+	int idx;
+};
+
+static int kcm_seq_open(struct inode *inode, struct file *file)
+{
+	struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
+
+	/* Hand seq_open_net()'s result straight back.  The per-open private
+	 * area is a struct kcm_proc_mux_state, whose idx field is maintained
+	 * by kcm_seq_show().
+	 */
+	return seq_open_net(inode, file, &muxinfo->seq_ops,
+			    sizeof(struct kcm_proc_mux_state));
+}
+
+static void kcm_format_mux_header(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	seq_printf(seq,
+		   "*** KCM statistics (%d MUX) ****\n",
+		   knet->count);
+
+	seq_printf(seq,
+		   "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s",
+		   "Object",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "Recv-Q",
+		   "Rmem",
+		   "Send-Q",
+		   "Smem",
+		   "Status");
+
+	/* XXX: pdsts header stuff here */
+	seq_puts(seq, "\n");
+}
+
+static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq,
+			    int i, int *len)
+{
+	seq_printf(seq,
+		   "   kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ",
+		   kcm->index,
+		   kcm->stats.rx_msgs,
+		   kcm->stats.rx_bytes,
+		   kcm->stats.tx_msgs,
+		   kcm->stats.tx_bytes,
+		   kcm->sk.sk_receive_queue.qlen,
+		   sk_rmem_alloc_get(&kcm->sk),
+		   kcm->sk.sk_write_queue.qlen,
+		   "-");
+
+	if (kcm->tx_psock)
+		seq_printf(seq, "Psck-%u ", kcm->tx_psock->index);
+
+	if (kcm->tx_wait)
+		seq_puts(seq, "TxWait ");
+
+	if (kcm->tx_wait_more)
+		seq_puts(seq, "WMore ");
+
+	if (kcm->rx_wait)
+		seq_puts(seq, "RxWait ");
+
+	seq_puts(seq, "\n");
+}
+
+static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
+			     int i, int *len)
+{
+	seq_printf(seq,
+		   "   psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
+		   psock->index,
+		   psock->stats.rx_msgs,
+		   psock->stats.rx_bytes,
+		   psock->stats.tx_msgs,
+		   psock->stats.tx_bytes,
+		   psock->sk->sk_receive_queue.qlen,
+		   atomic_read(&psock->sk->sk_rmem_alloc),
+		   psock->sk->sk_write_queue.qlen,
+		   atomic_read(&psock->sk->sk_wmem_alloc));
+
+	if (psock->done)
+		seq_puts(seq, "Done ");
+
+	if (psock->tx_stopped)
+		seq_puts(seq, "TxStop ");
+
+	if (psock->rx_stopped)
+		seq_puts(seq, "RxStop ");
+
+	if (psock->tx_kcm)
+		seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
+
+	if (psock->ready_rx_msg)
+		seq_puts(seq, "RdyRx ");
+
+	seq_puts(seq, "\n");
+}
+
+static void
+kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq)
+{
+	int i, len;
+	struct kcm_sock *kcm;
+	struct kcm_psock *psock;
+
+	/* mux information */
+	seq_printf(seq,
+		   "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ",
+		   "mux", "",
+		   mux->stats.rx_msgs,
+		   mux->stats.rx_bytes,
+		   mux->stats.tx_msgs,
+		   mux->stats.tx_bytes,
+		   "-", "-", "-", "-");
+
+	seq_printf(seq, "KCMs: %d, Psocks %d\n",
+		   mux->kcm_socks_cnt, mux->psocks_cnt);
+
+	/* kcm sock information */
+	i = 0;
+	spin_lock_bh(&mux->lock);
+	list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) {
+		kcm_format_sock(kcm, seq, i, &len);
+		i++;
+	}
+	i = 0;
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		kcm_format_psock(psock, seq, i, &len);
+		i++;
+	}
+	spin_unlock_bh(&mux->lock);
+}
+
+static int kcm_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_proc_mux_state *mux_state;
+
+	mux_state = seq->private;
+	if (v == SEQ_START_TOKEN) {
+		mux_state->idx = 0;
+		kcm_format_mux_header(seq);
+	} else {
+		kcm_format_mux(v, mux_state->idx, seq);
+		mux_state->idx++;
+	}
+	return 0;
+}
+
+static const struct file_operations kcm_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kcm_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,	/* undo seq_open_net() */
+};
+static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
+	.name		= "kcm",
+	.seq_fops	= &kcm_seq_fops,
+	.seq_ops	= {
+		.show	= kcm_seq_show,
+		.start	= kcm_seq_start,
+		.next	= kcm_seq_next,
+		.stop	= kcm_seq_stop,
+	}
+};
+
+/* Create the proc entry named muxinfo->name under net->proc_net, passing
+ * @muxinfo as the entry's private data.  Returns 0 on success or -ENOMEM.
+ */
+static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net,
+			     muxinfo->seq_fops, muxinfo);
+
+	return p ? 0 : -ENOMEM;
+}
+
+/* Remove the proc entry named muxinfo->name from net->proc_net. */
+static void kcm_proc_unregister(struct net *net,
+				struct kcm_seq_muxinfo *muxinfo)
+{
+	remove_proc_entry(muxinfo->name, net->proc_net);
+}
+
+/* Show handler for /proc/net/kcm_stats: sum the counters saved in the netns,
+ * in every listed MUX and in every psock on those MUXes, then print both sets.
+ */
+static int kcm_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_psock_stats psock_stats = { 0 };
+	struct kcm_mux_stats mux_stats = { 0 };
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+	struct net *net = seq->private;
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	mutex_lock(&knet->mutex);
+
+	aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
+	aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats);
+
+	list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+		spin_lock_bh(&mux->lock);
+		aggregate_mux_stats(&mux->stats, &mux_stats);
+		aggregate_psock_stats(&mux->aggregate_psock_stats,
+				      &psock_stats);
+		list_for_each_entry(psock, &mux->psocks, psock_list)
+			aggregate_psock_stats(&psock->stats, &psock_stats);
+		spin_unlock_bh(&mux->lock);
+	}
+
+	mutex_unlock(&knet->mutex);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n",
+		   "MUX",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "TX-Retries",
+		   "Attach",
+		   "Unattach",
+		   "UnattchRsvd",
+		   "RX-RdyDrops");
+
+	/* Arguments below must follow the column order of the header above. */
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   mux_stats.rx_msgs,
+		   mux_stats.rx_bytes,
+		   mux_stats.tx_msgs,
+		   mux_stats.tx_bytes,
+		   mux_stats.tx_retries,
+		   mux_stats.psock_attach,
+		   mux_stats.psock_unattach,
+		   mux_stats.psock_unattach_rsvd,
+		   mux_stats.rx_ready_drops);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "Psock",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "Reserved",
+		   "Unreserved",
+		   "RX-Aborts",
+		   "RX-MemFail",
+		   "RX-NeedMor",
+		   "RX-BadLen",
+		   "TX-Aborts");
+
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   psock_stats.rx_msgs,
+		   psock_stats.rx_bytes,
+		   psock_stats.tx_msgs,
+		   psock_stats.tx_bytes,
+		   psock_stats.reserved,
+		   psock_stats.unreserved,
+		   psock_stats.rx_aborts,
+		   psock_stats.rx_mem_fail,
+		   psock_stats.rx_need_more_hdr,
+		   psock_stats.rx_bad_hdr_len,
+		   psock_stats.tx_aborts);
+
+	return 0;
+}
+
+static int kcm_stats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, kcm_stats_seq_show);
+}
+
+static const struct file_operations kcm_stats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = kcm_stats_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release_net,
+};
+
+static int kcm_proc_init_net(struct net *net)
+{
+	int err;
+
+	if (!proc_create("kcm_stats", S_IRUGO, net->proc_net,
+			 &kcm_stats_seq_fops)) {
+		err = -ENOMEM;
+		goto out_kcm_stats;
+	}
+
+	err = kcm_proc_register(net, &kcm_seq_muxinfo);
+	if (err)
+		goto out_kcm;
+
+	return 0;
+
+out_kcm:
+	remove_proc_entry("kcm_stats", net->proc_net);
+out_kcm_stats:
+	return err;
+}
+
+static void kcm_proc_exit_net(struct net *net)
+{
+	kcm_proc_unregister(net, &kcm_seq_muxinfo);
+	remove_proc_entry("kcm_stats", net->proc_net);
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_proc_init_net,
+	.exit = kcm_proc_exit_net,
+};
+
+int __init kcm_proc_init(void)
+{
+	return register_pernet_subsys(&kcm_net_ops);
+}
+
+void __exit kcm_proc_exit(void)
+{
+	unregister_pernet_subsys(&kcm_net_ops);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 30ef69ac6b81..f938d7d3e6e2 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -59,6 +59,7 @@ static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
 		return;
 
 	psock->rx_stopped = 1;
+	KCM_STATS_INCR(psock->stats.rx_aborts);
 
 	/* Report an error on the lower socket */
 	report_csk_error(csk, err);
@@ -80,6 +81,7 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
 	}
 
 	psock->tx_stopped = 1;
+	KCM_STATS_INCR(psock->stats.tx_aborts);
 
 	if (!psock->tx_kcm) {
 		/* Take off psocks_avail list */
@@ -101,6 +103,29 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
 	report_csk_error(csk, err);
 }
 
+/* RX mux lock held. */
+static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	KCM_STATS_ADD(mux->stats.rx_bytes,
+		      psock->stats.rx_bytes - psock->saved_rx_bytes);
+	mux->stats.rx_msgs +=
+		psock->stats.rx_msgs - psock->saved_rx_msgs;
+	psock->saved_rx_msgs = psock->stats.rx_msgs;
+	psock->saved_rx_bytes = psock->stats.rx_bytes;
+}
+
+static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	KCM_STATS_ADD(mux->stats.tx_bytes,
+		      psock->stats.tx_bytes - psock->saved_tx_bytes);
+	mux->stats.tx_msgs +=
+		psock->stats.tx_msgs - psock->saved_tx_msgs;
+	psock->saved_tx_msgs = psock->stats.tx_msgs;
+	psock->saved_tx_bytes = psock->stats.tx_bytes;
+}
+
 static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 /* KCM is ready to receive messages on its queue-- either the KCM is new or
@@ -254,6 +279,8 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
 		return psock->rx_kcm;
 	}
 
+	kcm_update_rx_mux_stats(mux, psock);
+
 	if (list_empty(&mux->kcm_rx_waiters)) {
 		psock->ready_rx_msg = head;
 		list_add_tail(&psock->psock_ready_list,
@@ -356,10 +383,12 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
 			if (!orig_skb) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = -ENOMEM;
 				return 0;
 			}
 			if (!pskb_pull(orig_skb, orig_offset)) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				kfree_skb(orig_skb);
 				desc->error = -ENOMEM;
 				return 0;
@@ -374,6 +403,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			err = skb_unclone(head, GFP_ATOMIC);
 			if (err) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = err;
 				return 0;
 			}
@@ -392,6 +422,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 				skb = alloc_skb(0, GFP_ATOMIC);
 				if (!skb) {
+					KCM_STATS_INCR(psock->stats.rx_mem_fail);
 					desc->error = -ENOMEM;
 					return 0;
 				}
@@ -414,6 +445,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		/* Always clone since we will consume something */
 		skb = skb_clone(orig_skb, GFP_ATOMIC);
 		if (!skb) {
+			KCM_STATS_INCR(psock->stats.rx_mem_fail);
 			desc->error = -ENOMEM;
 			break;
 		}
@@ -435,6 +467,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			err = skb_unclone(skb, GFP_ATOMIC);
 			if (err) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = err;
 				break;
 			}
@@ -456,6 +489,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				/* Need more header to determine length */
 				rxm->accum_len += cand_len;
 				eaten += cand_len;
+				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
 				WARN_ON(eaten != orig_len);
 				break;
 			} else if (len <= (ssize_t)head->len -
@@ -463,6 +497,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				/* Length must be into new skb (and also
 				 * greater than zero)
 				 */
+				KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
 				desc->error = -EPROTO;
 				psock->rx_skb_head = NULL;
 				kcm_abort_rx_psock(psock, EPROTO, head);
@@ -492,6 +527,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 		/* Hurray, we have a new message! */
 		psock->rx_skb_head = NULL;
+		KCM_STATS_INCR(psock->stats.rx_msgs);
 
 try_queue:
 		kcm = reserve_rx_kcm(psock, head);
@@ -510,6 +546,8 @@ try_queue:
 	if (cloned_orig)
 		kfree_skb(orig_skb);
 
+	KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
+
 	return eaten;
 }
 
@@ -671,6 +709,7 @@ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
 		}
 		kcm->tx_psock = psock;
 		psock->tx_kcm = kcm;
+		KCM_STATS_INCR(psock->stats.reserved);
 	} else if (!kcm->tx_wait) {
 		list_add_tail(&kcm->wait_psock_list,
 			      &mux->kcm_tx_waiters);
@@ -705,6 +744,7 @@ static void psock_now_avail(struct kcm_psock *psock)
 		smp_mb();
 
 		kcm->tx_psock = psock;
+		KCM_STATS_INCR(psock->stats.reserved);
 		queue_work(kcm_wq, &kcm->tx_work);
 	}
 }
@@ -726,10 +766,13 @@ static void unreserve_psock(struct kcm_sock *kcm)
 
 	smp_rmb(); /* Read tx_psock before tx_wait */
 
+	kcm_update_tx_mux_stats(mux, psock);
+
 	WARN_ON(kcm->tx_wait);
 
 	kcm->tx_psock = NULL;
 	psock->tx_kcm = NULL;
+	KCM_STATS_INCR(psock->stats.unreserved);
 
 	if (unlikely(psock->tx_stopped)) {
 		if (psock->done) {
@@ -753,6 +796,15 @@ static void unreserve_psock(struct kcm_sock *kcm)
 	spin_unlock_bh(&mux->lock);
 }
 
+static void kcm_report_tx_retry(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+	KCM_STATS_INCR(mux->stats.tx_retries);
+	spin_unlock_bh(&mux->lock);
+}
+
 /* Write any messages ready on the kcm socket.  Called with kcm sock lock
  * held.  Return bytes actually sent or error.
  */
@@ -773,6 +825,7 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 		 * it and we'll retry the message.
 		 */
 		unreserve_psock(kcm);
+		kcm_report_tx_retry(kcm);
 		if (skb_queue_empty(&sk->sk_write_queue))
 			return 0;
 
@@ -856,6 +909,7 @@ do_frag:
 				unreserve_psock(kcm);
 
 				txm->sent = 0;
+				kcm_report_tx_retry(kcm);
 				ret = 0;
 
 				goto try_again;
@@ -863,6 +917,7 @@ do_frag:
 
 			sent += ret;
 			frag_offset += ret;
+			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
 			if (frag_offset < frag->size) {
 				/* Not finished with this frag */
 				goto do_frag;
@@ -884,6 +939,7 @@ do_frag:
 		kfree_skb(head);
 		sk->sk_wmem_queued -= sent;
 		total_sent += sent;
+		KCM_STATS_INCR(psock->stats.tx_msgs);
 	} while ((head = skb_peek(&sk->sk_write_queue)));
 out:
 	if (!head) {
@@ -1061,6 +1117,7 @@ wait_for_memory:
 		/* Message complete, queue it on send buffer */
 		__skb_queue_tail(&sk->sk_write_queue, head);
 		kcm->seq_skb = NULL;
+		KCM_STATS_INCR(kcm->stats.tx_msgs);
 
 		if (msg->msg_flags & MSG_BATCH) {
 			kcm->tx_wait_more = true;
@@ -1083,6 +1140,8 @@ partial_message:
 		kcm_tx_msg(head)->last_skb = skb;
 	}
 
+	KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
+
 	release_sock(sk);
 	return copied;
 
@@ -1144,6 +1203,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 		       size_t len, int flags)
 {
 	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
 	int err = 0;
 	long timeo;
 	struct kcm_rx_msg *rxm;
@@ -1171,6 +1231,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	copied = len;
 	if (likely(!(flags & MSG_PEEK))) {
+		KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
 		if (copied < rxm->full_len) {
 			if (sock->type == SOCK_DGRAM) {
 				/* Truncated message */
@@ -1183,6 +1244,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 msg_finished:
 			/* Finished with message */
 			msg->msg_flags |= MSG_EOR;
+			KCM_STATS_INCR(kcm->stats.rx_msgs);
 			skb_unlink(skb, &sk->sk_receive_queue);
 			kfree_skb(skb);
 		}
@@ -1394,6 +1456,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	list_add(&psock->psock_list, head);
 	psock->index = index;
 
+	KCM_STATS_INCR(mux->stats.psock_attach);
 	mux->psocks_cnt++;
 	psock_now_avail(psock);
 	spin_unlock_bh(&mux->lock);
@@ -1469,6 +1532,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 		list_del(&psock->psock_ready_list);
 		kfree_skb(psock->ready_rx_msg);
 		psock->ready_rx_msg = NULL;
+		KCM_STATS_INCR(mux->stats.rx_ready_drops);
 	}
 
 	spin_unlock_bh(&mux->rx_lock);
@@ -1485,11 +1549,16 @@ static void kcm_unattach(struct kcm_psock *psock)
 
 	spin_lock_bh(&mux->lock);
 
+	aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
+
+	KCM_STATS_INCR(mux->stats.psock_unattach);
+
 	if (psock->tx_kcm) {
 		/* psock was reserved.  Just mark it finished and we will clean
 		 * up in the kcm paths, we need kcm lock which can not be
 		 * acquired here.
 		 */
+		KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
 		spin_unlock_bh(&mux->lock);
 
 		/* We are unattaching a socket that is reserved. Abort the
@@ -1717,6 +1786,9 @@ static void release_mux(struct kcm_mux *mux)
 	__skb_queue_purge(&mux->rx_hold_queue);
 
 	mutex_lock(&knet->mutex);
+	aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
+	aggregate_psock_stats(&mux->aggregate_psock_stats,
+			      &knet->aggregate_psock_stats);
 	list_del_rcu(&mux->kcm_mux_list);
 	knet->count--;
 	mutex_unlock(&knet->mutex);
@@ -1979,8 +2051,15 @@ static int __init kcm_init(void)
 	if (err)
 		goto net_ops_fail;
 
+	err = kcm_proc_init();
+	if (err)
+		goto proc_init_fail;
+
 	return 0;
 
+proc_init_fail:
+	unregister_pernet_device(&kcm_net_ops);
+
 net_ops_fail:
 	sock_unregister(PF_KCM);
 
@@ -1999,6 +2078,7 @@ fail:
 
 static void __exit kcm_exit(void)
 {
+	kcm_proc_exit();
 	unregister_pernet_device(&kcm_net_ops);
 	sock_unregister(PF_KCM);
 	proto_unregister(&kcm_proto);
-- 
cgit v1.2.3


From 7ced95ef525c329f947c424859cf2b0a3b731f8c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:10 -0800
Subject: kcm: Add memory limit for receive message construction

Message assembly is performed on the TCP socket. This is logically
equivalent to an application that performs a peek on the socket to find
out how much memory is needed for a receive buffer. The receive socket
buffer also provides the maximum message size which is checked.

The receive algorithm is something like:

   1) Receive the first skbuf for a message (or skbufs if multiple are
      needed to determine message length).
   2) Check the message length against the number of bytes in the TCP
      receive queue (tcp_inq()).
	- If all the bytes of the message are in the queue (including the
	  skbuf received), then proceed with message assembly (it should
	  complete with the tcp_read_sock)
        - Else, mark the psock with the number of bytes needed to
	  complete the message.
   3) In TCP data ready function, if the psock indicates that we are
      waiting for the rest of the bytes of a message, check the number
      of queued bytes against that.
        - If there are still not enough bytes for the message, just
	  return
        - Else, clear the waiting bytes and proceed to receive the
	  skbufs.  The message should now be received in one
	  tcp_read_sock

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  4 ++++
 net/kcm/kcmproc.c |  6 ++++--
 net/kcm/kcmsock.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 39c7abe98552..d892956ff552 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -28,6 +28,7 @@ struct kcm_psock_stats {
 	unsigned int rx_aborts;
 	unsigned int rx_mem_fail;
 	unsigned int rx_need_more_hdr;
+	unsigned int rx_msg_too_big;
 	unsigned int rx_bad_hdr_len;
 	unsigned long long reserved;
 	unsigned long long unreserved;
@@ -66,6 +67,7 @@ struct kcm_rx_msg {
 	int full_len;
 	int accum_len;
 	int offset;
+	int early_eaten;
 };
 
 /* Socket structure for KCM client sockets */
@@ -128,6 +130,7 @@ struct kcm_psock {
 	struct kcm_sock *rx_kcm;
 	unsigned long long saved_rx_bytes;
 	unsigned long long saved_rx_msgs;
+	unsigned int rx_need_bytes;
 
 	/* Transmit */
 	struct kcm_sock *tx_kcm;
@@ -190,6 +193,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
 	SAVE_PSOCK_STATS(rx_aborts);
 	SAVE_PSOCK_STATS(rx_mem_fail);
 	SAVE_PSOCK_STATS(rx_need_more_hdr);
+	SAVE_PSOCK_STATS(rx_msg_too_big);
 	SAVE_PSOCK_STATS(rx_bad_hdr_len);
 	SAVE_PSOCK_STATS(tx_msgs);
 	SAVE_PSOCK_STATS(tx_bytes);
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 5eb9809c0f59..7638b3555b17 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -331,7 +331,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   mux_stats.rx_ready_drops);
 
 	seq_printf(seq,
-		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
 		   "Psock",
 		   "RX-Msgs",
 		   "RX-Bytes",
@@ -343,10 +343,11 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   "RX-MemFail",
 		   "RX-NeedMor",
 		   "RX-BadLen",
+		   "RX-TooBig",
 		   "TX-Aborts");
 
 	seq_printf(seq,
-		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u\n",
 		   "",
 		   psock_stats.rx_msgs,
 		   psock_stats.rx_bytes,
@@ -358,6 +359,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   psock_stats.rx_mem_fail,
 		   psock_stats.rx_need_more_hdr,
 		   psock_stats.rx_bad_hdr_len,
+		   psock_stats.rx_msg_too_big,
 		   psock_stats.tx_aborts);
 
 	return 0;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 9ac24995691c..8bc38d3fff9a 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -375,6 +375,19 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 	if (head) {
 		/* Message already in progress */
 
+		rxm = kcm_rx_msg(head);
+		if (unlikely(rxm->early_eaten)) {
+			/* Already some number of bytes on the receive sock
+			 * data saved in rx_skb_head, just indicate they
+			 * are consumed.
+			 */
+			eaten = orig_len <= rxm->early_eaten ?
+				orig_len : rxm->early_eaten;
+			rxm->early_eaten -= eaten;
+
+			return eaten;
+		}
+
 		if (unlikely(orig_offset)) {
 			/* Getting data with a non-zero offset when a message is
 			 * in progress is not expected. If it does happen, we
@@ -492,6 +505,13 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
 				WARN_ON(eaten != orig_len);
 				break;
+			} else if (len > psock->sk->sk_rcvbuf) {
+				/* Message length exceeds maximum allowed */
+				KCM_STATS_INCR(psock->stats.rx_msg_too_big);
+				desc->error = -EMSGSIZE;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EMSGSIZE, head);
+				break;
 			} else if (len <= (ssize_t)head->len -
 					  skb->len - rxm->offset) {
 				/* Length must be into new skb (and also
@@ -511,6 +531,23 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 		if (extra < 0) {
 			/* Message not complete yet. */
+			if (rxm->full_len - rxm->accum_len >
+			    tcp_inq(psock->sk)) {
+				/* Don't have the whole messages in the socket
+				 * buffer. Set psock->rx_need_bytes to wait for
+				 * the rest of the message. Also, set "early
+				 * eaten" since we've already buffered the skb
+				 * but don't consume yet per tcp_read_sock.
+				 */
+
+				psock->rx_need_bytes = rxm->full_len -
+						       rxm->accum_len;
+				rxm->accum_len += cand_len;
+				rxm->early_eaten = cand_len;
+				KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
+				desc->count = 0; /* Stop reading socket */
+				break;
+			}
 			rxm->accum_len += cand_len;
 			eaten += cand_len;
 			WARN_ON(eaten != orig_len);
@@ -582,6 +619,13 @@ static void psock_tcp_data_ready(struct sock *sk)
 	if (psock->ready_rx_msg)
 		goto out;
 
+	if (psock->rx_need_bytes) {
+		if (tcp_inq(sk) >= psock->rx_need_bytes)
+			psock->rx_need_bytes = 0;
+		else
+			goto out;
+	}
+
 	if (psock_tcp_read_sock(psock) == -ENOMEM)
 		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
 
-- 
cgit v1.2.3


From 29152a34f72cb4d7ab32885ad2f20a482c92a8f3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:11 -0800
Subject: kcm: Add receive message timeout

This patch adds a receive timeout for message assembly on the attached
TCP sockets. The timeout is set when a new message is started and the
whole message has not been received by TCP (not in the receive queue).
If the complete message is subsequently received, the timer is cancelled;
if the timer expires, the RX side is aborted.

The timeout value is taken from the socket timeout (SO_RCVTIMEO) that is
set on a TCP socket (i.e. set by setsockopt before attaching a TCP socket
to KCM).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  3 +++
 net/kcm/kcmproc.c |  6 ++++--
 net/kcm/kcmsock.c | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index d892956ff552..95c425ca97b6 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -29,6 +29,7 @@ struct kcm_psock_stats {
 	unsigned int rx_mem_fail;
 	unsigned int rx_need_more_hdr;
 	unsigned int rx_msg_too_big;
+	unsigned int rx_msg_timeouts;
 	unsigned int rx_bad_hdr_len;
 	unsigned long long reserved;
 	unsigned long long unreserved;
@@ -130,6 +131,7 @@ struct kcm_psock {
 	struct kcm_sock *rx_kcm;
 	unsigned long long saved_rx_bytes;
 	unsigned long long saved_rx_msgs;
+	struct timer_list rx_msg_timer;
 	unsigned int rx_need_bytes;
 
 	/* Transmit */
@@ -194,6 +196,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
 	SAVE_PSOCK_STATS(rx_mem_fail);
 	SAVE_PSOCK_STATS(rx_need_more_hdr);
 	SAVE_PSOCK_STATS(rx_msg_too_big);
+	SAVE_PSOCK_STATS(rx_msg_timeouts);
 	SAVE_PSOCK_STATS(rx_bad_hdr_len);
 	SAVE_PSOCK_STATS(tx_msgs);
 	SAVE_PSOCK_STATS(tx_bytes);
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 7638b3555b17..738008726cc6 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -331,7 +331,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   mux_stats.rx_ready_drops);
 
 	seq_printf(seq,
-		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
 		   "Psock",
 		   "RX-Msgs",
 		   "RX-Bytes",
@@ -344,10 +344,11 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   "RX-NeedMor",
 		   "RX-BadLen",
 		   "RX-TooBig",
+		   "RX-Timeout",
 		   "TX-Aborts");
 
 	seq_printf(seq,
-		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u\n",
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
 		   "",
 		   psock_stats.rx_msgs,
 		   psock_stats.rx_bytes,
@@ -360,6 +361,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   psock_stats.rx_need_more_hdr,
 		   psock_stats.rx_bad_hdr_len,
 		   psock_stats.rx_msg_too_big,
+		   psock_stats.rx_msg_timeouts,
 		   psock_stats.tx_aborts);
 
 	return 0;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 8bc38d3fff9a..40662d73204f 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -55,6 +55,8 @@ static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
 
 	/* Unrecoverable error in receive */
 
+	del_timer(&psock->rx_msg_timer);
+
 	if (psock->rx_stopped)
 		return;
 
@@ -351,6 +353,12 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
 	spin_unlock_bh(&mux->rx_lock);
 }
 
+static void kcm_start_rx_timer(struct kcm_psock *psock)
+{
+	if (psock->sk->sk_rcvtimeo)
+		mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
+}
+
 /* Macro to invoke filter function. */
 #define KCM_RUN_FILTER(prog, ctx) \
 	(*prog->bpf_func)(ctx, prog->insnsi)
@@ -500,6 +508,10 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 			if (!len) {
 				/* Need more header to determine length */
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					kcm_start_rx_timer(psock);
+				}
 				rxm->accum_len += cand_len;
 				eaten += cand_len;
 				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
@@ -540,6 +552,11 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				 * but don't consume yet per tcp_read_sock.
 				 */
 
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					kcm_start_rx_timer(psock);
+				}
+
 				psock->rx_need_bytes = rxm->full_len -
 						       rxm->accum_len;
 				rxm->accum_len += cand_len;
@@ -563,6 +580,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		eaten += (cand_len - extra);
 
 		/* Hurray, we have a new message! */
+		del_timer(&psock->rx_msg_timer);
 		psock->rx_skb_head = NULL;
 		KCM_STATS_INCR(psock->stats.rx_msgs);
 
@@ -1656,6 +1674,15 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 	spin_unlock_bh(&mux->rx_lock);
 }
 
+static void kcm_rx_msg_timeout(unsigned long arg)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)arg;
+
+	/* Message assembly timed out */
+	KCM_STATS_INCR(psock->stats.rx_msg_timeouts);
+	kcm_abort_rx_psock(psock, ETIMEDOUT, NULL);
+}
+
 static int kcm_attach(struct socket *sock, struct socket *csock,
 		      struct bpf_prog *prog)
 {
@@ -1685,6 +1712,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	psock->mux = mux;
 	psock->sk = csk;
 	psock->bpf_prog = prog;
+
+	setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout,
+		    (unsigned long)psock);
+
 	INIT_WORK(&psock->rx_work, psock_rx_work);
 	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
 
@@ -1796,6 +1827,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 
 	write_unlock_bh(&csk->sk_callback_lock);
 
+	del_timer_sync(&psock->rx_msg_timer);
 	cancel_work_sync(&psock->rx_work);
 	cancel_delayed_work_sync(&psock->rx_delayed_work);
 
-- 
cgit v1.2.3


From 87aca73737e379f079993802d2c43606f7c5d26c Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Thu, 21 Jan 2016 09:20:12 +0100
Subject: NFC: microread: Drop platform data header file

Originally I only wanted to drop the unneeded inclusion of
<linux/i2c.h>, but then noticed that struct
microread_nfc_platform_data isn't actually used, and
MICROREAD_DRIVER_NAME is redefined in the only file where it is used,
so we can get rid of the header file and dead code altogether.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Lauro Ramos Venancio <lauro.venancio@openbossa.org>
Cc: Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 MAINTAINERS                             |  1 -
 drivers/nfc/microread/i2c.c             |  8 --------
 include/linux/platform_data/microread.h | 35 ---------------------------------
 3 files changed, 44 deletions(-)
 delete mode 100644 include/linux/platform_data/microread.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 355e1c85bad6..5e4e50ff87bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7644,7 +7644,6 @@ F:	net/nfc/
 F:	include/net/nfc/
 F:	include/uapi/linux/nfc.h
 F:	drivers/nfc/
-F:	include/linux/platform_data/microread.h
 F:	include/linux/platform_data/nfcmrvl.h
 F:	include/linux/platform_data/nxp-nci.h
 F:	include/linux/platform_data/pn544.h
diff --git a/drivers/nfc/microread/i2c.c b/drivers/nfc/microread/i2c.c
index 918e8f2eac47..e0e8afd27849 100644
--- a/drivers/nfc/microread/i2c.c
+++ b/drivers/nfc/microread/i2c.c
@@ -246,18 +246,10 @@ static int microread_i2c_probe(struct i2c_client *client,
 			       const struct i2c_device_id *id)
 {
 	struct microread_i2c_phy *phy;
-	struct microread_nfc_platform_data *pdata =
-		dev_get_platdata(&client->dev);
 	int r;
 
 	dev_dbg(&client->dev, "client %p\n", client);
 
-	if (!pdata) {
-		nfc_err(&client->dev, "client %p: missing platform data\n",
-			client);
-		return -EINVAL;
-	}
-
 	phy = devm_kzalloc(&client->dev, sizeof(struct microread_i2c_phy),
 			   GFP_KERNEL);
 	if (!phy)
diff --git a/include/linux/platform_data/microread.h b/include/linux/platform_data/microread.h
deleted file mode 100644
index ca13992089b8..000000000000
--- a/include/linux/platform_data/microread.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Driver include for the Inside Secure microread NFC Chip.
- *
- * Copyright (C) 2011 Tieto Poland
- * Copyright (C) 2012 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _MICROREAD_H
-#define _MICROREAD_H
-
-#include <linux/i2c.h>
-
-#define MICROREAD_DRIVER_NAME	"microread"
-
-/* board config platform data for microread */
-struct microread_nfc_platform_data {
-	unsigned int rst_gpio;
-	unsigned int irq_gpio;
-	unsigned int ioh_gpio;
-};
-
-#endif /* _MICROREAD_H */
-- 
cgit v1.2.3


From 2793a23aacbd754dbbb5cb75093deb7e4103bace Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 9 Mar 2016 21:58:32 -0500
Subject: net: validate variable length ll headers

Netdevice parameter hard_header_len is variously interpreted both as
an upper and lower bound on link layer header length. The field is
used as upper bound when reserving room at allocation, as lower bound
when validating user input in PF_PACKET.

Clarify the definition to be maximum header length. For validation
of untrusted headers, add an optional validate member to header_ops.

Allow bypassing of validation by passing CAP_SYS_RAWIO, for instance
for deliberate testing of corrupt input. In this case, pad trailing
bytes, as some device drivers expect completely initialized headers.

See also http://comments.gmane.org/gmane.linux.network/401064

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index efe7cec111fa..fd30cb545c45 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -268,6 +268,7 @@ struct header_ops {
 	void	(*cache_update)(struct hh_cache *hh,
 				const struct net_device *dev,
 				const unsigned char *haddr);
+	bool	(*validate)(const char *ll_header, unsigned int len);
 };
 
 /* These flag bits are private to the generic network queueing
@@ -1459,8 +1460,7 @@ enum netdev_priv_flags {
  *	@dma:		DMA channel
  *	@mtu:		Interface MTU value
  *	@type:		Interface hardware type
- *	@hard_header_len: Hardware header length, which means that this is the
- *			  minimum size of a packet.
+ *	@hard_header_len: Maximum hardware header length.
  *
  *	@needed_headroom: Extra headroom the hardware may need, but not in all
  *			  cases can this be guaranteed
@@ -2687,6 +2687,24 @@ static inline int dev_parse_header(const struct sk_buff *skb,
 	return dev->header_ops->parse(skb, haddr);
 }
 
+/* ll_header must have at least hard_header_len allocated */
+static inline bool dev_validate_header(const struct net_device *dev,
+				       char *ll_header, int len)
+{
+	if (likely(len >= dev->hard_header_len))
+		return true;
+
+	if (capable(CAP_SYS_RAWIO)) {
+		memset(ll_header + len, 0, dev->hard_header_len - len);
+		return true;
+	}
+
+	if (dev->header_ops && dev->header_ops->validate)
+		return dev->header_ops->validate(ll_header, len);
+
+	return false;
+}
+
 typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len);
 int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
 static inline int unregister_gifconf(unsigned int family)
-- 
cgit v1.2.3


From f16089209e1029d45ae78dd238b6ab9b2c9a886c Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Fri, 4 Mar 2016 10:10:20 +0100
Subject: mac802154: use put and get unaligned functions

This patch removes the swap pointer and memmove functionality. Instead
we use the well known put/get unaligned access with specific byte order
handling.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Suggested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 2e3cdd2048d2..6cd7a70706a9 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -16,10 +16,10 @@
 #ifndef NET_MAC802154_H
 #define NET_MAC802154_H
 
+#include <asm/unaligned.h>
 #include <net/af_ieee802154.h>
 #include <linux/ieee802154.h>
 #include <linux/skbuff.h>
-#include <linux/unaligned/memmove.h>
 
 #include <net/cfg802154.h>
 
@@ -254,7 +254,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
 		return cpu_to_le16(0);
 	}
 
-	return (__force __le16)__get_unaligned_memmove16(skb_mac_header(skb));
+	return get_unaligned_le16(skb_mac_header(skb));
 }
 
 /**
@@ -264,7 +264,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb)
  */
 static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src)
 {
-	__put_unaligned_memmove64(swab64p(be64_src), le64_dst);
+	put_unaligned_le64(get_unaligned_be64(be64_src), le64_dst);
 }
 
 /**
@@ -274,7 +274,7 @@ static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src)
  */
 static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src)
 {
-	__put_unaligned_memmove64(swab64p(le64_src), be64_dst);
+	put_unaligned_be64(get_unaligned_le64(le64_src), be64_dst);
 }
 
 /**
@@ -284,7 +284,7 @@ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src)
  */
 static inline void ieee802154_le16_to_be16(void *be16_dst, const void *le16_src)
 {
-	__put_unaligned_memmove16(swab16p(le16_src), be16_dst);
+	put_unaligned_be16(get_unaligned_le16(le16_src), be16_dst);
 }
 
 /**
-- 
cgit v1.2.3


From 82a37adeedd38880940e2772ec1ae27a09353e5a Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Wed, 9 Mar 2016 17:30:34 +0200
Subject: Bluetooth: Add support for limited privacy mode

Introduce a limited privacy mode indicated by value 0x02 to the mgmt
Set Privacy command.

With value 0x02 the kernel will use privacy mode with a resolvable
private address. In case the controller is bondable and discoverable
the identity address will be used.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_conn.c    | 13 ++++++++++--
 net/bluetooth/hci_request.c | 51 +++++++++++++++++++++++++++++++++++++++------
 net/bluetooth/hci_request.h |  2 +-
 net/bluetooth/mgmt.c        | 20 ++++++++++++++++--
 5 files changed, 76 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 339ea57be423..5d38d980b89d 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -233,6 +233,7 @@ enum {
 	HCI_SC_ENABLED,
 	HCI_SC_ONLY,
 	HCI_PRIVACY,
+	HCI_LIMITED_PRIVACY,
 	HCI_RPA_EXPIRED,
 	HCI_RPA_RESOLVING,
 	HCI_HS_ENABLED,
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7264025dc781..bf9f8a801a2e 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -719,6 +719,13 @@ done:
 	hci_dev_unlock(hdev);
 }
 
+static bool conn_use_rpa(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
 static void hci_req_add_le_create_conn(struct hci_request *req,
 				       struct hci_conn *conn)
 {
@@ -729,7 +736,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
 	/* Update random address, but set require_privacy to false so
 	 * that we never connect with an non-resolvable address.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type))
+	if (hci_update_random_address(req, false, conn_use_rpa(conn),
+				      &own_addr_type))
 		return;
 
 	memset(&cp, 0, sizeof(cp));
@@ -774,7 +782,8 @@ static void hci_req_directed_advertising(struct hci_request *req,
 	/* Set require_privacy to false so that the remote device has a
 	 * chance of identifying us.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type) < 0)
+	if (hci_update_random_address(req, false, conn_use_rpa(conn),
+				      &own_addr_type) < 0)
 		return;
 
 	memset(&cp, 0, sizeof(cp));
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 77be344efd18..95a545ca9dbc 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -771,6 +771,11 @@ static u8 update_white_list(struct hci_request *req)
 	return 0x01;
 }
 
+static bool scan_use_rpa(struct hci_dev *hdev)
+{
+	return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
 void hci_req_add_le_passive_scan(struct hci_request *req)
 {
 	struct hci_cp_le_set_scan_param param_cp;
@@ -785,7 +790,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
 	 * advertising with our address will be correctly reported
 	 * by the controller.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type))
+	if (hci_update_random_address(req, false, scan_use_rpa(hdev),
+				      &own_addr_type))
 		return;
 
 	/* Adding or removing entries from the white list must
@@ -881,6 +887,29 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
 	return adv_instance->flags;
 }
 
+static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
+{
+	/* If privacy is not enabled don't use RPA */
+	if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+		return false;
+
+	/* If basic privacy mode is enabled use RPA */
+	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+		return true;
+
+	/* If limited privacy mode is enabled don't use RPA if we're
+	 * both discoverable and bondable.
+	 */
+	if ((flags & MGMT_ADV_FLAG_DISCOV) &&
+	    hci_dev_test_flag(hdev, HCI_BONDABLE))
+		return false;
+
+	/* We're neither bondable nor discoverable in the limited
+	 * privacy mode, therefore use RPA.
+	 */
+	return true;
+}
+
 void __hci_req_enable_advertising(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -914,7 +943,9 @@ void __hci_req_enable_advertising(struct hci_request *req)
 	 * advertising is used. In that case it is fine to use a
 	 * non-resolvable private address.
 	 */
-	if (hci_update_random_address(req, !connectable, &own_addr_type) < 0)
+	if (hci_update_random_address(req, !connectable,
+				      adv_use_rpa(hdev, flags),
+				      &own_addr_type) < 0)
 		return;
 
 	memset(&cp, 0, sizeof(cp));
@@ -1328,7 +1359,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 }
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      u8 *own_addr_type)
+			      bool use_rpa, u8 *own_addr_type)
 {
 	struct hci_dev *hdev = req->hdev;
 	int err;
@@ -1337,7 +1368,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 	 * current RPA has expired or there is something else than
 	 * the current RPA in use, then generate a new one.
 	 */
-	if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+	if (use_rpa) {
 		int to;
 
 		*own_addr_type = ADDR_LE_DEV_RANDOM;
@@ -1599,9 +1630,16 @@ static int discoverable_update(struct hci_request *req, unsigned long opt)
 	/* Advertising instances don't use the global discoverable setting, so
 	 * only update AD if advertising was enabled using Set Advertising.
 	 */
-	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+	if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
 		__hci_req_update_adv_data(req, 0x00);
 
+		/* Discoverable mode affects the local advertising
+		 * address in limited privacy mode.
+		 */
+		if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+			__hci_req_enable_advertising(req);
+	}
+
 	hci_dev_unlock(hdev);
 
 	return 0;
@@ -1944,7 +1982,8 @@ static int active_scan(struct hci_request *req, unsigned long opt)
 	 * address (when privacy feature has been enabled) or non-resolvable
 	 * private address.
 	 */
-	err = hci_update_random_address(req, true, &own_addr_type);
+	err = hci_update_random_address(req, true, scan_use_rpa(hdev),
+					&own_addr_type);
 	if (err < 0)
 		own_addr_type = ADDR_LE_DEV_PUBLIC;
 
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 64ff8c040d50..b2d044bdc732 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -89,7 +89,7 @@ static inline void hci_req_update_scan(struct hci_dev *hdev)
 void __hci_req_update_scan(struct hci_request *req);
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      u8 *own_addr_type);
+			      bool use_rpa, u8 *own_addr_type);
 
 int hci_abort_conn(struct hci_conn *conn, u8 reason);
 void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 5a5089cb6570..2ca355519d79 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1382,8 +1382,19 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
 	if (err < 0)
 		goto unlock;
 
-	if (changed)
+	if (changed) {
+		/* In limited privacy mode the change of bondable mode
+		 * may affect the local advertising address.
+		 */
+		if (hdev_is_powered(hdev) &&
+		    hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+		    hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+		    hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+			queue_work(hdev->req_workqueue,
+				   &hdev->discoverable_update);
+
 		err = new_settings(hdev, sk);
+	}
 
 unlock:
 	hci_dev_unlock(hdev);
@@ -4423,7 +4434,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
 				       MGMT_STATUS_NOT_SUPPORTED);
 
-	if (cp->privacy != 0x00 && cp->privacy != 0x01)
+	if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02)
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
 				       MGMT_STATUS_INVALID_PARAMS);
 
@@ -4442,10 +4453,15 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
 		changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
 		memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+		if (cp->privacy == 0x02)
+			hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
+		else
+			hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
 	} else {
 		changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
 		memset(hdev->irk, 0, sizeof(hdev->irk));
 		hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+		hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
 	}
 
 	err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev);
-- 
cgit v1.2.3


From f720d0caa0af2c33ad15310974c7320345ab4468 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 10 Mar 2016 19:31:12 +0100
Subject: kcm: mark helper functions inline

The stub helper functions for the newly added kcm_proc_init/exit interfaces
are defined as 'static' in a header file, which leads to build warnings for
each file that includes them without calling them:

include/net/kcm.h:183:12: error: 'kcm_proc_init' defined but not used [-Werror=unused-function]
include/net/kcm.h:184:13: error: 'kcm_proc_exit' defined but not used [-Werror=unused-function]

This marks the two functions as 'static inline' instead, which avoids the
warnings and is obviously what was meant here.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: cd6e111bf5be ("kcm: Add statistics and proc interfaces")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 95c425ca97b6..2840b5825dcc 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -180,8 +180,8 @@ struct kcm_mux {
 int kcm_proc_init(void);
 void kcm_proc_exit(void);
 #else
-static int kcm_proc_init(void) { return 0; }
-static void kcm_proc_exit(void) { }
+static inline int kcm_proc_init(void) { return 0; }
+static inline void kcm_proc_exit(void) { }
 #endif
 
 static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
-- 
cgit v1.2.3


From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:29 +0200
Subject: net/flower: Introduce hardware offload support

This patch is based on a patch made by John Fastabend.
It adds support for offloading cls_flower.
When NETIF_F_HW_TC is on:
  flags = 0       => Rule will be processed twice - by hardware, and if
                     still relevant, by software.
  flags = SKIP_HW => Rule will be processed by software only

If the hardware fails or is not capable of applying the rule, the
operation will NOT fail. The filter will be processed by SW only.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Suggested-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 ++
 include/net/pkt_cls.h        | 14 ++++++++++
 include/uapi/linux/pkt_cls.h |  2 ++
 net/sched/cls_flower.c       | 64 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fd30cb545c45..41df0b450757 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -786,6 +786,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 enum {
 	TC_SETUP_MQPRIO,
 	TC_SETUP_CLSU32,
+	TC_SETUP_CLSFLOWER,
 };
 
 struct tc_cls_u32_offload;
@@ -795,6 +796,7 @@ struct tc_to_netdev {
 	union {
 		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
+		struct tc_cls_flower_offload *cls_flower;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bea14eee373e..5b4e8f08b8f0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 	return true;
 }
 
+enum tc_fl_command {
+	TC_CLSFLOWER_REPLACE,
+	TC_CLSFLOWER_DESTROY,
+};
+
+struct tc_cls_flower_offload {
+	enum tc_fl_command command;
+	u64 cookie;
+	struct flow_dissector *dissector;
+	struct fl_flow_key *mask;
+	struct fl_flow_key *key;
+	struct tcf_exts *exts;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9874f5680926..c43c5f78b9c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -417,6 +417,8 @@ enum {
 	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 95b021243233..25d87666bf1e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
+static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, 0))
+		return;
+
+	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.cookie = cookie;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+				 struct flow_dissector *dissector,
+				 struct fl_flow_key *mask,
+				 struct fl_flow_key *key,
+				 struct tcf_exts *actions,
+				 u64 cookie, u32 flags)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, flags))
+		return;
+
+	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.cookie = cookie;
+	offload.dissector = dissector;
+	offload.mask = mask;
+	offload.key = key;
+	offload.exts = actions;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
 static bool fl_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
+		fl_hw_destroy_filter(tp, (u64)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_filter *fnew;
 	struct nlattr *tb[TCA_FLOWER_MAX + 1];
 	struct fl_flow_mask mask = {};
+	u32 flags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	}
 	fnew->handle = handle;
 
+	if (tb[TCA_FLOWER_FLAGS])
+		flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
 	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
@@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 				     head->ht_params);
 	if (err)
 		goto errout;
-	if (fold)
+
+	fl_hw_replace_filter(tp,
+			     &head->dissector,
+			     &mask.key,
+			     &fnew->key,
+			     &fnew->exts,
+			     (u64)fnew,
+			     flags);
+
+	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
+		fl_hw_destroy_filter(tp, (u64)fold);
+	}
 
 	*arg = (unsigned long) fnew;
 
@@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
+	fl_hw_destroy_filter(tp, (u64)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3


From 8de2d793daf784f8f109565bcc023a6d198bad85 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:30 +0200
Subject: net/flow_dissector: Make dissector_uses_key() and
 skb_flow_dissector_target() public

Will be used in a following patch to query whether a key is being used,
and what its value is in the target object.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 13 +++++++++++++
 net/core/flow_dissector.c    | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 8c8548cf5888..d3d60dccd19f 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -184,4 +184,17 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys)
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
 
+static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+				      enum flow_dissector_key_id key_id)
+{
+	return flow_dissector->used_keys & (1 << key_id);
+}
+
+static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
+					      enum flow_dissector_key_id key_id,
+					      void *target_container)
+{
+	return ((char *)target_container) + flow_dissector->offset[key_id];
+}
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 7c7b8739b8b8..a669dea146c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
-			       enum flow_dissector_key_id key_id)
-{
-	return flow_dissector->used_keys & (1 << key_id);
-}
-
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
 {
 	flow_dissector->used_keys |= (1 << key_id);
 }
 
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
-				       enum flow_dissector_key_id key_id,
-				       void *target_container)
-{
-	return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
 void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count)
-- 
cgit v1.2.3


From 00175aec941e9c306d8a5ce930b2d91f7c04468c Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:31 +0200
Subject: net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef

Introduce the macros tc_no_actions and tc_for_each_action to make code
clearer.
Extracted struct tc_action out of the ifdef to make calls to
is_tcf_gact_shot() and similar functions valid, even when it is a nop.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h        | 21 ++++++++++++++++-----
 include/net/tc_act/tc_gact.h |  4 ++--
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 342be6c5ab5c..2a19fe111c78 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -78,11 +78,6 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
 		tm->lastuse = now;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-
-#define ACT_P_CREATED 1
-#define ACT_P_DELETED 1
-
 struct tc_action {
 	void			*priv;
 	const struct tc_action_ops	*ops;
@@ -92,6 +87,11 @@ struct tc_action {
 	struct tcf_hashinfo	*hinfo;
 };
 
+#ifdef CONFIG_NET_CLS_ACT
+
+#define ACT_P_CREATED 1
+#define ACT_P_DELETED 1
+
 struct tc_action_ops {
 	struct list_head head;
 	char    kind[IFNAMSIZ];
@@ -171,5 +171,16 @@ int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
+
+#define tc_no_actions(_exts) \
+	(list_empty(&(_exts)->actions))
+
+#define tc_for_each_action(_a, _exts) \
+	list_for_each_entry(a, &(_exts)->actions, list)
+#else /* CONFIG_NET_CLS_ACT */
+
+#define tc_no_actions(_exts) true
+#define tc_for_each_action(_a, _exts) while (0)
+
 #endif /* CONFIG_NET_CLS_ACT */
 #endif
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 04a31830711b..93c520b83d10 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -16,9 +16,9 @@ struct tcf_gact {
 #define to_gact(a) \
 	container_of(a->priv, struct tcf_gact, common)
 
-#ifdef CONFIG_NET_CLS_ACT
 static inline bool is_tcf_gact_shot(const struct tc_action *a)
 {
+#ifdef CONFIG_NET_CLS_ACT
 	struct tcf_gact *gact;
 
 	if (a->ops && a->ops->type != TCA_ACT_GACT)
@@ -28,7 +28,7 @@ static inline bool is_tcf_gact_shot(const struct tc_action *a)
 	if (gact->tcf_action == TC_ACT_SHOT)
 		return true;
 
+#endif
 	return false;
 }
-#endif
 #endif /* __NET_TC_GACT_H */
-- 
cgit v1.2.3


From 519afb1813eab066a0c9995a08861fd0af75d5ae Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:32 +0200
Subject: net/act_skbedit: Utility functions for mark action

Enable device drivers to query whether an action is purely a mark
action and, if so, what value to use for marking.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbedit.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 0df9a0db4a8e..b496d5ad7d42 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -20,6 +20,7 @@
 #define __NET_TC_SKBEDIT_H
 
 #include <net/act_api.h>
+#include <linux/tc_act/tc_skbedit.h>
 
 struct tcf_skbedit {
 	struct tcf_common	common;
@@ -32,4 +33,19 @@ struct tcf_skbedit {
 #define to_skbedit(a) \
 	container_of(a->priv, struct tcf_skbedit, common)
 
+/* Return true iff action is mark */
+static inline bool is_tcf_skbedit_mark(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->type == TCA_ACT_SKBEDIT)
+		return to_skbedit(a)->flags == SKBEDIT_F_MARK;
+#endif
+	return false;
+}
+
+static inline u32 tcf_skbedit_mark(const struct tc_action *a)
+{
+	return to_skbedit(a)->mark;
+}
+
 #endif /* __NET_TC_SKBEDIT_H */
-- 
cgit v1.2.3


From 8208d21bf309551686b7a76d19059ae182a956d0 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Fri, 11 Mar 2016 11:08:45 +0200
Subject: net/flower: Fix pointer cast

Cast pointer to unsigned long instead of u64, to fix compilation warning
on 32 bit arch, spotted by 0day build.

Fixes: 5b33f48 ("net/flower: Introduce hardware offload support")
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h  |  2 +-
 net/sched/cls_flower.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 5b4e8f08b8f0..caa5e18636df 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -416,7 +416,7 @@ enum tc_fl_command {
 
 struct tc_cls_flower_offload {
 	enum tc_fl_command command;
-	u64 cookie;
+	unsigned long cookie;
 	struct flow_dissector *dissector;
 	struct fl_flow_key *mask;
 	struct fl_flow_key *key;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 25d87666bf1e..2181ffc76638 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,7 +165,7 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -188,7 +188,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
 				 struct fl_flow_key *mask,
 				 struct fl_flow_key *key,
 				 struct tcf_exts *actions,
-				 u64 cookie, u32 flags)
+				 unsigned long cookie, u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -219,7 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
-		fl_hw_destroy_filter(tp, (u64)f);
+		fl_hw_destroy_filter(tp, (unsigned long)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -554,13 +554,13 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			     &mask.key,
 			     &fnew->key,
 			     &fnew->exts,
-			     (u64)fnew,
+			     (unsigned long)fnew,
 			     flags);
 
 	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
-		fl_hw_destroy_filter(tp, (u64)fold);
+		fl_hw_destroy_filter(tp, (unsigned long)fold);
 	}
 
 	*arg = (unsigned long) fnew;
@@ -588,7 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
-	fl_hw_destroy_filter(tp, (u64)f);
+	fl_hw_destroy_filter(tp, (unsigned long)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3


From 4c656c13b254d598e83e586b7b4d36a2043dad85 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemming@brocade.com>
Date: Tue, 8 Mar 2016 12:59:35 -0800
Subject: bridge: allow zero ageing time

This fixes a regression in the bridge ageing time caused by:
commit c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev")

There are users of Linux bridge which use the feature that if ageing time
is set to 0 it causes entries to never expire. See:
  https://www.linuxfoundation.org/collaborate/workgroups/networking/bridge

For a pure software bridge, it is unnecessary for the code to have
arbitrary restrictions on what values are allowable.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  4 ----
 net/bridge/br_stp.c       | 11 ++++++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index a338a688ee4a..dcb89e3515db 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -46,10 +46,6 @@ struct br_ip_list {
 #define BR_LEARNING_SYNC	BIT(9)
 #define BR_PROXYARP_WIFI	BIT(10)
 
-/* values as per ieee8021QBridgeFdbAgingTime */
-#define BR_MIN_AGEING_TIME	(10 * HZ)
-#define BR_MAX_AGEING_TIME	(1000000 * HZ)
-
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index c22816a0b1b1..e23449094188 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -562,6 +562,14 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
 
 }
 
+/* Set time interval that dynamic forwarding entries live
+ * For pure software bridge, allow values outside the 802.1
+ * standard specification for special cases:
+ *  0 - entry never ages (all permanent)
+ *  1 - entry disappears (no persistence)
+ *
+ * Offloaded switch entries may be more restrictive
+ */
 int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
 {
 	struct switchdev_attr attr = {
@@ -573,9 +581,6 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
 	unsigned long t = clock_t_to_jiffies(ageing_time);
 	int err;
 
-	if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
-		return -ERANGE;
-
 	err = switchdev_port_attr_set(br->dev, &attr);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 134611446dc657e1bbc73ca0e4e6b599df687db0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:02 +0100
Subject: ip_tunnel: add support for setting flow label via collect metadata

This patch extends udp_tunnel6_xmit_skb() to pass in the IPv6 flow label
from call sites. Currently, there's no such option and it's always set to
zero when writing ip6_flow_hdr(). Add a label member to ip_tunnel_key, so
that flow-based tunnels via collect metadata frontends can make use of it.
vxlan and geneve will be converted to add flow label support separately.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c       | 2 +-
 drivers/net/vxlan.c        | 2 +-
 include/net/dst_metadata.h | 5 ++++-
 include/net/ip_tunnels.h   | 4 +++-
 include/net/udp_tunnel.h   | 4 ++--
 net/ipv6/ip6_udp_tunnel.c  | 6 +++---
 net/tipc/udp_media.c       | 2 +-
 7 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6a0cbbe03e5d..89ccff79d76c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1054,7 +1054,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
 	}
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl,
+			     &fl6.saddr, &fl6.daddr, prio, ttl, 0,
 			     sport, geneve->dst_port,
 			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
 	return NETDEV_TX_OK;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2399099e68cf..8bdcd5ea8424 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2066,7 +2066,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
-				     &saddr, &dst->sin6.sin6_addr, tos, ttl,
+				     &saddr, &dst->sin6.sin6_addr, tos, ttl, 0,
 				     src_port, dst_port, !udp_sum);
 #endif
 	}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 84b833af6882..5db9f5910428 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -126,7 +126,7 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 
 	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
 			   iph->saddr, iph->daddr, iph->tos, iph->ttl,
-			   0, 0, tunnel_id, flags);
+			   0, 0, 0, tunnel_id, flags);
 	return tun_dst;
 }
 
@@ -152,8 +152,11 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 
 	info->key.u.ipv6.src = ip6h->saddr;
 	info->key.u.ipv6.dst = ip6h->daddr;
+
 	info->key.tos = ipv6_get_dsfield(ip6h);
 	info->key.ttl = ip6h->hop_limit;
+	info->key.label = ip6_flowlabel(ip6h);
+
 	return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0acd80fadb32..5dc2e454f866 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -48,6 +48,7 @@ struct ip_tunnel_key {
 	__be16			tun_flags;
 	u8			tos;		/* TOS for IPv4, TC for IPv6 */
 	u8			ttl;		/* TTL for IPv4, HL for IPv6 */
+	__be32			label;		/* Flow Label for IPv6 */
 	__be16			tp_src;
 	__be16			tp_dst;
 };
@@ -181,7 +182,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 
 static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 				      __be32 saddr, __be32 daddr,
-				      u8 tos, u8 ttl,
+				      u8 tos, u8 ttl, __be32 label,
 				      __be16 tp_src, __be16 tp_dst,
 				      __be64 tun_id, __be16 tun_flags)
 {
@@ -192,6 +193,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 	       0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
 	key->tos = tos;
 	key->ttl = ttl;
+	key->label = label;
 	key->tun_flags = tun_flags;
 
 	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 97f5adb121a6..b83114077cee 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -88,8 +88,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
 			 struct net_device *dev, struct in6_addr *saddr,
 			 struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be16 src_port,
-			 __be16 dst_port, bool nocheck);
+			 __u8 prio, __u8 ttl, __be32 label,
+			 __be16 src_port, __be16 dst_port, bool nocheck);
 #endif
 
 void udp_tunnel_sock_release(struct socket *sock);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 14dacf1df529..a7520528ecd2 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
 			 struct net_device *dev, struct in6_addr *saddr,
 			 struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be16 src_port,
-			 __be16 dst_port, bool nocheck)
+			 __u8 prio, __u8 ttl, __be32 label,
+			 __be16 src_port, __be16 dst_port, bool nocheck)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
@@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	__skb_push(skb, sizeof(*ip6h));
 	skb_reset_network_header(skb);
 	ip6h		  = ipv6_hdr(skb);
-	ip6_flow_hdr(ip6h, prio, htonl(0));
+	ip6_flow_hdr(ip6h, prio, label);
 	ip6h->payload_len = htons(skb->len);
 	ip6h->nexthdr     = IPPROTO_UDP;
 	ip6h->hop_limit   = ttl;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 49b3c2ede7ab..c94f9a15e2cd 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -196,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 		ttl = ip6_dst_hoplimit(ndst);
 		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
 					   ndst->dev, &src->ipv6,
-					   &dst->ipv6, 0, ttl, src->udp_port,
+					   &dst->ipv6, 0, ttl, 0, src->udp_port,
 					   dst->udp_port, false);
 #endif
 	}
-- 
cgit v1.2.3


From e7f70af111f086a20800ad2e17f544b2e3e0f375 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:03 +0100
Subject: vxlan: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for vxlan per
device and through collect metadata (ip_tunnel_key) frontends. The
vxlan dst cache does not need any special consideration here: in the
cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 26 +++++++++++++++++++++-----
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  1 +
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8bdcd5ea8424..8eda76f9e474 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1790,6 +1790,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif, u8 tos,
+					  __be32 label,
 					  const struct in6_addr *daddr,
 					  struct in6_addr *saddr,
 					  struct dst_cache *dst_cache,
@@ -1813,6 +1814,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 	fl6.flowi6_tos = RT_TOS(tos);
 	fl6.daddr = *daddr;
 	fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
+	fl6.flowlabel = label;
 	fl6.flowi6_mark = skb->mark;
 	fl6.flowi6_proto = IPPROTO_UDP;
 
@@ -1888,7 +1890,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
-	__be32 vni;
+	__be32 vni, label;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
@@ -1939,12 +1941,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	if (tos == 1)
 		tos = ip_tunnel_get_dsfield(old_iph, skb);
 
+	label = vxlan->cfg.label;
 	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
 				     vxlan->cfg.port_max, true);
 
 	if (info) {
 		ttl = info->key.ttl;
 		tos = info->key.tos;
+		label = info->key.label;
 		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 
 		if (info->options_len)
@@ -2020,7 +2024,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0, tos,
-					&dst->sin6.sin6_addr, &saddr,
+					label, &dst->sin6.sin6_addr, &saddr,
 					dst_cache, info);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
@@ -2066,8 +2070,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
-				     &saddr, &dst->sin6.sin6_addr, tos, ttl, 0,
-				     src_port, dst_port, !udp_sum);
+				     &saddr, &dst->sin6.sin6_addr, tos, ttl,
+				     label, src_port, dst_port, !udp_sum);
 #endif
 	}
 
@@ -2390,7 +2394,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		if (!vxlan->vn6_sock)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos,
-					&info->key.u.ipv6.dst,
+					info->key.label, &info->key.u.ipv6.dst,
 					&info->key.u.ipv6.src, NULL, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
@@ -2505,6 +2509,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
@@ -2739,6 +2744,11 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 		vxlan->flags |= VXLAN_F_IPV6;
 	}
 
+	if (conf->label && !use_ipv6) {
+		pr_info("label only supported in use with IPv6\n");
+		return -EINVAL;
+	}
+
 	if (conf->remote_ifindex) {
 		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
 		dst->remote_ifindex = conf->remote_ifindex;
@@ -2887,6 +2897,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_TTL])
 		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
+	if (data[IFLA_VXLAN_LABEL])
+		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
+			     IPV6_FLOWLABEL_MASK;
+
 	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
 		conf.flags |= VXLAN_F_LEARN;
 
@@ -2990,6 +3004,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -3053,6 +3068,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6eda4ed4d78b..a763c96ecde4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -162,6 +162,7 @@ struct vxlan_config {
 	u16			port_max;
 	u8			tos;
 	u8			ttl;
+	__be32			label;
 	u32			flags;
 	unsigned long		age_interval;
 	unsigned int		addrmax;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d452cea59020..6bebc975031d 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -456,6 +456,7 @@ enum {
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
 	IFLA_VXLAN_COLLECT_METADATA,
+	IFLA_VXLAN_LABEL,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 8eb3b99554b82da968d1fbc00df9f3156c5e2d63 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:04 +0100
Subject: geneve: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for geneve per
device and through collect metadata (ip_tunnel_key) frontends. Here too,
the geneve dst cache does not need any special consideration: in the
cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c         | 35 +++++++++++++++++++++++++++--------
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 89ccff79d76c..33185b9a435e 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -68,6 +68,7 @@ struct geneve_dev {
 	u8                 tos;		/* TOS override */
 	union geneve_addr  remote;	/* IP address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
+	__be32		   label;	/* IPv6 flowlabel override */
 	__be16		   dst_port;
 	bool		   collect_md;
 	struct gro_cells   gro_cells;
@@ -846,6 +847,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		fl6->daddr = info->key.u.ipv6.dst;
 		fl6->saddr = info->key.u.ipv6.src;
 		fl6->flowi6_tos = RT_TOS(info->key.tos);
+		fl6->flowlabel = info->key.label;
 		dst_cache = &info->dst_cache;
 	} else {
 		prio = geneve->tos;
@@ -857,6 +859,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		}
 
 		fl6->flowi6_tos = RT_TOS(prio);
+		fl6->flowlabel = geneve->label;
 		fl6->daddr = geneve->remote.sin6.sin6_addr;
 		dst_cache = &geneve->dst_cache;
 	}
@@ -998,6 +1001,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct flowi6 fl6;
 	__u8 prio, ttl;
 	__be16 sport;
+	__be32 label;
 	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 	u32 flags = geneve->flags;
 
@@ -1041,6 +1045,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 
 		prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
 		ttl = key->ttl;
+		label = info->key.label;
 	} else {
 		err = geneve6_build_skb(dst, skb, 0, geneve->vni,
 					0, NULL, flags, xnet);
@@ -1052,9 +1057,11 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
 			ttl = 1;
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
+		label = geneve->label;
 	}
+
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl, 0,
+			     &fl6.saddr, &fl6.daddr, prio, ttl, label,
 			     sport, geneve->dst_port,
 			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
 	return NETDEV_TX_OK;
@@ -1238,6 +1245,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_REMOTE6]		= { .len = sizeof(struct in6_addr) },
 	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GENEVE_LABEL]		= { .type = NLA_U32 },
 	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
 	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
@@ -1295,8 +1303,8 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
 
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    union geneve_addr *remote,
-			    __u32 vni, __u8 ttl, __u8 tos, __be16 dst_port,
-			    bool metadata, u32 flags)
+			    __u32 vni, __u8 ttl, __u8 tos, __be32 label,
+			    __be16 dst_port, bool metadata, u32 flags)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1306,7 +1314,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	if (!remote)
 		return -EINVAL;
 	if (metadata &&
-	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl))
+	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label))
 		return -EINVAL;
 
 	geneve->net = net;
@@ -1321,10 +1329,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	    (remote->sa.sa_family == AF_INET6 &&
 	     ipv6_addr_is_multicast(&remote->sin6.sin6_addr)))
 		return -EINVAL;
+	if (label && remote->sa.sa_family != AF_INET6)
+		return -EINVAL;
+
 	geneve->remote = *remote;
 
 	geneve->ttl = ttl;
 	geneve->tos = tos;
+	geneve->label = label;
 	geneve->dst_port = dst_port;
 	geneve->collect_md = metadata;
 	geneve->flags = flags;
@@ -1367,6 +1379,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	__u8 ttl = 0, tos = 0;
 	bool metadata = false;
 	union geneve_addr remote = geneve_remote_unspec;
+	__be32 label = 0;
 	__u32 vni = 0;
 	u32 flags = 0;
 
@@ -1403,6 +1416,10 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	if (data[IFLA_GENEVE_TOS])
 		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_LABEL])
+		label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
+			IPV6_FLOWLABEL_MASK;
+
 	if (data[IFLA_GENEVE_PORT])
 		dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
 
@@ -1421,8 +1438,8 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
 		flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
 
-	return geneve_configure(net, dev, &remote, vni, ttl, tos, dst_port,
-				metadata, flags);
+	return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
+				dst_port, metadata, flags);
 }
 
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
@@ -1439,6 +1456,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
@@ -1469,7 +1487,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	}
 
 	if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
-	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
+	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) ||
+	    nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label))
 		goto nla_put_failure;
 
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
@@ -1521,7 +1540,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	err = geneve_configure(net, dev, &geneve_remote_unspec,
-			       0, 0, 0, htons(dst_port), true,
+			       0, 0, 0, 0, htons(dst_port), true,
 			       GENEVE_F_UDP_ZERO_CSUM6_RX);
 	if (err)
 		goto err;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6bebc975031d..249eef9a21bd 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum {
 	IFLA_GENEVE_UDP_CSUM,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
+	IFLA_GENEVE_LABEL,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:05 +0100
Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key

This patch extends bpf_tunnel_key with a tunnel_label member, that maps
to ip_tunnel_key's label so underlying backends like vxlan and geneve
can propagate the label to udp_tunnel6_xmit_skb(), where it's being set
in the IPv6 header. It allows for having 20 more bits to encode/decode
flow related meta information programmatically. Tested with vxlan and
geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0e30b19012a5..924f537183fd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
+	__u32 tunnel_label;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index a66dc03c261f..6fc3893a6170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1770,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EPROTO;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
+			goto set_compat;
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
 			 */
 			if (ip_tunnel_info_af(info) != AF_INET)
 				return -EINVAL;
+set_compat:
 			to = (struct bpf_tunnel_key *)compat;
 			break;
 		default:
@@ -1787,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
 
-	if (flags & BPF_F_TUNINFO_IPV6)
+	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
 		       sizeof(to->remote_ipv6));
-	else
+		to->tunnel_label = be32_to_cpu(info->key.label);
+	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
 		memcpy((void *)(long) r2, to, size);
@@ -1850,6 +1855,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
@@ -1862,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			return -EINVAL;
 		}
 	}
+	if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+		return -EINVAL;
 
 	skb_dst_drop(skb);
 	dst_hold((struct dst_entry *) md);
@@ -1882,6 +1890,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		info->mode |= IP_TUNNEL_INFO_IPV6;
 		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
 		       sizeof(from->remote_ipv6));
+		info->key.label = cpu_to_be32(from->tunnel_label) &
+				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 		if (flags & BPF_F_ZERO_CSUM_TX)
-- 
cgit v1.2.3


From 338039635d01524090e7bd706a3e555e20d5b337 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 9 Mar 2016 09:25:26 -0800
Subject: csum: Update csum_block_add to use rotate instead of byteswap

The code for csum_block_add was doing a funky byteswap to swap the even and
odd bytes of the checksum if the offset was odd.  Instead of doing this we
can save ourselves some trouble and just rotate by 8 as this should have the
same effect in terms of the final checksum value and only requires one
instruction.

In addition we can update csum_block_sub to just use csum_block_add with an
inverse value for csum2.  This way we follow the same code path as
csum_block_add without having to duplicate it.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index abffc64e7300..5c30891e84e5 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -88,8 +88,11 @@ static inline __wsum
 csum_block_add(__wsum csum, __wsum csum2, int offset)
 {
 	u32 sum = (__force u32)csum2;
-	if (offset&1)
-		sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF);
+
+	/* rotate sum to align it with a 16b boundary */
+	if (offset & 1)
+		sum = ror32(sum, 8);
+
 	return csum_add(csum, (__force __wsum)sum);
 }
 
@@ -102,10 +105,7 @@ csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len)
 static inline __wsum
 csum_block_sub(__wsum csum, __wsum csum2, int offset)
 {
-	u32 sum = (__force u32)csum2;
-	if (offset&1)
-		sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF);
-	return csum_sub(csum, (__force __wsum)sum);
+	return csum_block_add(csum, ~csum2, offset);
 }
 
 static inline __wsum csum_unfold(__sum16 n)
-- 
cgit v1.2.3


From 136ba622de49a6bf1f6e5eab3391ed5d5dbe30e3 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 10 Mar 2016 08:55:50 +0000
Subject: netconf: add macro to represent all attributes

This patch adds macro NETCONFA_ALL to represent all types of netconf
attributes for IPv4 and IPv6.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 40 +++++++++++++++++++++++-----------------
 net/ipv6/addrconf.c          | 36 +++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index 23cbd34e4ac7..45dfad509c4d 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
+#define NETCONFA_ALL	-1
 
 #define NETCONFA_IFINDEX_ALL		-1
 #define NETCONFA_IFINDEX_DEFAULT	-2
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2ccba45..65e76a48382c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1753,17 +1753,20 @@ static int inet_netconf_msgsize_devconf(int type)
 {
 	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
 		   + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_RP_FILTER)
+	if (all || type == NETCONFA_RP_FILTER)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -1776,36 +1779,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING,
 			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+	if ((all || type == NETCONFA_RP_FILTER) &&
 	    nla_put_s32(skb, NETCONFA_RP_FILTER,
 			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
 		goto nla_put_failure;
@@ -1893,14 +1899,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
 					NETLINK_CB(in_skb).portid,
 					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					-1);
+					NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1944,7 +1950,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      RTM_NEWNETCONF,
 						      NLM_F_MULTI,
-						      -1) < 0) {
+						      NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1960,7 +1966,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -1971,7 +1977,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8c0dab2de5c9..27aed1afcf81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -473,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type)
 {
 	int size =  NLMSG_ALIGN(sizeof(struct netconfmsg))
 		    + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
 #ifdef CONFIG_IPV6_MROUTE
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
 #endif
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -497,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET6;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_IPV6_MROUTE
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			devconf->mc_forwarding) < 0)
 		goto nla_put_failure;
 #endif
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			devconf->ignore_routes_with_linkdown) < 0)
 		goto nla_put_failure;
@@ -609,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
 					 NETLINK_CB(in_skb).portid,
 					 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					 -1);
+					 NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -660,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
 						       cb->nlh->nlmsg_seq,
 						       RTM_NEWNETCONF,
 						       NLM_F_MULTI,
-						       -1) < 0) {
+						       NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -676,7 +682,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -687,7 +693,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
-- 
cgit v1.2.3


From 6b8abef5f833b03be1b5af491193477ad609ad35 Mon Sep 17 00:00:00 2001
From: Paul Durrant <Paul.Durrant@citrix.com>
Date: Thu, 10 Mar 2016 12:30:26 +0000
Subject: xen-netback: re-import canonical netif header

The canonical netif header (in the Xen source repo) and the Linux variant
have diverged significantly. Recently much documentation has been added to
the canonical header which is highly useful for developers making
modifications to either xen-netfront or xen-netback. This patch therefore
re-imports the canonical header in its entirety.

To maintain compatibility and some style consistency with the old Linux
variant, the header was stripped of its emacs boilerplate, and
post-processed and copied into place with the following commands:

ed -s netif.h << EOF
H
,s/NETTXF_/XEN_NETTXF_/g
,s/NETRXF_/XEN_NETRXF_/g
,s/NETIF_/XEN_NETIF_/g
,s/XEN_XEN_/XEN_/g
,s/netif/xen_netif/g
,s/xen_xen_/xen_/g
,s/^typedef.*$//g
,s/^    /${TAB}/g
w
$
w
EOF

indent --line-length 80 --linux-style netif.h \
-o include/xen/interface/io/netif.h

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Acked-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/xen/interface/io/netif.h | 861 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 766 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
index 252ffd4801ef..4f20dbc42910 100644
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -1,16 +1,34 @@
 /******************************************************************************
- * netif.h
+ * xen_netif.h
  *
  * Unified network-device I/O interface for Xen guest OSes.
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2003-2004, Keir Fraser
  */
 
-#ifndef __XEN_PUBLIC_IO_NETIF_H__
-#define __XEN_PUBLIC_IO_NETIF_H__
+#ifndef __XEN_PUBLIC_IO_XEN_NETIF_H__
+#define __XEN_PUBLIC_IO_XEN_NETIF_H__
 
-#include <xen/interface/io/ring.h>
-#include <xen/interface/grant_table.h>
+#include "ring.h"
+#include "../grant_table.h"
 
 /*
  * Older implementation of Xen network frontend / backend has an
@@ -38,10 +56,10 @@
  * that it cannot safely queue packets (as it may not be kicked to send them).
  */
 
- /*
+/*
  * "feature-split-event-channels" is introduced to separate guest TX
- * and RX notificaion. Backend either doesn't support this feature or
- * advertise it via xenstore as 0 (disabled) or 1 (enabled).
+ * and RX notification. Backend either doesn't support this feature or
+ * advertises it via xenstore as 0 (disabled) or 1 (enabled).
  *
  * To make use of this feature, frontend should allocate two event
  * channels for TX and RX, advertise them to backend as
@@ -118,151 +136,804 @@
  */
 
 /*
- * This is the 'wire' format for packets:
- *  Request 1: xen_netif_tx_request  -- XEN_NETTXF_* (any flags)
- * [Request 2: xen_netif_extra_info]    (only if request 1 has XEN_NETTXF_extra_info)
- * [Request 3: xen_netif_extra_info]    (only if request 2 has XEN_NETIF_EXTRA_MORE)
- *  Request 4: xen_netif_tx_request  -- XEN_NETTXF_more_data
- *  Request 5: xen_netif_tx_request  -- XEN_NETTXF_more_data
+ * "feature-multicast-control" and "feature-dynamic-multicast-control"
+ * advertise the capability to filter ethernet multicast packets in the
+ * backend. If the frontend wishes to take advantage of this feature then
+ * it may set "request-multicast-control". If the backend only advertises
+ * "feature-multicast-control" then "request-multicast-control" must be set
+ * before the frontend moves into the connected state. The backend will
+ * sample the value on this state transition and any subsequent change in
+ * value will have no effect. However, if the backend also advertises
+ * "feature-dynamic-multicast-control" then "request-multicast-control"
+ * may be set by the frontend at any time. In this case, the backend will
+ * watch the value and re-sample on watch events.
+ *
+ * If the sampled value of "request-multicast-control" is set then the
+ * backend transmit side should no longer flood multicast packets to the
+ * frontend, it should instead drop any multicast packet that does not
+ * match in a filter list.
+ * The list is amended by the frontend by sending dummy transmit requests
+ * containing XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} extra-info fragments as
+ * specified below.
+ * Note that the filter list may be amended even if the sampled value of
+ * "request-multicast-control" is not set, however the filter should only
+ * be applied if it is set.
+ */
+
+/*
+ * Control ring
+ * ============
+ *
+ * Some features, such as hashing (detailed below), require a
+ * significant amount of out-of-band data to be passed from frontend to
+ * backend. Use of xenstore is not suitable for large quantities of data
+ * because of quota limitations and so a dedicated 'control ring' is used.
+ * The ability of the backend to use a control ring is advertised by
+ * setting:
+ *
+ * /local/domain/X/backend/<domid>/<vif>/feature-ctrl-ring = "1"
+ *
+ * The frontend provides a control ring to the backend by setting:
+ *
+ * /local/domain/<domid>/device/vif/<vif>/ctrl-ring-ref = <gref>
+ * /local/domain/<domid>/device/vif/<vif>/event-channel-ctrl = <port>
+ *
+ * where <gref> is the grant reference of the shared page used to
+ * implement the control ring and <port> is an event channel to be used
+ * as a mailbox interrupt. These keys must be set before the frontend
+ * moves into the connected state.
+ *
+ * The control ring uses a fixed request/response message size and is
+ * balanced (i.e. one request to one response), so operationally it is much
+ * the same as a transmit or receive ring.
+ * Note that there is no requirement that responses are issued in the same
+ * order as requests.
+ */
+
+/*
+ * Hash types
+ * ==========
+ *
+ * For the purposes of the definitions below, 'Packet[]' is an array of
+ * octets containing an IP packet without options, 'Array[X..Y]' means a
+ * sub-array of 'Array' containing bytes X thru Y inclusive, and '+' is
+ * used to indicate concatenation of arrays.
+ */
+
+/*
+ * A hash calculated over an IP version 4 header as follows:
+ *
+ * Buffer[0..7] = Packet[12..15] (source address) +
+ *                Packet[16..19] (destination address)
+ *
+ * Result = Hash(Buffer, 8)
+ */
+#define _XEN_NETIF_CTRL_HASH_TYPE_IPV4 0
+#define XEN_NETIF_CTRL_HASH_TYPE_IPV4 \
+	(1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV4)
+
+/*
+ * A hash calculated over an IP version 4 header and TCP header as
+ * follows:
+ *
+ * Buffer[0..11] = Packet[12..15] (source address) +
+ *                 Packet[16..19] (destination address) +
+ *                 Packet[20..21] (source port) +
+ *                 Packet[22..23] (destination port)
+ *
+ * Result = Hash(Buffer, 12)
+ */
+#define _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP 1
+#define XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP \
+	(1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP)
+
+/*
+ * A hash calculated over an IP version 6 header as follows:
+ *
+ * Buffer[0..31] = Packet[8..23]  (source address) +
+ *                 Packet[24..39] (destination address)
+ *
+ * Result = Hash(Buffer, 32)
+ */
+#define _XEN_NETIF_CTRL_HASH_TYPE_IPV6 2
+#define XEN_NETIF_CTRL_HASH_TYPE_IPV6 \
+	(1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV6)
+
+/*
+ * A hash calculated over an IP version 6 header and TCP header as
+ * follows:
+ *
+ * Buffer[0..35] = Packet[8..23]  (source address) +
+ *                 Packet[24..39] (destination address) +
+ *                 Packet[40..41] (source port) +
+ *                 Packet[42..43] (destination port)
+ *
+ * Result = Hash(Buffer, 36)
+ */
+#define _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP 3
+#define XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP \
+	(1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP)
+
+/*
+ * Hash algorithms
+ * ===============
+ */
+
+#define XEN_NETIF_CTRL_HASH_ALGORITHM_NONE 0
+
+/*
+ * Toeplitz hash:
+ */
+
+#define XEN_NETIF_CTRL_HASH_ALGORITHM_TOEPLITZ 1
+
+/*
+ * This algorithm uses a 'key' as well as the data buffer itself.
+ * (Buffer[] and Key[] are treated as shift-registers where the MSB of
+ * Buffer/Key[0] is considered 'left-most' and the LSB of Buffer/Key[N-1]
+ * is the 'right-most').
+ *
+ * Value = 0
+ * For number of bits in Buffer[]
+ *    If (left-most bit of Buffer[] is 1)
+ *        Value ^= left-most 32 bits of Key[]
+ *    Key[] << 1
+ *    Buffer[] << 1
+ *
+ * The code below is provided for convenience where an operating system
+ * does not already provide an implementation.
+ */
+#ifdef XEN_NETIF_DEFINE_TOEPLITZ
+static uint32_t xen_netif_toeplitz_hash(const uint8_t *key,
+					unsigned int keylen,
+					const uint8_t *buf, unsigned int buflen)
+{
+	unsigned int keyi, bufi;
+	uint64_t prefix = 0;
+	uint64_t hash = 0;
+
+	/* Pre-load prefix with the first 8 bytes of the key */
+	for (keyi = 0; keyi < 8; keyi++) {
+		prefix <<= 8;
+		prefix |= (keyi < keylen) ? key[keyi] : 0;
+	}
+
+	for (bufi = 0; bufi < buflen; bufi++) {
+		uint8_t byte = buf[bufi];
+		unsigned int bit;
+
+		for (bit = 0; bit < 8; bit++) {
+			if (byte & 0x80)
+				hash ^= prefix;
+			prefix <<= 1;
+			byte <<= 1;
+		}
+
+		/*
+		 * 'prefix' has now been left-shifted by 8, so
+		 * OR in the next byte.
+		 */
+		prefix |= (keyi < keylen) ? key[keyi] : 0;
+		keyi++;
+	}
+
+	/* The valid part of the hash is in the upper 32 bits. */
+	return hash >> 32;
+}
+#endif				/* XEN_NETIF_DEFINE_TOEPLITZ */
+
+/*
+ * Control requests (struct xen_netif_ctrl_request)
+ * ================================================
+ *
+ * All requests have the following format:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |    id     |   type    |         data[0]       |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |         data[1]       |         data[2]       |
+ * +-----+-----+-----+-----+-----------------------+
+ *
+ * id: the request identifier, echoed in response.
+ * type: the type of request (see below)
+ * data[]: any data associated with the request (determined by type)
+ */
+
+struct xen_netif_ctrl_request {
+	uint16_t id;
+	uint16_t type;
+
+#define XEN_NETIF_CTRL_TYPE_INVALID               0
+#define XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS        1
+#define XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS        2
+#define XEN_NETIF_CTRL_TYPE_SET_HASH_KEY          3
+#define XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE 4
+#define XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE 5
+#define XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING      6
+#define XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM    7
+
+	uint32_t data[3];
+};
+
+/*
+ * Control responses (struct xen_netif_ctrl_response)
+ * ==================================================
+ *
+ * All responses have the following format:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |    id     |   type    |         status        |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |         data          |
+ * +-----+-----+-----+-----+
+ *
+ * id: the corresponding request identifier
+ * type: the type of the corresponding request
+ * status: the status of request processing
+ * data: any data associated with the response (determined by type and
+ *       status)
+ */
+
+struct xen_netif_ctrl_response {
+	uint16_t id;
+	uint16_t type;
+	uint32_t status;
+
+#define XEN_NETIF_CTRL_STATUS_SUCCESS           0
+#define XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     1
+#define XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER 2
+#define XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW   3
+
+	uint32_t data;
+};
+
+/*
+ * Control messages
+ * ================
+ *
+ * XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM
+ * --------------------------------------
+ *
+ * This is sent by the frontend to set the desired hash algorithm.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM
+ *  data[0] = a XEN_NETIF_CTRL_HASH_ALGORITHM_* value
+ *  data[1] = 0
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     - Operation not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - The algorithm is not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS           - Operation successful
+ *
+ * NOTE: Setting data[0] to XEN_NETIF_CTRL_HASH_ALGORITHM_NONE disables
+ *       hashing and the backend is free to choose how it steers packets
+ *       to queues (which is the default behaviour).
+ *
+ * XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS
+ * ----------------------------------
+ *
+ * This is sent by the frontend to query the types of hash supported by
+ * the backend.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS
+ *  data[0] = 0
+ *  data[1] = 0
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not supported
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS       - Operation successful
+ *  data   = supported hash types (if operation was successful)
+ *
+ * NOTE: A valid hash algorithm must be selected before this operation can
+ *       succeed.
+ *
+ * XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS
+ * ----------------------------------
+ *
+ * This is sent by the frontend to set the types of hash that the backend
+ * should calculate. (See above for hash type definitions).
+ * Note that the 'maximal' type of hash should always be chosen. For
+ * example, if the frontend sets both IPV4 and IPV4_TCP hash types then
+ * the latter hash type should be calculated for any TCP packet and the
+ * former only calculated for non-TCP packets.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS
+ *  data[0] = bitwise OR of XEN_NETIF_CTRL_HASH_TYPE_* values
+ *  data[1] = 0
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     - Operation not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - One or more flag
+ *                                                     values are invalid or
+ *                                                     unsupported
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS           - Operation successful
+ *  data   = 0
+ *
+ * NOTE: A valid hash algorithm must be selected before this operation can
+ *       succeed.
+ *       Also, setting data[0] to zero disables hashing and the backend
+ *       is free to choose how it steers packets to queues.
+ *
+ * XEN_NETIF_CTRL_TYPE_SET_HASH_KEY
+ * --------------------------------
+ *
+ * This is sent by the frontend to set the key of the hash if the algorithm
+ * requires it. (See hash algorithms above).
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_SET_HASH_KEY
+ *  data[0] = grant reference of page containing the key (assumed to
+ *            start at beginning of grant)
+ *  data[1] = size of key in octets
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     - Operation not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Key size is invalid
+ *           XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW   - Key size is larger
+ *                                                     than the backend
+ *                                                     supports
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS           - Operation successful
+ *  data   = 0
+ *
+ * NOTE: Any key octets not specified are assumed to be zero (the key
+ *       is assumed to be empty by default) and specifying a new key
+ *       invalidates any previous key, hence specifying a key size of
+ *       zero will clear the key (which ensures that the calculated hash
+ *       will always be zero).
+ *       The maximum size of key is algorithm and backend specific, but
+ *       is also limited by the single grant reference.
+ *       The grant reference may be read-only and must remain valid until
+ *       the response has been processed.
+ *
+ * XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE
+ * -----------------------------------------
+ *
+ * This is sent by the frontend to query the maximum size of mapping
+ * table supported by the backend. The size is specified in terms of
+ * table entries.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE
+ *  data[0] = 0
+ *  data[1] = 0
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not supported
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS       - Operation successful
+ *  data   = maximum number of entries allowed in the mapping table
+ *           (if operation was successful) or zero if a mapping table is
+ *           not supported (i.e. hash mapping is done only by modular
+ *           arithmetic).
+ *
+ * XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE
+ * -----------------------------------------
+ *
+ * This is sent by the frontend to set the actual size of the mapping
+ * table to be used by the backend. The size is specified in terms of
+ * table entries.
+ * Any previous table is invalidated by this message and any new table
+ * is assumed to be zero filled.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE
+ *  data[0] = number of entries in mapping table
+ *  data[1] = 0
+ *  data[2] = 0
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     - Operation not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Table size is invalid
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS           - Operation successful
+ *  data   = 0
+ *
+ * NOTE: Setting data[0] to 0 means that hash mapping should be done
+ *       using modular arithmetic.
+ *
+ * XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING
+ * ------------------------------------
+ *
+ * This is sent by the frontend to set the content of the table mapping
+ * hash value to queue number. The backend should calculate the hash from
+ * the packet header, use it as an index into the table (modulo the size
+ * of the table) and then steer the packet to the queue number found at
+ * that index.
+ *
+ * Request:
+ *
+ *  type    = XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING
+ *  data[0] = grant reference of page containing the mapping (sub-)table
+ *            (assumed to start at beginning of grant)
+ *  data[1] = size of (sub-)table in entries
+ *  data[2] = offset, in entries, of sub-table within overall table
+ *
+ * Response:
+ *
+ *  status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED     - Operation not
+ *                                                     supported
+ *           XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Table size or content
+ *                                                     is invalid
+ *           XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW   - Table size is larger
+ *                                                     than the backend
+ *                                                     supports
+ *           XEN_NETIF_CTRL_STATUS_SUCCESS           - Operation successful
+ *  data   = 0
+ *
+ * NOTE: The overall table has the following format:
+ *
+ *          0     1     2     3     4     5     6     7  octet
+ *       +-----+-----+-----+-----+-----+-----+-----+-----+
+ *       |       mapping[0]      |       mapping[1]      |
+ *       +-----+-----+-----+-----+-----+-----+-----+-----+
+ *       |                       .                       |
+ *       |                       .                       |
+ *       |                       .                       |
+ *       +-----+-----+-----+-----+-----+-----+-----+-----+
+ *       |      mapping[N-2]     |      mapping[N-1]     |
+ *       +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ *       where N is specified by a XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE
+ *       message and each mapping must specify a queue between 0 and
+ *       "multi-queue-num-queues" (see above).
+ *       The backend may support a mapping table larger than can be
+ *       mapped by a single grant reference. Thus sub-tables within a
+ *       larger table can be individually set by sending multiple messages
+ *       with differing offset values. Specifying a new sub-table does not
+ *       invalidate any table data outside that range.
+ *       The grant reference may be read-only and must remain valid until
+ *       the response has been processed.
+ */
+
+DEFINE_RING_TYPES(xen_netif_ctrl,
+		  struct xen_netif_ctrl_request,
+		  struct xen_netif_ctrl_response);
+
+/*
+ * Guest transmit
+ * ==============
+ *
+ * This is the 'wire' format for transmit (frontend -> backend) packets:
+ *
+ *  Fragment 1: xen_netif_tx_request_t  - flags = XEN_NETTXF_*
+ *                                    size = total packet size
+ * [Extra 1: xen_netif_extra_info_t]    - (only if fragment 1 flags include
+ *                                     XEN_NETTXF_extra_info)
+ *  ...
+ * [Extra N: xen_netif_extra_info_t]    - (only if extra N-1 flags include
+ *                                     XEN_NETIF_EXTRA_FLAG_MORE)
  *  ...
- *  Request N: xen_netif_tx_request  -- 0
+ *  Fragment N: xen_netif_tx_request_t  - (only if fragment N-1 flags include
+ *                                     XEN_NETTXF_more_data - flags on preceding
+ *                                     extras are not relevant here)
+ *                                    flags = 0
+ *                                    size = fragment size
+ *
+ * NOTE:
+ *
+ * This format is slightly different from that used for receive
+ * (backend -> frontend) packets. Specifically, in a multi-fragment
+ * packet the actual size of fragment 1 can only be determined by
+ * subtracting the sizes of fragments 2..N from the total packet size.
+ *
+ * Ring slot size is 12 octets, however not all request/response
+ * structs use the full size.
+ *
+ * tx request data (xen_netif_tx_request_t)
+ * ------------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | grant ref             | offset    | flags     |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | size      |
+ * +-----+-----+-----+-----+
+ *
+ * grant ref: Reference to buffer page.
+ * offset: Offset within buffer page.
+ * flags: XEN_NETTXF_*.
+ * id: request identifier, echoed in response.
+ * size: packet size in bytes.
+ *
+ * tx response (xen_netif_tx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | status    | unused                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | unused                |
+ * +-----+-----+-----+-----+
+ *
+ * id: reflects id in transmit request
+ * status: XEN_NETIF_RSP_*
+ *
+ * Guest receive
+ * =============
+ *
+ * This is the 'wire' format for receive (backend -> frontend) packets:
+ *
+ *  Fragment 1: xen_netif_rx_request_t  - flags = XEN_NETRXF_*
+ *                                    size = fragment size
+ * [Extra 1: xen_netif_extra_info_t]    - (only if fragment 1 flags include
+ *                                     XEN_NETRXF_extra_info)
+ *  ...
+ * [Extra N: xen_netif_extra_info_t]    - (only if extra N-1 flags include
+ *                                     XEN_NETIF_EXTRA_FLAG_MORE)
+ *  ...
+ *  Fragment N: xen_netif_rx_request_t  - (only if fragment N-1 flags include
+ *                                     XEN_NETRXF_more_data - flags on preceding
+ *                                     extras are not relevant here)
+ *                                    flags = 0
+ *                                    size = fragment size
+ *
+ * NOTE:
+ *
+ * This format is slightly different from that used for transmit
+ * (frontend -> backend) packets. Specifically, in a multi-fragment
+ * packet the size of the packet can only be determined by summing the
+ * sizes of fragments 1..N.
+ *
+ * Ring slot size is 8 octets.
+ *
+ * rx request (xen_netif_rx_request_t)
+ * -------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | pad       | gref                  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: request identifier, echoed in response.
+ * gref: reference to incoming granted frame.
+ *
+ * rx response (xen_netif_rx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | offset    | flags     | status    |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: reflects id in receive request
+ * offset: offset in page of start of received packet
+ * flags: XEN_NETRXF_*
+ * status: -ve: XEN_NETIF_RSP_*; +ve: Rx'ed pkt size.
+ *
+ * NOTE: Historically, to support GSO on the frontend receive side, Linux
+ *       netfront does not make use of the rx response id (because, as
+ *       described below, extra info structures overlay the id field).
+ *       Instead it assumes that responses always appear in the same ring
+ *       slot as their corresponding request. Thus, to maintain
+ *       compatibility, backends must make sure this is the case.
+ *
+ * Extra Info
+ * ==========
+ *
+ * Can be present if initial request or response has NET{T,R}XF_extra_info,
+ * or previous extra request has XEN_NETIF_EXTRA_FLAG_MORE.
+ *
+ * The struct therefore needs to fit into either a tx or rx slot and
+ * is therefore limited to 8 octets.
+ *
+ * NOTE: Because extra info data overlays the usual request/response
+ *       structures, there is no id information in the opposite direction.
+ *       So, if an extra info overlays an rx response the frontend can
+ *       assume that it is in the same ring slot as the request that was
+ *       consumed to make the slot available, and the backend must ensure
+ *       this assumption is true.
+ *
+ * extra info (xen_netif_extra_info_t)
+ * -------------------------------
+ *
+ * General format:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| type specific data                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | padding for tx        |
+ * +-----+-----+-----+-----+
+ *
+ * type: XEN_NETIF_EXTRA_TYPE_*
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * padding for tx: present only in the tx case due to 8 octet limit
+ *                 from rx case. Not shown in type specific entries
+ *                 below.
+ *
+ * XEN_NETIF_EXTRA_TYPE_GSO:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| size      |type | pad | features  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_GSO
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * size: Maximum payload size of each segment. For example,
+ *       for TCP this is just the path MSS.
+ * type: XEN_NETIF_GSO_TYPE_*: This determines the protocol of
+ *       the packet and any extra features required to segment the
+ *       packet properly.
+ * features: XEN_NETIF_GSO_FEAT_*: This specifies any extra GSO
+ *           features required to process this packet, such as ECN
+ *           support for TCPv4.
+ *
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| addr                              |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * addr: address to add/remove
+ *
+ * XEN_NETIF_EXTRA_TYPE_HASH:
+ *
+ * A backend that supports Toeplitz hashing is assumed to accept
+ * this type of extra info in transmit packets.
+ * A frontend that enables hashing is assumed to accept
+ * this type of extra info in receive packets.
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags|htype| alg |LSB ---- value ---- MSB|
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_HASH
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * htype: Hash type (one of _XEN_NETIF_CTRL_HASH_TYPE_* - see above)
+ * alg: The algorithm used to calculate the hash (one of
+ *      XEN_NETIF_CTRL_HASH_ALGORITHM_* - see above)
+ * value: Hash value
  */
 
 /* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _XEN_NETTXF_csum_blank		(0)
-#define  XEN_NETTXF_csum_blank		(1U<<_XEN_NETTXF_csum_blank)
+#define _XEN_NETTXF_csum_blank     (0)
+#define  XEN_NETTXF_csum_blank     (1U<<_XEN_NETTXF_csum_blank)
 
 /* Packet data has been validated against protocol checksum. */
-#define _XEN_NETTXF_data_validated	(1)
-#define  XEN_NETTXF_data_validated	(1U<<_XEN_NETTXF_data_validated)
+#define _XEN_NETTXF_data_validated (1)
+#define  XEN_NETTXF_data_validated (1U<<_XEN_NETTXF_data_validated)
 
 /* Packet continues in the next request descriptor. */
-#define _XEN_NETTXF_more_data		(2)
-#define  XEN_NETTXF_more_data		(1U<<_XEN_NETTXF_more_data)
+#define _XEN_NETTXF_more_data      (2)
+#define  XEN_NETTXF_more_data      (1U<<_XEN_NETTXF_more_data)
 
 /* Packet to be followed by extra descriptor(s). */
-#define _XEN_NETTXF_extra_info		(3)
-#define  XEN_NETTXF_extra_info		(1U<<_XEN_NETTXF_extra_info)
+#define _XEN_NETTXF_extra_info     (3)
+#define  XEN_NETTXF_extra_info     (1U<<_XEN_NETTXF_extra_info)
 
 #define XEN_NETIF_MAX_TX_SIZE 0xFFFF
 struct xen_netif_tx_request {
-    grant_ref_t gref;      /* Reference to buffer page */
-    uint16_t offset;       /* Offset within buffer page */
-    uint16_t flags;        /* XEN_NETTXF_* */
-    uint16_t id;           /* Echoed in response message. */
-    uint16_t size;         /* Packet size in bytes.       */
+	grant_ref_t gref;
+	uint16_t offset;
+	uint16_t flags;
+	uint16_t id;
+	uint16_t size;
 };
 
 /* Types of xen_netif_extra_info descriptors. */
-#define XEN_NETIF_EXTRA_TYPE_NONE	(0)  /* Never used - invalid */
-#define XEN_NETIF_EXTRA_TYPE_GSO	(1)  /* u.gso */
-#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD	(2)  /* u.mcast */
-#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL	(3)  /* u.mcast */
-#define XEN_NETIF_EXTRA_TYPE_MAX	(4)
+#define XEN_NETIF_EXTRA_TYPE_NONE      (0)	/* Never used - invalid */
+#define XEN_NETIF_EXTRA_TYPE_GSO       (1)	/* u.gso */
+#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2)	/* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3)	/* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_HASH      (4)	/* u.hash */
+#define XEN_NETIF_EXTRA_TYPE_MAX       (5)
 
-/* xen_netif_extra_info flags. */
-#define _XEN_NETIF_EXTRA_FLAG_MORE	(0)
-#define  XEN_NETIF_EXTRA_FLAG_MORE	(1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
+/* xen_netif_extra_info_t flags. */
+#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
+#define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
 
 /* GSO types */
-#define XEN_NETIF_GSO_TYPE_NONE		(0)
-#define XEN_NETIF_GSO_TYPE_TCPV4	(1)
-#define XEN_NETIF_GSO_TYPE_TCPV6	(2)
+#define XEN_NETIF_GSO_TYPE_NONE         (0)
+#define XEN_NETIF_GSO_TYPE_TCPV4        (1)
+#define XEN_NETIF_GSO_TYPE_TCPV6        (2)
 
 /*
- * This structure needs to fit within both netif_tx_request and
- * netif_rx_response for compatibility.
+ * This structure needs to fit within both xen_netif_tx_request_t and
+ * xen_netif_rx_response_t for compatibility.
  */
 struct xen_netif_extra_info {
-	uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
-	uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
-
+	uint8_t type;
+	uint8_t flags;
 	union {
 		struct {
-			/*
-			 * Maximum payload size of each segment. For
-			 * example, for TCP this is just the path MSS.
-			 */
 			uint16_t size;
-
-			/*
-			 * GSO type. This determines the protocol of
-			 * the packet and any extra features required
-			 * to segment the packet properly.
-			 */
-			uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
-
-			/* Future expansion. */
+			uint8_t type;
 			uint8_t pad;
-
-			/*
-			 * GSO features. This specifies any extra GSO
-			 * features required to process this packet,
-			 * such as ECN support for TCPv4.
-			 */
-			uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
+			uint16_t features;
 		} gso;
-
 		struct {
-			uint8_t addr[6]; /* Address to add/remove. */
+			uint8_t addr[6];
 		} mcast;
-
+		struct {
+			uint8_t type;
+			uint8_t algorithm;
+			uint8_t value[4];
+		} hash;
 		uint16_t pad[3];
 	} u;
 };
 
 struct xen_netif_tx_response {
 	uint16_t id;
-	int16_t  status;       /* XEN_NETIF_RSP_* */
+	int16_t status;
 };
 
 struct xen_netif_rx_request {
-	uint16_t    id;        /* Echoed in response message.        */
-	grant_ref_t gref;      /* Reference to incoming granted frame */
+	uint16_t id;		/* Echoed in response message.        */
+	uint16_t pad;
+	grant_ref_t gref;
 };
 
 /* Packet data has been validated against protocol checksum. */
-#define _XEN_NETRXF_data_validated	(0)
-#define  XEN_NETRXF_data_validated	(1U<<_XEN_NETRXF_data_validated)
+#define _XEN_NETRXF_data_validated (0)
+#define  XEN_NETRXF_data_validated (1U<<_XEN_NETRXF_data_validated)
 
 /* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _XEN_NETRXF_csum_blank		(1)
-#define  XEN_NETRXF_csum_blank		(1U<<_XEN_NETRXF_csum_blank)
+#define _XEN_NETRXF_csum_blank     (1)
+#define  XEN_NETRXF_csum_blank     (1U<<_XEN_NETRXF_csum_blank)
 
 /* Packet continues in the next request descriptor. */
-#define _XEN_NETRXF_more_data		(2)
-#define  XEN_NETRXF_more_data		(1U<<_XEN_NETRXF_more_data)
+#define _XEN_NETRXF_more_data      (2)
+#define  XEN_NETRXF_more_data      (1U<<_XEN_NETRXF_more_data)
 
 /* Packet to be followed by extra descriptor(s). */
-#define _XEN_NETRXF_extra_info		(3)
-#define  XEN_NETRXF_extra_info		(1U<<_XEN_NETRXF_extra_info)
+#define _XEN_NETRXF_extra_info     (3)
+#define  XEN_NETRXF_extra_info     (1U<<_XEN_NETRXF_extra_info)
 
-/* GSO Prefix descriptor. */
-#define _XEN_NETRXF_gso_prefix		(4)
-#define  XEN_NETRXF_gso_prefix		(1U<<_XEN_NETRXF_gso_prefix)
+/* Packet has GSO prefix. Deprecated but included for compatibility */
+#define _XEN_NETRXF_gso_prefix     (4)
+#define  XEN_NETRXF_gso_prefix     (1U<<_XEN_NETRXF_gso_prefix)
 
 struct xen_netif_rx_response {
-    uint16_t id;
-    uint16_t offset;       /* Offset in page of start of received packet  */
-    uint16_t flags;        /* XEN_NETRXF_* */
-    int16_t  status;       /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
+	uint16_t id;
+	uint16_t offset;
+	uint16_t flags;
+	int16_t status;
 };
 
 /*
- * Generate netif ring structures and types.
+ * Generate xen_netif ring structures and types.
  */
 
-DEFINE_RING_TYPES(xen_netif_tx,
-		  struct xen_netif_tx_request,
+DEFINE_RING_TYPES(xen_netif_tx, struct xen_netif_tx_request,
 		  struct xen_netif_tx_response);
-DEFINE_RING_TYPES(xen_netif_rx,
-		  struct xen_netif_rx_request,
+DEFINE_RING_TYPES(xen_netif_rx, struct xen_netif_rx_request,
 		  struct xen_netif_rx_response);
 
-#define XEN_NETIF_RSP_DROPPED	-2
-#define XEN_NETIF_RSP_ERROR	-1
-#define XEN_NETIF_RSP_OKAY	 0
-/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */
-#define XEN_NETIF_RSP_NULL	 1
+#define XEN_NETIF_RSP_DROPPED         -2
+#define XEN_NETIF_RSP_ERROR           -1
+#define XEN_NETIF_RSP_OKAY             0
+/* No response: used for auxiliary requests (e.g., xen_netif_extra_info_t). */
+#define XEN_NETIF_RSP_NULL             1
 
 #endif
-- 
cgit v1.2.3


From 470c3822d2ab7fadcbb1ac317ef27b31caac370e Mon Sep 17 00:00:00 2001
From: LABBE Corentin <clabbe.montjoie@gmail.com>
Date: Thu, 10 Mar 2016 13:58:58 +0100
Subject: phy: remove documentation of removed members of phy_device structure

Commit e5a03bfd873c ("phy: Add an mdio_device structure") removed the addr,
bus and dev members of the phy_device structure.
This patch removes the documentation about those members.

Signed-off-by: LABBE Corentin <clabbe.montjoie@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index d6f3641e7933..2abd7918f64f 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -327,8 +327,6 @@ struct phy_c45_device_ids {
 /* phy_device: An instance of a PHY
  *
  * drv: Pointer to the driver for this PHY instance
- * bus: Pointer to the bus this PHY is on
- * dev: driver model device structure for this PHY
  * phy_id: UID for this device found during discovery
  * c45_ids: 802.3-c45 Device Identifers if is_c45.
  * is_c45:  Set to true if this phy uses clause 45 addressing.
@@ -338,7 +336,6 @@ struct phy_c45_device_ids {
  * suspended: Set to true if this phy has been suspended successfully.
  * state: state of the PHY for management purposes
  * dev_flags: Device-specific flags used by the PHY driver.
- * addr: Bus address of PHY
  * link_timeout: The number of timer firings to wait before the
  * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
-- 
cgit v1.2.3


From cea8768f333e3f0bc231d8b815aa4a9e63fa990c Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 10 Mar 2016 18:33:07 -0300
Subject: sctp: allow sctp_transmit_packet and others to use gfp

Currently sctp_sendmsg() triggers some calls that will allocate memory
with GFP_ATOMIC even when not necessary. In the case of
sctp_packet_transmit it will allocate a linear skb that will be used to
construct the packet and this may cause sends to fail due to ENOMEM more
often than anticipated, especially with big MTUs.

This patch thus allows it to inherit gfp flags from upper calls so that
it can use GFP_KERNEL if it was triggered by a sctp_sendmsg call or
similar. All others, like retransmits or flushes started from BH, are
still allocated using GFP_ATOMIC.

In netperf tests this didn't result in any performance drawbacks when
memory is not too fragmented and made it trigger ENOMEM way less often.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h      |  2 +-
 include/net/sctp/structs.h | 10 +++---
 net/sctp/associola.c       |  2 +-
 net/sctp/chunk.c           |  6 ++--
 net/sctp/input.c           |  2 +-
 net/sctp/output.c          |  6 ++--
 net/sctp/outqueue.c        | 30 ++++++++---------
 net/sctp/sm_make_chunk.c   | 80 +++++++++++++++++++++++++++-------------------
 net/sctp/sm_sideeffect.c   | 23 ++++++-------
 9 files changed, 89 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 487ef34bbd63..efc01743b9d6 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -201,7 +201,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *,
 struct sctp_chunk * sctp_make_datafrag_empty(struct sctp_association *,
 					const struct sctp_sndrcvinfo *sinfo,
 					int len, const __u8 flags,
-					__u16 ssn);
+					__u16 ssn, gfp_t gfp);
 struct sctp_chunk *sctp_make_ecne(const struct sctp_association *,
 				  const __u32);
 struct sctp_chunk *sctp_make_sack(const struct sctp_association *);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d05b56641abc..9d237669c52c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -655,7 +655,7 @@ void sctp_chunk_free(struct sctp_chunk *);
 void  *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data);
 struct sctp_chunk *sctp_chunkify(struct sk_buff *,
 				 const struct sctp_association *,
-				 struct sock *);
+				 struct sock *, gfp_t gfp);
 void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *,
 		     union sctp_addr *);
 const union sctp_addr *sctp_source(const struct sctp_chunk *chunk);
@@ -717,10 +717,10 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *,
 				     __u16 sport, __u16 dport);
 struct sctp_packet *sctp_packet_config(struct sctp_packet *, __u32 vtag, int);
 sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *,
-                                       struct sctp_chunk *, int);
+				       struct sctp_chunk *, int, gfp_t);
 sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *,
                                      struct sctp_chunk *);
-int sctp_packet_transmit(struct sctp_packet *);
+int sctp_packet_transmit(struct sctp_packet *, gfp_t);
 void sctp_packet_free(struct sctp_packet *);
 
 static inline int sctp_packet_empty(struct sctp_packet *packet)
@@ -1053,7 +1053,7 @@ struct sctp_outq {
 void sctp_outq_init(struct sctp_association *, struct sctp_outq *);
 void sctp_outq_teardown(struct sctp_outq *);
 void sctp_outq_free(struct sctp_outq*);
-int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk);
+int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t);
 int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *);
 int sctp_outq_is_empty(const struct sctp_outq *);
 void sctp_outq_restart(struct sctp_outq *);
@@ -1061,7 +1061,7 @@ void sctp_outq_restart(struct sctp_outq *);
 void sctp_retransmit(struct sctp_outq *, struct sctp_transport *,
 		     sctp_retransmit_reason_t);
 void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
-int sctp_outq_uncork(struct sctp_outq *);
+int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
 /* Uncork and flush an outqueue.  */
 static inline void sctp_outq_cork(struct sctp_outq *q)
 {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index cd873446433c..a19b3e607703 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1493,7 +1493,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
 
 		asoc->peer.sack_needed = 0;
 
-		sctp_outq_tail(&asoc->outqueue, sack);
+		sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC);
 
 		/* Stop the SACK timer.  */
 		timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 3aa43073e0b9..958ef5f33f4b 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -260,7 +260,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 				frag |= SCTP_DATA_SACK_IMM;
 		}
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0);
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
+						 0, GFP_KERNEL);
 
 		if (!chunk) {
 			err = -ENOMEM;
@@ -296,7 +297,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
 			frag |= SCTP_DATA_SACK_IMM;
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0);
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
+						 0, GFP_KERNEL);
 
 		if (!chunk) {
 			err = -ENOMEM;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 21a2d6b7abaf..db76f1ab4ac2 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -221,7 +221,7 @@ int sctp_rcv(struct sk_buff *skb)
 		goto discard_release;
 
 	/* Create an SCTP packet structure. */
-	chunk = sctp_chunkify(skb, asoc, sk);
+	chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC);
 	if (!chunk)
 		goto discard_release;
 	SCTP_INPUT_CB(skb)->chunk = chunk;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 9d610eddd19e..736c004abfbc 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -153,7 +153,7 @@ void sctp_packet_free(struct sctp_packet *packet)
  */
 sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 				       struct sctp_chunk *chunk,
-				       int one_packet)
+				       int one_packet, gfp_t gfp)
 {
 	sctp_xmit_t retval;
 	int error = 0;
@@ -163,7 +163,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 	switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
 	case SCTP_XMIT_PMTU_FULL:
 		if (!packet->has_cookie_echo) {
-			error = sctp_packet_transmit(packet);
+			error = sctp_packet_transmit(packet, gfp);
 			if (error < 0)
 				chunk->skb->sk->sk_err = -error;
 
@@ -376,7 +376,7 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
  *
  * The return value is a normal kernel error return value.
  */
-int sctp_packet_transmit(struct sctp_packet *packet)
+int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 {
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c0380cfb16ae..f03541d0f12d 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
 
 static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
 
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout);
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
 static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,7 +285,7 @@ void sctp_outq_free(struct sctp_outq *q)
 }
 
 /* Put a new chunk in an sctp_outq.  */
-int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
+int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
 {
 	struct net *net = sock_net(q->asoc->base.sk);
 	int error = 0;
@@ -341,7 +341,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
 		return error;
 
 	if (!q->cork)
-		error = sctp_outq_flush(q, 0);
+		error = sctp_outq_flush(q, 0, gfp);
 
 	return error;
 }
@@ -510,7 +510,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	 * will be flushed at the end.
 	 */
 	if (reason != SCTP_RTXR_FAST_RTX)
-		error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+		error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
 
 	if (error)
 		q->asoc->base.sk->sk_err = -error;
@@ -601,12 +601,12 @@ redo:
 				 * control chunks are already freed so there
 				 * is nothing we can do.
 				 */
-				sctp_packet_transmit(pkt);
+				sctp_packet_transmit(pkt, GFP_ATOMIC);
 				goto redo;
 			}
 
 			/* Send this packet.  */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* If we are retransmitting, we should only
 			 * send a single packet.
@@ -622,7 +622,7 @@ redo:
 
 		case SCTP_XMIT_RWND_FULL:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
@@ -632,7 +632,7 @@ redo:
 
 		case SCTP_XMIT_DELAY:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* Stop sending DATA because of nagle delay. */
 			done = 1;
@@ -685,12 +685,12 @@ redo:
 }
 
 /* Cork the outqueue so queued chunks are really queued. */
-int sctp_outq_uncork(struct sctp_outq *q)
+int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 {
 	if (q->cork)
 		q->cork = 0;
 
-	return sctp_outq_flush(q, 0);
+	return sctp_outq_flush(q, 0, gfp);
 }
 
 
@@ -703,7 +703,7 @@ int sctp_outq_uncork(struct sctp_outq *q)
  * locking concerns must be made.  Today we use the sock lock to protect
  * this function.
  */
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
 	struct sctp_packet singleton;
@@ -825,7 +825,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			sctp_packet_init(&singleton, transport, sport, dport);
 			sctp_packet_config(&singleton, vtag, 0);
 			sctp_packet_append_chunk(&singleton, chunk);
-			error = sctp_packet_transmit(&singleton);
+			error = sctp_packet_transmit(&singleton, gfp);
 			if (error < 0)
 				return error;
 			break;
@@ -856,7 +856,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
 			status = sctp_packet_transmit_chunk(packet, chunk,
-							    one_packet);
+							    one_packet, gfp);
 			if (status  != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
@@ -1011,7 +1011,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 				 atomic_read(&chunk->skb->users) : -1);
 
 			/* Add the chunk to the packet.  */
-			status = sctp_packet_transmit_chunk(packet, chunk, 0);
+			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
 
 			switch (status) {
 			case SCTP_XMIT_PMTU_FULL:
@@ -1088,7 +1088,7 @@ sctp_flush_out:
 						      send_ready);
 		packet = &t->packet;
 		if (!sctp_packet_empty(packet))
-			error = sctp_packet_transmit(packet);
+			error = sctp_packet_transmit(packet, gfp);
 
 		/* Clear the burst limited state, if any */
 		sctp_transport_burst_reset(t);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5d6a03fad378..8449ca26aa0b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -62,11 +62,13 @@
 #include <net/sctp/sm.h>
 
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen);
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp);
 static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
-					 __u8 flags, int paylen);
+					 __u8 flags, int paylen, gfp_t gfp);
 static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
-					   __u8 type, __u8 flags, int paylen);
+					   __u8 type, __u8 flags, int paylen,
+					   gfp_t gfp);
 static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 					const struct sctp_association *asoc,
 					const struct sctp_chunk *init_chunk,
@@ -318,7 +320,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	 * PLEASE DO NOT FIXME [This version does not support Host Name.]
 	 */
 
-	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp);
 	if (!retval)
 		goto nodata;
 
@@ -465,7 +467,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 					num_ext);
 
 	/* Now allocate and fill out the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
 	if (!retval)
 		goto nomem_chunk;
 
@@ -570,7 +572,8 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
 	cookie_len = asoc->peer.cookie_len;
 
 	/* Build a cookie echo chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0,
+				   cookie_len, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 	retval->subh.cookie_hdr =
@@ -615,7 +618,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -664,7 +667,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 
 	cwr.lowest_tsn = htonl(lowest_tsn);
 	retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
-				   sizeof(sctp_cwrhdr_t));
+				   sizeof(sctp_cwrhdr_t), GFP_ATOMIC);
 
 	if (!retval)
 		goto nodata;
@@ -698,7 +701,7 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
 
 	ecne.lowest_tsn = htonl(lowest_tsn);
 	retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
-				   sizeof(sctp_ecnehdr_t));
+				   sizeof(sctp_ecnehdr_t), GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 	retval->subh.ecne_hdr =
@@ -713,7 +716,8 @@ nodata:
  */
 struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 				       const struct sctp_sndrcvinfo *sinfo,
-				       int data_len, __u8 flags, __u16 ssn)
+				       int data_len, __u8 flags, __u16 ssn,
+				       gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	struct sctp_datahdr dp;
@@ -734,7 +738,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 		dp.ssn = htons(ssn);
 
 	chunk_len = sizeof(dp) + data_len;
-	retval = sctp_make_data(asoc, flags, chunk_len);
+	retval = sctp_make_data(asoc, flags, chunk_len, gfp);
 	if (!retval)
 		goto nodata;
 
@@ -781,7 +785,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 		+ sizeof(__u32) * num_dup_tsns;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len);
+	retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -861,7 +865,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
 	shut.cum_tsn_ack = htonl(ctsn);
 
 	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
-				   sizeof(sctp_shutdownhdr_t));
+				   sizeof(sctp_shutdownhdr_t), GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -879,7 +883,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0,
+				   GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -908,7 +913,8 @@ struct sctp_chunk *sctp_make_shutdown_complete(
 	 */
 	flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T;
 
-	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags,
+				   0, GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -947,7 +953,8 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
 			flags = SCTP_CHUNK_FLAG_T;
 	}
 
-	retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint,
+				   GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -1139,7 +1146,8 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
 	struct sctp_chunk *retval;
 	sctp_sender_hb_info_t hbinfo;
 
-	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
+	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0,
+				   sizeof(hbinfo), GFP_ATOMIC);
 
 	if (!retval)
 		goto nodata;
@@ -1167,7 +1175,8 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval  = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+	retval  = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen,
+				    GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -1200,7 +1209,7 @@ static struct sctp_chunk *sctp_make_op_error_space(
 	struct sctp_chunk *retval;
 
 	retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
-				   sizeof(sctp_errhdr_t) + size);
+				   sizeof(sctp_errhdr_t) + size, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -1271,7 +1280,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 		return NULL;
 
 	retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
-			hmac_desc->hmac_len + sizeof(sctp_authhdr_t));
+			hmac_desc->hmac_len + sizeof(sctp_authhdr_t),
+			GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -1309,11 +1319,11 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
  */
 struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
 			    const struct sctp_association *asoc,
-			    struct sock *sk)
+			    struct sock *sk, gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 
-	retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC);
+	retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp);
 
 	if (!retval)
 		goto nodata;
@@ -1361,7 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
  * arguments, reserving enough space for a 'paylen' byte payload.
  */
 static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen)
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	sctp_chunkhdr_t *chunk_hdr;
@@ -1369,8 +1380,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	struct sock *sk;
 
 	/* No need to allocate LL here, as this is only a chunk. */
-	skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen),
-			GFP_ATOMIC);
+	skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
 	if (!skb)
 		goto nodata;
 
@@ -1381,7 +1391,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t));
 
 	sk = asoc ? asoc->base.sk : NULL;
-	retval = sctp_chunkify(skb, asoc, sk);
+	retval = sctp_chunkify(skb, asoc, sk, gfp);
 	if (!retval) {
 		kfree_skb(skb);
 		goto nodata;
@@ -1400,16 +1410,18 @@ nodata:
 }
 
 static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
-					 __u8 flags, int paylen)
+					 __u8 flags, int paylen, gfp_t gfp)
 {
-	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen);
+	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
 }
 
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen)
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp)
 {
-	struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen);
+	struct sctp_chunk *chunk;
 
+	chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp);
 	if (chunk)
 		sctp_control_set_owner_w(chunk);
 
@@ -2756,7 +2768,8 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
 	length += addrlen;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length,
+				   GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -2940,7 +2953,8 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as
 	int			length = sizeof(asconf) + vparam_len;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length,
+				   GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -3500,7 +3514,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	hint = (nstreams + 1) * sizeof(__u32);
 
-	retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC);
 
 	if (!retval)
 		return NULL;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index b5327bb77458..3c22c41a2bc2 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1019,13 +1019,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
  * encouraged for small fragments.
  */
 static int sctp_cmd_send_msg(struct sctp_association *asoc,
-				struct sctp_datamsg *msg)
+				struct sctp_datamsg *msg, gfp_t gfp)
 {
 	struct sctp_chunk *chunk;
 	int error = 0;
 
 	list_for_each_entry(chunk, &msg->chunks, frag_list) {
-		error = sctp_outq_tail(&asoc->outqueue, chunk);
+		error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
 		if (error)
 			break;
 	}
@@ -1249,7 +1249,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_NEW_ASOC:
 			/* Register a new association.  */
 			if (local_cork) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 
@@ -1269,7 +1269,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_DELETE_TCB:
 			if (local_cork) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 			/* Delete the current association.  */
@@ -1423,13 +1423,14 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				local_cork = 1;
 			}
 			/* Send a chunk to our peer.  */
-			error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk);
+			error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk,
+					       gfp);
 			break;
 
 		case SCTP_CMD_SEND_PKT:
 			/* Send a full packet to our peer.  */
 			packet = cmd->obj.packet;
-			sctp_packet_transmit(packet);
+			sctp_packet_transmit(packet, gfp);
 			sctp_ootb_pkt_free(packet);
 			break;
 
@@ -1639,7 +1640,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			 */
 			chunk->pdiscard = 1;
 			if (asoc) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 			break;
@@ -1677,7 +1678,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_FORCE_PRIM_RETRAN:
 			t = asoc->peer.retran_path;
 			asoc->peer.retran_path = asoc->peer.primary_path;
-			error = sctp_outq_uncork(&asoc->outqueue);
+			error = sctp_outq_uncork(&asoc->outqueue, gfp);
 			local_cork = 0;
 			asoc->peer.retran_path = t;
 			break;
@@ -1704,7 +1705,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				sctp_outq_cork(&asoc->outqueue);
 				local_cork = 1;
 			}
-			error = sctp_cmd_send_msg(asoc, cmd->obj.msg);
+			error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
 			break;
 		case SCTP_CMD_SEND_NEXT_ASCONF:
 			sctp_cmd_send_asconf(asoc);
@@ -1734,9 +1735,9 @@ out:
 	 */
 	if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
 		if (chunk->end_of_packet || chunk->singleton)
-			error = sctp_outq_uncork(&asoc->outqueue);
+			error = sctp_outq_uncork(&asoc->outqueue, gfp);
 	} else if (local_cork)
-		error = sctp_outq_uncork(&asoc->outqueue);
+		error = sctp_outq_uncork(&asoc->outqueue, gfp);
 	return error;
 nomem:
 	error = -ENOMEM;
-- 
cgit v1.2.3


From dece8d2b78d19df7fe5e4e965f1f0d1a3e188d1b Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 11 Mar 2016 18:07:31 +0100
Subject: uapi: add MACsec bits

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/Kbuild      |   1 +
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_link.h   |  29 ++++++++
 include/uapi/linux/if_macsec.h | 161 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 include/uapi/linux/if_macsec.h

(limited to 'include')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e624598..e25ebcfbcb48 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -173,6 +173,7 @@ header-y += if_hippi.h
 header-y += if_infiniband.h
 header-y += if_link.h
 header-y += if_ltalk.h
+header-y += if_macsec.h
 header-y += if_packet.h
 header-y += if_phonet.h
 header-y += if_plip.h
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index ea9221b0331a..4a93051c578c 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -83,6 +83,7 @@
 #define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
+#define ETH_P_MACSEC	0x88E5		/* 802.1ae MACsec */
 #define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_MVRP	0x88F5          /* 802.1Q MVRP                  */
 #define ETH_P_1588	0x88F7		/* IEEE 1588 Timesync */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 249eef9a21bd..8e3f88fa5b59 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -413,6 +413,35 @@ enum {
 
 #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
 
+/* MACSEC section */
+enum {
+	IFLA_MACSEC_UNSPEC,
+	IFLA_MACSEC_SCI,
+	IFLA_MACSEC_PORT,
+	IFLA_MACSEC_ICV_LEN,
+	IFLA_MACSEC_CIPHER_SUITE,
+	IFLA_MACSEC_WINDOW,
+	IFLA_MACSEC_ENCODING_SA,
+	IFLA_MACSEC_ENCRYPT,
+	IFLA_MACSEC_PROTECT,
+	IFLA_MACSEC_INC_SCI,
+	IFLA_MACSEC_ES,
+	IFLA_MACSEC_SCB,
+	IFLA_MACSEC_REPLAY_PROTECT,
+	IFLA_MACSEC_VALIDATION,
+	__IFLA_MACSEC_MAX,
+};
+
+#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
+
+enum macsec_validation_type {
+	MACSEC_VALIDATE_DISABLED = 0,
+	MACSEC_VALIDATE_CHECK = 1,
+	MACSEC_VALIDATE_STRICT = 2,
+	__MACSEC_VALIDATE_END,
+	MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1,
+};
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h
new file mode 100644
index 000000000000..26b0d1e3e3e7
--- /dev/null
+++ b/include/uapi/linux/if_macsec.h
@@ -0,0 +1,161 @@
+/*
+ * include/uapi/linux/if_macsec.h - MACsec device
+ *
+ * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_MACSEC_H
+#define _UAPI_MACSEC_H
+
+#include <linux/types.h>
+
+#define MACSEC_GENL_NAME "macsec"
+#define MACSEC_GENL_VERSION 1
+
+#define MACSEC_MAX_KEY_LEN 128
+
+#define DEFAULT_CIPHER_ID   0x0080020001000001ULL
+#define DEFAULT_CIPHER_ALT  0x0080C20001000001ULL
+
+#define MACSEC_MIN_ICV_LEN 8
+#define MACSEC_MAX_ICV_LEN 32
+
+enum macsec_attrs {
+	MACSEC_ATTR_UNSPEC,
+	MACSEC_ATTR_IFINDEX,     /* u32, ifindex of the MACsec netdevice */
+	MACSEC_ATTR_RXSC_CONFIG, /* config, nested macsec_rxsc_attrs */
+	MACSEC_ATTR_SA_CONFIG,   /* config, nested macsec_sa_attrs */
+	MACSEC_ATTR_SECY,        /* dump, nested macsec_secy_attrs */
+	MACSEC_ATTR_TXSA_LIST,   /* dump, nested, macsec_sa_attrs for each TXSA */
+	MACSEC_ATTR_RXSC_LIST,   /* dump, nested, macsec_rxsc_attrs for each RXSC */
+	MACSEC_ATTR_TXSC_STATS,  /* dump, nested, macsec_txsc_stats_attr */
+	MACSEC_ATTR_SECY_STATS,  /* dump, nested, macsec_secy_stats_attr */
+	__MACSEC_ATTR_END,
+	NUM_MACSEC_ATTR = __MACSEC_ATTR_END,
+	MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1,
+};
+
+enum macsec_secy_attrs {
+	MACSEC_SECY_ATTR_UNSPEC,
+	MACSEC_SECY_ATTR_SCI,
+	MACSEC_SECY_ATTR_ENCODING_SA,
+	MACSEC_SECY_ATTR_WINDOW,
+	MACSEC_SECY_ATTR_CIPHER_SUITE,
+	MACSEC_SECY_ATTR_ICV_LEN,
+	MACSEC_SECY_ATTR_PROTECT,
+	MACSEC_SECY_ATTR_REPLAY,
+	MACSEC_SECY_ATTR_OPER,
+	MACSEC_SECY_ATTR_VALIDATE,
+	MACSEC_SECY_ATTR_ENCRYPT,
+	MACSEC_SECY_ATTR_INC_SCI,
+	MACSEC_SECY_ATTR_ES,
+	MACSEC_SECY_ATTR_SCB,
+	__MACSEC_SECY_ATTR_END,
+	NUM_MACSEC_SECY_ATTR = __MACSEC_SECY_ATTR_END,
+	MACSEC_SECY_ATTR_MAX = __MACSEC_SECY_ATTR_END - 1,
+};
+
+enum macsec_rxsc_attrs {
+	MACSEC_RXSC_ATTR_UNSPEC,
+	MACSEC_RXSC_ATTR_SCI,     /* config/dump, u64 */
+	MACSEC_RXSC_ATTR_ACTIVE,  /* config/dump, u8 0..1 */
+	MACSEC_RXSC_ATTR_SA_LIST, /* dump, nested */
+	MACSEC_RXSC_ATTR_STATS,   /* dump, nested, macsec_rxsc_stats_attr */
+	__MACSEC_RXSC_ATTR_END,
+	NUM_MACSEC_RXSC_ATTR = __MACSEC_RXSC_ATTR_END,
+	MACSEC_RXSC_ATTR_MAX = __MACSEC_RXSC_ATTR_END - 1,
+};
+
+enum macsec_sa_attrs {
+	MACSEC_SA_ATTR_UNSPEC,
+	MACSEC_SA_ATTR_AN,     /* config/dump, u8 0..3 */
+	MACSEC_SA_ATTR_ACTIVE, /* config/dump, u8 0..1 */
+	MACSEC_SA_ATTR_PN,     /* config/dump, u32 */
+	MACSEC_SA_ATTR_KEY,    /* config, data */
+	MACSEC_SA_ATTR_KEYID,  /* config/dump, u64 */
+	MACSEC_SA_ATTR_STATS,  /* dump, nested, macsec_sa_stats_attr */
+	__MACSEC_SA_ATTR_END,
+	NUM_MACSEC_SA_ATTR = __MACSEC_SA_ATTR_END,
+	MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1,
+};
+
+enum macsec_nl_commands {
+	MACSEC_CMD_GET_TXSC,
+	MACSEC_CMD_ADD_RXSC,
+	MACSEC_CMD_DEL_RXSC,
+	MACSEC_CMD_UPD_RXSC,
+	MACSEC_CMD_ADD_TXSA,
+	MACSEC_CMD_DEL_TXSA,
+	MACSEC_CMD_UPD_TXSA,
+	MACSEC_CMD_ADD_RXSA,
+	MACSEC_CMD_DEL_RXSA,
+	MACSEC_CMD_UPD_RXSA,
+};
+
+/* u64 per-RXSC stats */
+enum macsec_rxsc_stats_attr {
+	MACSEC_RXSC_STATS_ATTR_UNSPEC,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	__MACSEC_RXSC_STATS_ATTR_END,
+	NUM_MACSEC_RXSC_STATS_ATTR = __MACSEC_RXSC_STATS_ATTR_END,
+	MACSEC_RXSC_STATS_ATTR_MAX = __MACSEC_RXSC_STATS_ATTR_END - 1,
+};
+
+/* u32 per-{RX,TX}SA stats */
+enum macsec_sa_stats_attr {
+	MACSEC_SA_STATS_ATTR_UNSPEC,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	__MACSEC_SA_STATS_ATTR_END,
+	NUM_MACSEC_SA_STATS_ATTR = __MACSEC_SA_STATS_ATTR_END,
+	MACSEC_SA_STATS_ATTR_MAX = __MACSEC_SA_STATS_ATTR_END - 1,
+};
+
+/* u64 per-TXSC stats */
+enum macsec_txsc_stats_attr {
+	MACSEC_TXSC_STATS_ATTR_UNSPEC,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED,
+	__MACSEC_TXSC_STATS_ATTR_END,
+	NUM_MACSEC_TXSC_STATS_ATTR = __MACSEC_TXSC_STATS_ATTR_END,
+	MACSEC_TXSC_STATS_ATTR_MAX = __MACSEC_TXSC_STATS_ATTR_END - 1,
+};
+
+/* u64 per-SecY stats */
+enum macsec_secy_stats_attr {
+	MACSEC_SECY_STATS_ATTR_UNSPEC,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN,
+	__MACSEC_SECY_STATS_ATTR_END,
+	NUM_MACSEC_SECY_STATS_ATTR = __MACSEC_SECY_STATS_ATTR_END,
+	MACSEC_SECY_STATS_ATTR_MAX = __MACSEC_SECY_STATS_ATTR_END - 1,
+};
+
+#endif /* _UAPI_MACSEC_H */
-- 
cgit v1.2.3


From 3c17578473b9be5a6e7680a45ea97e1d56e13249 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 11 Mar 2016 18:07:32 +0100
Subject: net: add MACsec netdevice priv_flags and helper

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 41df0b450757..be693b34662f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1328,6 +1328,7 @@ struct net_device_ops {
  * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
  * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
  *	entity (i.e. the master device for bridged veth)
+ * @IFF_MACSEC: device is a MACsec device
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1357,6 +1358,7 @@ enum netdev_priv_flags {
 	IFF_TEAM			= 1<<24,
 	IFF_RXFH_CONFIGURED		= 1<<25,
 	IFF_PHONY_HEADROOM		= 1<<26,
+	IFF_MACSEC			= 1<<27,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1385,6 +1387,7 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
 #define IFF_TEAM			IFF_TEAM
 #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
+#define IFF_MACSEC			IFF_MACSEC
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -4045,6 +4048,11 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 	skb->mac_len = mac_len;
 }
 
+static inline bool netif_is_macsec(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_MACSEC;
+}
+
 static inline bool netif_is_macvlan(const struct net_device *dev)
 {
 	return dev->priv_flags & IFF_MACVLAN;
-- 
cgit v1.2.3


From 01cfbad79a5e2b835abf6a8154a341d75a6fc8cd Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 11 Mar 2016 14:05:34 -0800
Subject: ipv4: Update parameters for csum_tcpudp_magic to their original types

This patch updates all instances of csum_tcpudp_magic and
csum_tcpudp_nofold to reflect the types that are usually used as the source
inputs.  For example, the protocol field is populated based on nexthdr,
which is actually an unsigned 8-bit value.  The length is usually populated
based on skb->len, which is an unsigned integer.

This addresses an issue in which the IPv6 function csum_ipv6_magic was
generating a checksum using the full 32 bits of skb->len while
csum_tcpudp_magic was only using the lower 16 bits.  As a result, we could
run into issues when attempting to adjust the checksum as there was no
protocol agnostic way to update it.

With this change the value is still truncated, as many architectures use
"(len + proto) << 8"; however, this truncation only occurs for values
greater than 16776960 in length and as such is unlikely to occur, as we
stop the inner headers at ~64K in size.

I did have to make a few minor changes in the arm, mn10300, nios2, and
score versions of the function in order to support these changes as they
were either using things such as an OR to combine the protocol and length,
or were using ntohs to convert the length which would have truncated the
value.

I also updated a few spots in terms of whitespace and type differences for
the addresses.  Most of this was just to make sure all of the definitions
were in sync going forward.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/checksum.h          |  9 +++------
 arch/alpha/lib/checksum.c                  |  8 ++------
 arch/arc/include/asm/checksum.h            |  4 ++--
 arch/arm/include/asm/checksum.h            | 10 +++++-----
 arch/avr32/include/asm/checksum.h          | 10 ++++------
 arch/blackfin/include/asm/checksum.h       |  4 ++--
 arch/c6x/include/asm/checksum.h            |  4 ++--
 arch/cris/include/arch-v10/arch/checksum.h |  4 ++--
 arch/cris/include/arch-v32/arch/checksum.h |  2 +-
 arch/cris/include/asm/checksum.h           |  5 ++---
 arch/frv/include/asm/checksum.h            |  8 ++++----
 arch/hexagon/include/asm/checksum.h        |  8 ++++----
 arch/hexagon/lib/checksum.c                | 10 ++++------
 arch/ia64/include/asm/checksum.h           | 12 ++++--------
 arch/ia64/lib/checksum.c                   |  8 ++++----
 arch/m32r/include/asm/checksum.h           | 10 ++++------
 arch/metag/include/asm/checksum.h          |  7 +++----
 arch/microblaze/include/asm/checksum.h     |  4 ++--
 arch/mips/include/asm/checksum.h           |  6 +++---
 arch/mn10300/include/asm/checksum.h        | 17 +++++------------
 arch/nios2/include/asm/checksum.h          |  9 ++++-----
 arch/parisc/include/asm/checksum.h         | 10 ++++------
 arch/s390/include/asm/checksum.h           |  6 ++----
 arch/score/include/asm/checksum.h          | 10 +++++-----
 arch/sh/include/asm/checksum_32.h          |  6 ++----
 arch/sparc/include/asm/checksum_32.h       | 10 ++++------
 arch/sparc/include/asm/checksum_64.h       |  6 ++----
 arch/unicore32/include/asm/checksum.h      |  4 ++--
 arch/x86/include/asm/checksum_32.h         |  6 ++----
 arch/x86/include/asm/checksum_64.h         |  8 ++++----
 arch/x86/um/asm/checksum.h                 |  9 ++++-----
 arch/xtensa/include/asm/checksum.h         | 10 ++++------
 include/asm-generic/checksum.h             |  8 ++++----
 lib/checksum.c                             |  4 +---
 34 files changed, 106 insertions(+), 150 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/include/asm/checksum.h b/arch/alpha/include/asm/checksum.h
index d3854bbf0a9e..cba34b1c738c 100644
--- a/arch/alpha/include/asm/checksum.h
+++ b/arch/alpha/include/asm/checksum.h
@@ -13,14 +13,11 @@ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented
  */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					   unsigned short len,
-					   unsigned short proto,
-					   __wsum sum);
+__sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+			  __u32 len, __u8 proto, __wsum sum);
 
 __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-				unsigned short len, unsigned short proto,
-				__wsum sum);
+			  __u32 len, __u8 proto, __wsum sum);
 
 /*
  * computes the checksum of a memory block at buff, length len,
diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c
index 199f6efa83fa..377f9e34eb97 100644
--- a/arch/alpha/lib/checksum.c
+++ b/arch/alpha/lib/checksum.c
@@ -42,9 +42,7 @@ static inline unsigned short from64to16(unsigned long x)
  * returns a 16-bit checksum, already complemented.
  */
 __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-				   unsigned short len,
-				   unsigned short proto,
-				   __wsum sum)
+			  __u32 len, __u8 proto, __wsum sum)
 {
 	return (__force __sum16)~from64to16(
 		(__force u64)saddr + (__force u64)daddr +
@@ -52,9 +50,7 @@ __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 }
 
 __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-				   unsigned short len,
-				   unsigned short proto,
-				   __wsum sum)
+			  __u32 len, __u8 proto, __wsum sum)
 {
 	unsigned long result;
 
diff --git a/arch/arc/include/asm/checksum.h b/arch/arc/include/asm/checksum.h
index 10957298b7a3..913eb4aab05b 100644
--- a/arch/arc/include/asm/checksum.h
+++ b/arch/arc/include/asm/checksum.h
@@ -70,8 +70,8 @@ ip_fast_csum(const void *iph, unsigned int ihl)
  * SA [4], DA [4], zeroes [1], Proto[1], TCP Seg(hdr+data) Len [2]
  */
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__(
 	"	add.f %0, %0, %1	\n"
diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h
index 523315115478..42d020b7dfba 100644
--- a/arch/arm/include/asm/checksum.h
+++ b/arch/arm/include/asm/checksum.h
@@ -84,10 +84,10 @@ ip_fast_csum(const void *iph, unsigned int ihl)
 }
 
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
-	u32 lenprot = len | proto << 16;
+	u32 lenprot = len + proto;
 	if (__builtin_constant_p(sum) && sum == 0) {
 		__asm__(
 		"adds	%0, %1, %2	@ csum_tcpudp_nofold0	\n\t"
@@ -121,8 +121,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
-		  unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/avr32/include/asm/checksum.h b/arch/avr32/include/asm/checksum.h
index 4ddbfd2486af..4ab7d5bdaf53 100644
--- a/arch/avr32/include/asm/checksum.h
+++ b/arch/avr32/include/asm/checksum.h
@@ -111,9 +111,8 @@ static inline __sum16 csum_fold(__wsum sum)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					       unsigned short len,
-					       unsigned short proto,
-					       __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	asm("	add	%0, %1\n"
 	    "	adc	%0, %0, %2\n"
@@ -132,9 +131,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/blackfin/include/asm/checksum.h b/arch/blackfin/include/asm/checksum.h
index 623cc7fb00bc..e7134bf94e3c 100644
--- a/arch/blackfin/include/asm/checksum.h
+++ b/arch/blackfin/include/asm/checksum.h
@@ -14,8 +14,8 @@
  */
 
 static inline __wsum
-__csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+__csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		     __u8 proto, __wsum sum)
 {
 	unsigned int carry;
 
diff --git a/arch/c6x/include/asm/checksum.h b/arch/c6x/include/asm/checksum.h
index 7246816d6e4d..249b0e421ddc 100644
--- a/arch/c6x/include/asm/checksum.h
+++ b/arch/c6x/include/asm/checksum.h
@@ -10,8 +10,8 @@
 #define _ASM_C6X_CHECKSUM_H
 
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	unsigned long long tmp;
 
diff --git a/arch/cris/include/arch-v10/arch/checksum.h b/arch/cris/include/arch-v10/arch/checksum.h
index b8000c5d7fe1..d1d1bd9e1090 100644
--- a/arch/cris/include/arch-v10/arch/checksum.h
+++ b/arch/cris/include/arch-v10/arch/checksum.h
@@ -9,8 +9,8 @@
  */
 
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	__wsum res;
 	__asm__ ("add.d %2, %0\n\t"
diff --git a/arch/cris/include/arch-v32/arch/checksum.h b/arch/cris/include/arch-v32/arch/checksum.h
index e5dcfce6e0dc..65cf205b1329 100644
--- a/arch/cris/include/arch-v32/arch/checksum.h
+++ b/arch/cris/include/arch-v32/arch/checksum.h
@@ -11,7 +11,7 @@
  */
 static inline __wsum
 csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-		   unsigned short len, unsigned short proto, __wsum sum)
+		   __u32 len, __u8 proto, __wsum sum)
 {
 	__wsum res;
 
diff --git a/arch/cris/include/asm/checksum.h b/arch/cris/include/asm/checksum.h
index 75dcb77d6cb0..ea949c60b190 100644
--- a/arch/cris/include/asm/checksum.h
+++ b/arch/cris/include/asm/checksum.h
@@ -63,9 +63,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
  */
 
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/frv/include/asm/checksum.h b/arch/frv/include/asm/checksum.h
index 269da09ff637..cd59cd4fd2d9 100644
--- a/arch/frv/include/asm/checksum.h
+++ b/arch/frv/include/asm/checksum.h
@@ -105,8 +105,8 @@ static inline __sum16 csum_fold(__wsum sum)
  * returns a 16-bit checksum, already complemented
  */
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		  unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	asm("	addcc		%1,%0,%0,icc0	\n"
 	    "	addxcc		%2,%0,%0,icc0	\n"
@@ -120,8 +120,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
 }
 
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
-		  unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/hexagon/include/asm/checksum.h b/arch/hexagon/include/asm/checksum.h
index 46ec8a7fd65f..d9f58d696238 100644
--- a/arch/hexagon/include/asm/checksum.h
+++ b/arch/hexagon/include/asm/checksum.h
@@ -38,12 +38,12 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst,
  * returns a 16-bit checksum, already complemented
  */
 #define csum_tcpudp_nofold csum_tcpudp_nofold
-__wsum csum_tcpudp_nofold(unsigned long saddr, unsigned long daddr,
-	unsigned short len, unsigned short proto, __wsum sum);
+__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+			  __u32 len, __u8 proto, __wsum sum);
 
 #define csum_tcpudp_magic csum_tcpudp_magic
-__sum16 csum_tcpudp_magic(unsigned long saddr, unsigned long daddr,
-	unsigned short len, unsigned short proto, __wsum sum);
+__sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+			  __u32 len, __u8 proto, __wsum sum);
 
 #include <asm-generic/checksum.h>
 
diff --git a/arch/hexagon/lib/checksum.c b/arch/hexagon/lib/checksum.c
index 8169f78a46a7..617506d1a559 100644
--- a/arch/hexagon/lib/checksum.c
+++ b/arch/hexagon/lib/checksum.c
@@ -60,18 +60,16 @@ static inline unsigned short from64to16(u64 x)
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented.
  */
-__sum16 csum_tcpudp_magic(unsigned long saddr, unsigned long daddr,
-			  unsigned short len, unsigned short proto,
-			  __wsum sum)
+__sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+			  __u32 len, __u8 proto, __wsum sum)
 {
 	return (__force __sum16)~from64to16(
 		(__force u64)saddr + (__force u64)daddr +
 		(__force u64)sum + ((len + proto) << 8));
 }
 
-__wsum csum_tcpudp_nofold(unsigned long saddr, unsigned long daddr,
-			  unsigned short len, unsigned short proto,
-			  __wsum sum)
+__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+			  __u32 len, __u8 proto, __wsum sum)
 {
 	u64 result;
 
diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h
index 97af155057e4..ac9c687e8384 100644
--- a/arch/ia64/include/asm/checksum.h
+++ b/arch/ia64/include/asm/checksum.h
@@ -16,15 +16,11 @@ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
  * Computes the checksum of the TCP/UDP pseudo-header returns a 16-bit
  * checksum, already complemented
  */
-extern __sum16 csum_tcpudp_magic (__be32 saddr, __be32 daddr,
-					     unsigned short len,
-					     unsigned short proto,
-					     __wsum sum);
+extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+				 __u32 len, __u8 proto, __wsum sum);
 
-extern __wsum csum_tcpudp_nofold (__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
-					__wsum sum);
+extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+				 __u32 len, __u8 proto, __wsum sum);
 
 /*
  * Computes the checksum of a memory block at buff, length len,
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
index 9fc955026f86..2cb23cb0c2e1 100644
--- a/arch/ia64/lib/checksum.c
+++ b/arch/ia64/lib/checksum.c
@@ -34,8 +34,8 @@ from64to16 (unsigned long x)
  * returns a 16-bit checksum, already complemented.
  */
 __sum16
-csum_tcpudp_magic (__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return (__force __sum16)~from64to16(
 		(__force u64)saddr + (__force u64)daddr +
@@ -45,8 +45,8 @@ csum_tcpudp_magic (__be32 saddr, __be32 daddr, unsigned short len,
 EXPORT_SYMBOL(csum_tcpudp_magic);
 
 __wsum
-csum_tcpudp_nofold (__be32 saddr, __be32 daddr, unsigned short len,
-		    unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	unsigned long result;
 
diff --git a/arch/m32r/include/asm/checksum.h b/arch/m32r/include/asm/checksum.h
index a7a7c4f44abe..d68e93c9bd62 100644
--- a/arch/m32r/include/asm/checksum.h
+++ b/arch/m32r/include/asm/checksum.h
@@ -114,9 +114,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					       unsigned short len,
-					       unsigned short proto,
-					       __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 #if defined(__LITTLE_ENDIAN)
 	unsigned long len_proto = (proto + len) << 8;
@@ -145,9 +144,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/metag/include/asm/checksum.h b/arch/metag/include/asm/checksum.h
index 08dd1cc65799..f65fe83b1730 100644
--- a/arch/metag/include/asm/checksum.h
+++ b/arch/metag/include/asm/checksum.h
@@ -59,8 +59,7 @@ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
  * returns a 16-bit checksum, already complemented
  */
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	unsigned long len_proto = (proto + len) << 8;
@@ -78,8 +77,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 }
 
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
-		  unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/microblaze/include/asm/checksum.h b/arch/microblaze/include/asm/checksum.h
index 0185cbefdda4..adeecebbb0d1 100644
--- a/arch/microblaze/include/asm/checksum.h
+++ b/arch/microblaze/include/asm/checksum.h
@@ -16,8 +16,8 @@
  */
 #define csum_tcpudp_nofold	csum_tcpudp_nofold
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	__asm__("add %0, %0, %1\n\t"
 		"addc %0, %0, %2\n\t"
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h
index 3ceacde5eb6e..c635541d40b8 100644
--- a/arch/mips/include/asm/checksum.h
+++ b/arch/mips/include/asm/checksum.h
@@ -160,9 +160,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
-static inline __wsum csum_tcpudp_nofold(__be32 saddr,
-	__be32 daddr, unsigned short len, unsigned short proto,
-	__wsum sum)
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	__asm__(
 	"	.set	push		# csum_tcpudp_nofold\n"
diff --git a/arch/mn10300/include/asm/checksum.h b/arch/mn10300/include/asm/checksum.h
index 9fb2a8d8826a..c80df5b504ac 100644
--- a/arch/mn10300/include/asm/checksum.h
+++ b/arch/mn10300/include/asm/checksum.h
@@ -37,16 +37,11 @@ static inline __sum16 csum_fold(__wsum sum)
 	return (~sum) >> 16;
 }
 
-static inline __wsum csum_tcpudp_nofold(unsigned long saddr,
-					unsigned long daddr,
-					unsigned short len,
-					unsigned short proto,
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
-	__wsum tmp;
-
-	tmp = (__wsum) ntohs(len) << 16;
-	tmp += (__wsum) proto << 8;
+	__wsum tmp = (__wsum)((len + proto) << 8);
 
 	asm(
 		"	add	%1,%0		\n"
@@ -64,10 +59,8 @@ static inline __wsum csum_tcpudp_nofold(unsigned long saddr,
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented
  */
-static inline __sum16 csum_tcpudp_magic(unsigned long saddr,
-					unsigned long daddr,
-					unsigned short len,
-					unsigned short proto,
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
diff --git a/arch/nios2/include/asm/checksum.h b/arch/nios2/include/asm/checksum.h
index 6bc1f0d5df7b..703c5ee63421 100644
--- a/arch/nios2/include/asm/checksum.h
+++ b/arch/nios2/include/asm/checksum.h
@@ -45,8 +45,7 @@ static inline __sum16 csum_fold(__wsum sum)
  */
 #define csum_tcpudp_nofold csum_tcpudp_nofold
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	__asm__ __volatile__(
@@ -60,7 +59,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 		"cmpltu	r8, %0, %3\n"
 		"add	%0, %0, r8\n"	/* add carry */
 		: "=r" (sum), "=r" (saddr)
-		: "r" (daddr), "r" ((ntohs(len) << 16) + (proto * 256)),
+		: "r" (daddr), "r" ((len + proto) << 8),
 		  "0" (sum),
 		  "1" (saddr)
 		: "r8");
@@ -69,8 +68,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 }
 
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto, __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h
index c84b2fcb18a9..9815ab1fc8aa 100644
--- a/arch/parisc/include/asm/checksum.h
+++ b/arch/parisc/include/asm/checksum.h
@@ -85,9 +85,8 @@ static inline __sum16 csum_fold(__wsum csum)
 }
  
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					       unsigned short len,
-					       unsigned short proto,
-					       __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	__asm__(
 	"	add  %1, %0, %0\n"
@@ -104,9 +103,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 740364856355..d7f100c53f07 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -91,8 +91,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
  * returns a 32-bit checksum
  */
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-                   unsigned short len, unsigned short proto,
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
                    __wsum sum)
 {
 	__u32 csum = (__force __u32)sum;
@@ -118,8 +117,7 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  */
 
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-                  unsigned short len, unsigned short proto,
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
                   __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h
index 961bd64015a8..a375bc2700be 100644
--- a/arch/score/include/asm/checksum.h
+++ b/arch/score/include/asm/checksum.h
@@ -127,10 +127,10 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
-	unsigned long tmp = (ntohs(len) << 16) + proto * 256;
+	unsigned long tmp = (len + proto) << 8;
 	__asm__ __volatile__(
 		".set volatile\n\t"
 		"add\t%0, %0, %2\n\t"
@@ -161,8 +161,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
-		unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h
index 14b7ac2f0a07..fd730f140c06 100644
--- a/arch/sh/include/asm/checksum_32.h
+++ b/arch/sh/include/asm/checksum_32.h
@@ -115,8 +115,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 #ifdef __LITTLE_ENDIAN__
@@ -142,8 +141,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h
index 426b2389a1c2..86ae655a3c0f 100644
--- a/arch/sparc/include/asm/checksum_32.h
+++ b/arch/sparc/include/asm/checksum_32.h
@@ -170,9 +170,8 @@ static inline __sum16 csum_fold(__wsum sum)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					       unsigned short len,
-					       unsigned short proto,
-					       __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	__asm__ __volatile__("addcc\t%1, %0, %0\n\t"
 			     "addxcc\t%2, %0, %0\n\t"
@@ -190,9 +189,8 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h
index b8779a6a5911..ef0c6f48189a 100644
--- a/arch/sparc/include/asm/checksum_64.h
+++ b/arch/sparc/include/asm/checksum_64.h
@@ -96,8 +96,7 @@ static inline __sum16 csum_fold(__wsum sum)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					unsigned int len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	__asm__ __volatile__(
@@ -116,8 +115,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h
index f55c3f937c3e..23ceb9e3a89b 100644
--- a/arch/unicore32/include/asm/checksum.h
+++ b/arch/unicore32/include/asm/checksum.h
@@ -20,8 +20,8 @@
  */
 
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	__asm__(
 	"add.a	%0, %1, %2\n"
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index f50de6951738..6f380605403d 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -112,8 +112,7 @@ static inline __sum16 csum_fold(__wsum sum)
 }
 
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	asm("addl %1, %0	;\n"
@@ -131,8 +130,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
+					__u32 len, __u8 proto,
 					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e1774491..97b98e2039bc 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
  * 32bit unfolded.
  */
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum)
 {
 	asm("  addl %1, %0\n"
 	    "  adcl %2, %0\n"
@@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
  * complemented and ready to be filled in.
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto, __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index ee940185e89f..54d96f1e3594 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -87,8 +87,8 @@ static inline __sum16 csum_fold(__wsum sum)
  * 32bit unfolded.
  */
 static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		   unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	asm("  addl %1, %0\n"
 	    "  adcl %2, %0\n"
@@ -104,9 +104,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
  * returns a 16-bit checksum, already complemented
  */
 static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					   unsigned short len,
-					   unsigned short proto,
-					   __wsum sum)
+					__u32 len, __u8 proto,
+					__wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h
index 0593de689b56..62254e6688f5 100644
--- a/arch/xtensa/include/asm/checksum.h
+++ b/arch/xtensa/include/asm/checksum.h
@@ -123,9 +123,8 @@ static __inline__ __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 
 static __inline__ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-						   unsigned short len,
-						   unsigned short proto,
-						   __wsum sum)
+					    __u32 len, __u8 proto,
+					    __wsum sum)
 {
 
 #ifdef __XTENSA_EL__
@@ -157,9 +156,8 @@ static __inline__ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
  * returns a 16-bit checksum, already complemented
  */
 static __inline__ __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-						       unsigned short len,
-						       unsigned short proto,
-						       __wsum sum)
+					    __u32 len, __u8 proto,
+					    __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
 }
diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
index 59811df58c5b..3150cbd8eb21 100644
--- a/include/asm-generic/checksum.h
+++ b/include/asm-generic/checksum.h
@@ -65,14 +65,14 @@ static inline __sum16 csum_fold(__wsum csum)
  * returns a 16-bit checksum, already complemented
  */
 extern __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
-		unsigned short proto, __wsum sum);
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+		   __u8 proto, __wsum sum);
 #endif
 
 #ifndef csum_tcpudp_magic
 static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
-		  unsigned short proto, __wsum sum)
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+		  __u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/lib/checksum.c b/lib/checksum.c
index 8b39e86dbab5..d3ec93f9e5f3 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -191,9 +191,7 @@ static inline u32 from64to32(u64 x)
 }
 
 __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-			unsigned short len,
-			unsigned short proto,
-			__wsum sum)
+			  __u32 len, __u8 proto, __wsum sum)
 {
 	unsigned long long s = (__force u32)sum;
 
-- 
cgit v1.2.3


From 1e94082963747b551b129528714827f76a090e93 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 11 Mar 2016 14:05:41 -0800
Subject: ipv6: Pass proto to csum_ipv6_magic as __u8 instead of unsigned short

This patch updates csum_ipv6_magic so that it correctly recognizes that
protocol is an unsigned 8-bit value.

This will allow us to better understand what limitations may or may not be
present in how we handle the data.  For example there are a number of
places that call htonl on the protocol value.  This is likely not necessary
and can be replaced with a multiplication by ntohl(1) which will be
converted to a shift by the compiler.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/checksum.h    | 3 +--
 arch/arm/include/asm/checksum.h      | 4 ++--
 arch/frv/include/asm/checksum.h      | 2 +-
 arch/ia64/include/asm/checksum.h     | 4 ++--
 arch/m68k/include/asm/checksum.h     | 2 +-
 arch/mips/include/asm/checksum.h     | 2 +-
 arch/parisc/include/asm/checksum.h   | 2 +-
 arch/score/include/asm/checksum.h    | 5 ++---
 arch/sh/include/asm/checksum_32.h    | 3 +--
 arch/sparc/include/asm/checksum_32.h | 3 +--
 arch/sparc/include/asm/checksum_64.h | 3 +--
 arch/x86/include/asm/checksum_32.h   | 3 +--
 arch/x86/include/asm/checksum_64.h   | 2 +-
 arch/x86/lib/csum-wrappers_64.c      | 2 +-
 arch/x86/um/asm/checksum_32.h        | 2 +-
 arch/xtensa/include/asm/checksum.h   | 2 +-
 include/net/ip6_checksum.h           | 3 +--
 net/ipv6/ip6_checksum.c              | 3 +--
 18 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/include/asm/checksum.h b/arch/alpha/include/asm/checksum.h
index cba34b1c738c..f2bbdd2ace51 100644
--- a/arch/alpha/include/asm/checksum.h
+++ b/arch/alpha/include/asm/checksum.h
@@ -67,6 +67,5 @@ static inline __sum16 csum_fold(__wsum csum)
 #define _HAVE_ARCH_IPV6_CSUM
 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			       const struct in6_addr *daddr,
-			       __u32 len, unsigned short proto,
-			       __wsum sum);
+			       __u32 len, __u8 proto, __wsum sum);
 #endif
diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h
index 42d020b7dfba..524692f4acab 100644
--- a/arch/arm/include/asm/checksum.h
+++ b/arch/arm/include/asm/checksum.h
@@ -144,8 +144,8 @@ __csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __
 		__be32 proto, __wsum sum);
 
 static inline __sum16
-csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len,
-		unsigned short proto, __wsum sum)
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+		__u32 len, __u8 proto, __wsum sum)
 {
 	return csum_fold(__csum_ipv6_magic(saddr, daddr, htonl(len),
 					   htonl(proto), sum));
diff --git a/arch/frv/include/asm/checksum.h b/arch/frv/include/asm/checksum.h
index cd59cd4fd2d9..b77388c5901d 100644
--- a/arch/frv/include/asm/checksum.h
+++ b/arch/frv/include/asm/checksum.h
@@ -135,7 +135,7 @@ extern __sum16 ip_compute_csum(const void *buff, int len);
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum)
+		__u32 len, __u8 proto, __wsum sum)
 {
 	unsigned long tmp, tmp2;
 
diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h
index ac9c687e8384..7accf54162b2 100644
--- a/arch/ia64/include/asm/checksum.h
+++ b/arch/ia64/include/asm/checksum.h
@@ -69,7 +69,7 @@ static inline __sum16 csum_fold(__wsum csum)
 #define _HAVE_ARCH_IPV6_CSUM	1
 struct in6_addr;
 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-	const struct in6_addr *daddr, __u32 len, unsigned short proto,
-	__wsum csum);
+			       const struct in6_addr *daddr,
+			       __u32 len, __u8 proto, __wsum csum);
 
 #endif /* _ASM_IA64_CHECKSUM_H */
diff --git a/arch/m68k/include/asm/checksum.h b/arch/m68k/include/asm/checksum.h
index 2f88d867c711..75e91f03b178 100644
--- a/arch/m68k/include/asm/checksum.h
+++ b/arch/m68k/include/asm/checksum.h
@@ -117,7 +117,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum)
+		__u32 len, __u8 proto, __wsum sum)
 {
 	register unsigned long tmp;
 	__asm__("addl %2@,%0\n\t"
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h
index c635541d40b8..bce1ce53149a 100644
--- a/arch/mips/include/asm/checksum.h
+++ b/arch/mips/include/asm/checksum.h
@@ -215,7 +215,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__wsum tmp;
diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h
index 9815ab1fc8aa..60c2c42619c9 100644
--- a/arch/parisc/include/asm/checksum.h
+++ b/arch/parisc/include/asm/checksum.h
@@ -122,7 +122,7 @@ static inline __sum16 ip_compute_csum(const void *buf, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__asm__ __volatile__ (
diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h
index a375bc2700be..539d9fd45d21 100644
--- a/arch/score/include/asm/checksum.h
+++ b/arch/score/include/asm/checksum.h
@@ -179,9 +179,8 @@ static inline unsigned short ip_compute_csum(const void *buff, int len)
 
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-				const struct in6_addr *daddr,
-				__u32 len, unsigned short proto,
-				__wsum sum)
+				      const struct in6_addr *daddr,
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__(
 		".set\tvolatile\t\t\t# csum_ipv6_magic\n\t"
diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h
index fd730f140c06..9c84386d35cb 100644
--- a/arch/sh/include/asm/checksum_32.h
+++ b/arch/sh/include/asm/checksum_32.h
@@ -159,8 +159,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	unsigned int __dummy;
 	__asm__("clrt\n\t"
diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h
index 86ae655a3c0f..eff748c871ec 100644
--- a/arch/sparc/include/asm/checksum_32.h
+++ b/arch/sparc/include/asm/checksum_32.h
@@ -199,8 +199,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__ (
 		"addcc	%3, %4, %%g4\n\t"
diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h
index ef0c6f48189a..0395d75322e9 100644
--- a/arch/sparc/include/asm/checksum_64.h
+++ b/arch/sparc/include/asm/checksum_64.h
@@ -125,8 +125,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__ (
 "	addcc		%3, %4, %%g7\n"
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 6f380605403d..532f85e6651f 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -149,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	asm("addl 0(%1), %0	;\n"
 	    "adcl 4(%1), %0	;\n"
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 97b98e2039bc..c020ee75dce7 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -177,7 +177,7 @@ struct in6_addr;
 #define _HAVE_ARCH_IPV6_CSUM 1
 extern __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum);
+		__u32 len, __u8 proto, __wsum sum);
 
 static inline unsigned add32_with_carry(unsigned a, unsigned b)
 {
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 1318f75d56e4..28a6654f0d08 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto, __wsum sum)
+			__u32 len, __u8 proto, __wsum sum)
 {
 	__u64 rest, sum64;
 
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
index ab77b6f9a4bf..83a75f8a1233 100644
--- a/arch/x86/um/asm/checksum_32.h
+++ b/arch/x86/um/asm/checksum_32.h
@@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__asm__(
diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h
index 62254e6688f5..ec35074fcb03 100644
--- a/arch/xtensa/include/asm/checksum.h
+++ b/arch/xtensa/include/asm/checksum.h
@@ -175,7 +175,7 @@ static __inline__ __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	unsigned int __dummy;
diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h
index 1a49b73f7f6e..cca840584c88 100644
--- a/include/net/ip6_checksum.h
+++ b/include/net/ip6_checksum.h
@@ -37,8 +37,7 @@
 #ifndef _HAVE_ARCH_IPV6_CSUM
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto,
-			__wsum csum);
+			__u32 len, __u8 proto, __wsum csum);
 #endif
 
 static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 8f920580976f..b2025bf3da4a 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -6,8 +6,7 @@
 #ifndef _HAVE_ARCH_IPV6_CSUM
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto,
-			__wsum csum)
+			__u32 len, __u8 proto, __wsum csum)
 {
 
 	int carry;
-- 
cgit v1.2.3


From f2900acea8018c4525ddaa86c7f7cd8afd3f0cc4 Mon Sep 17 00:00:00 2001
From: Marcin Wojtas <mw@semihalf.com>
Date: Mon, 14 Mar 2016 09:39:02 +0100
Subject: bus: mvebu-mbus: provide api for obtaining IO and DRAM window
 information

This commit enables finding appropriate mbus window and obtaining its
target id and attribute for given physical address in two separate
routines, both for IO and DRAM windows. This functionality
is needed for Armada XP/38x Network Controller's Buffer Manager and
PnC configuration.

[gregory.clement@free-electrons.com: Fix size test for
mvebu_mbus_get_dram_win_info]

Signed-off-by: Marcin Wojtas <mw@semihalf.com>
[DRAM window information reference in LKv3.10]
Signed-off-by: Evan Wang <xswang@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bus/mvebu-mbus.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mbus.h     |  3 +++
 2 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
index c43c3d2baf73..c2e52864bb03 100644
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -948,6 +948,58 @@ void mvebu_mbus_get_pcie_io_aperture(struct resource *res)
 	*res = mbus_state.pcie_io_aperture;
 }
 
+/* Find the DRAM window holding phyaddr; report its mbus target and attribute.
+ * Returns 0, -ENODEV without DRAM info, or -EINVAL if no window matches.
+ */
+int mvebu_mbus_get_dram_win_info(phys_addr_t phyaddr, u8 *target, u8 *attr)
+{
+	const struct mbus_dram_target_info *dram;
+	int i;
+
+	dram = mv_mbus_dram_info();
+	if (!dram) {
+		pr_err("missing DRAM information\n");
+		return -ENODEV;
+	}
+
+	for (i = 0; i < dram->num_cs; i++) {
+		const struct mbus_dram_window *cs = dram->cs + i;
+
+		if (cs->base <= phyaddr && phyaddr <= cs->base + cs->size - 1) {
+			*target = dram->mbus_dram_target_id;
+			*attr = cs->mbus_attr;
+			return 0;
+		}
+	}
+
+	pr_err("invalid dram address %pa\n", &phyaddr);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(mvebu_mbus_get_dram_win_info);
+
+/* Find the enabled mbus IO window containing phyaddr and return its index, or
+ * -EINVAL if there is none.  size/target/attr are handed to the window read.
+ */
+int mvebu_mbus_get_io_win_info(phys_addr_t phyaddr, u32 *size, u8 *target,
+			       u8 *attr)
+{
+	int win;
+
+	for (win = 0; win < mbus_state.soc->num_wins; win++) {
+		u64 wbase;
+		int enabled;
+
+		mvebu_mbus_read_window(&mbus_state, win, &enabled, &wbase,
+				       size, target, attr, NULL);
+		/* The window is [wbase, wbase + *size): its end is exclusive */
+		if (enabled && wbase <= phyaddr && phyaddr < wbase + *size)
+			return win;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(mvebu_mbus_get_io_win_info);
+
 static __init int mvebu_mbus_debugfs_init(void)
 {
 	struct mvebu_mbus_state *s = &mbus_state;
diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 1f7bc630d225..ea34a867caa0 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -69,6 +69,9 @@ static inline const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(vo
 int mvebu_mbus_save_cpu_target(u32 *store_addr);
 void mvebu_mbus_get_pcie_mem_aperture(struct resource *res);
 void mvebu_mbus_get_pcie_io_aperture(struct resource *res);
+int mvebu_mbus_get_dram_win_info(phys_addr_t phyaddr, u8 *target, u8 *attr);
+int mvebu_mbus_get_io_win_info(phys_addr_t phyaddr, u32 *size, u8 *target,
+			       u8 *attr);
 int mvebu_mbus_add_window_remap_by_id(unsigned int target,
 				      unsigned int attribute,
 				      phys_addr_t base, size_t size,
-- 
cgit v1.2.3


From 8cb2d8bf57e6e004c37db2fb4ce74f4d032b7cd0 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Mon, 14 Mar 2016 09:39:04 +0100
Subject: net: add a hardware buffer management helper API

This basic implementation allows code to be shared between drivers using
hardware buffer management. As the code is hardware agnostic, there are
few helpers; most of the optimization brought by a HW BM has to be
done at driver level.

Tested-by: Sebastian Careba <nitroshift@yahoo.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/hwbm.h | 28 ++++++++++++++++++
 net/Kconfig        |  3 ++
 net/core/Makefile  |  1 +
 net/core/hwbm.c    | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 include/net/hwbm.h
 create mode 100644 net/core/hwbm.c

(limited to 'include')

diff --git a/include/net/hwbm.h b/include/net/hwbm.h
new file mode 100644
index 000000000000..47d08662501b
--- /dev/null
+++ b/include/net/hwbm.h
@@ -0,0 +1,28 @@
+#ifndef _HWBM_H
+#define _HWBM_H
+
+struct hwbm_pool {
+	/* Capacity of the pool */
+	int size;
+	/* Size of the buffers managed */
+	int frag_size;
+	/* Number of buffers currently used by this pool */
+	int buf_num;
+	/* constructor called during allocation */
+	int (*construct)(struct hwbm_pool *bm_pool, void *buf);
+	/* protect access to the buffer counter */
+	spinlock_t lock;
+	/* private data */
+	void *priv;
+};
+#ifdef CONFIG_HWBM
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf);
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp);
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp);
+#else /* !CONFIG_HWBM: no-op stubs; static inline avoids duplicate symbols */
+static inline void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {}
+static inline int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; }
+static inline int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{ return 0; }
+#endif /* CONFIG_HWBM */
+#endif /* _HWBM_H */
diff --git a/net/Kconfig b/net/Kconfig
index 10640d5f8bee..e13449870d06 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -253,6 +253,9 @@ config XPS
 	depends on SMP
 	default y
 
+config HWBM
+       bool
+
 config SOCK_CGROUP_DATA
 	bool
 	default n
diff --git a/net/core/Makefile b/net/core/Makefile
index 014422e2561f..d6508c2ddca5 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,4 +25,5 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000000..941c28486896
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,87 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+	if (unlikely(bm_pool->frag_size > PAGE_SIZE))
+		kfree(buf);		/* oversized buffer: kmalloc()ed */
+	else
+		skb_free_frag(buf);	/* page-fragment buffer */
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Allocate one buffer of bm_pool->frag_size bytes and pass it to the pool's
+ * construct() hook, if any.  Returns 0 on success or -ENOMEM on failure.
+ */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+	const int len = bm_pool->frag_size;
+	void *buf;
+
+	if (unlikely(len > PAGE_SIZE))
+		buf = kmalloc(len, gfp);
+	else
+		buf = netdev_alloc_frag(len);
+	if (!buf)
+		return -ENOMEM;
+
+	if (bm_pool->construct && bm_pool->construct(bm_pool, buf)) {
+		hwbm_buf_free(bm_pool, buf);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+/* Add up to buf_num buffers; returns the number added, 0 when the request
+ * does not fit, or the current buffer count when the pool is already full.
+ */
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+	unsigned long flags;
+	int i, ret = 0;
+
+	spin_lock_irqsave(&bm_pool->lock, flags);
+	if (bm_pool->buf_num == bm_pool->size) {
+		pr_warn("pool already filled\n");
+		ret = bm_pool->buf_num;
+		goto unlock;
+	}
+
+	if (buf_num + bm_pool->buf_num > bm_pool->size) {
+		pr_warn("cannot allocate %d buffers for pool\n", buf_num);
+		goto unlock;
+	}
+	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+			buf_num,  bm_pool->buf_num);
+		goto unlock;
+	}
+
+	for (i = 0; i < buf_num; i++)
+		if (hwbm_pool_refill(bm_pool, gfp) < 0)
+			break;
+
+	/* Update BM driver with number of buffers added to pool */
+	bm_pool->buf_num += i;
+	ret = i;
+	pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
+unlock:
+	spin_unlock_irqrestore(&bm_pool->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
-- 
cgit v1.2.3


From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 14 Mar 2016 10:52:15 -0700
Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In

Per RFC4898, they count segments sent/received
containing a positive length data segment (that includes
retransmission segments carrying data).  Unlike
tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments
carrying no data (e.g. pure ack).

The patch also updates the segs_in in tcp_fastopen_add_skb()
so that segs_in >= data_segs_in property is kept.

Together with retransmission data, tcpi_data_segs_out
gives a better signal on the rxmit rate.

v6: Rebase on the latest net-next

v5: Eric pointed out that checking skb->len is still needed in
tcp_fastopen_add_skb() because skb can carry a FIN without data.
Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in()
helper is used.  Comment is added to the fastopen case to explain why
segs_in has to be reset and tcp_segs_in() has to be called before
__skb_pull().

v4: Add comment to the changes in tcp_fastopen_add_skb()
and also add remark on this case in the commit message.

v3: Add const modifier to the skb parameter in tcp_segs_in()

v2: Rework based on recent fix by Eric:
commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment")

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <mleitner@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  6 ++++++
 include/net/tcp.h        | 10 ++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_fastopen.c  |  8 ++++++++
 net/ipv4/tcp_ipv4.c      |  2 +-
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    |  4 +++-
 net/ipv6/tcp_ipv6.c      |  2 +-
 9 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcbf51da4e1e..7be9b1242354 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,9 @@ struct tcp_sock {
 	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
 				 * total number of segments in.
 				 */
+	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
+				 * total number of data segments in.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
@@ -165,6 +168,9 @@ struct tcp_sock {
 	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
 				 * The total number of segments sent.
 				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0302636af98c..c8dbd293daae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk)
 	return answ;
 }
 
+/* Count skb in segs_in, and in data_segs_in too when it carries payload. */
+static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	const u16 nsegs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+	tp->segs_in += nsegs;
+	if (skb->len > tcp_hdrlen(skb))
+		tp->data_segs_in += nsegs;
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fe95446e9abf..53e8e3fe6b1b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -199,6 +199,8 @@ struct tcp_info {
 
 	__u32	tcpi_notsent_bytes;
 	__u32	tcpi_min_rtt;
+	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a265f00b9df9..992b3103ec3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2715,6 +2715,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_notsent_bytes = max(0, notsent_bytes);
 
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
+	info->tcpi_data_segs_in = tp->data_segs_in;
+	info->tcpi_data_segs_out = tp->data_segs_out;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fdb286ddba04..4fc0061bebf4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -140,6 +140,14 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 		return;
 
 	skb_dst_drop(skb);
+	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
+	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
+	 * to avoid double counting.  Also, tcp_segs_in() expects
+	 * skb->len to include the tcp_hdrlen.  Hence, it should
+	 * be called before __skb_pull().
+	 */
+	tp->segs_in = 0;
+	tcp_segs_in(tp, skb);
 	__skb_pull(skb, tcp_hdrlen(skb));
 	skb_set_owner_r(skb, sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4c8d58dfac9b..0b02ef773705 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1650,7 +1650,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ae90e4b34bd3..acb366dd61e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -812,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
-	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
 		/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2c7a400456..7d2dc015cd19 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1003,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-	if (skb->len != tcp_header_size)
+	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
+		tp->data_segs_out += tcp_skb_pcount(skb);
+	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33f2820181f9..9c16565b70cc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1443,7 +1443,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
-- 
cgit v1.2.3


From 5bcbe0f35fb13e31fdd0b2dc9695f19ab0208207 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 12 Mar 2016 00:01:40 +0100
Subject: phy: fixed: Fix removal of phys.

The fixed phys delete function simply removed the fixed phy from the
internal linked list and freed the memory. It however did not
unregister the associated phy device. This meant it was still possible
to find the phy device on the mdio bus.

Make fixed_phy_del() an internal function and add a
fixed_phy_unregister() which unregisters the phy device and then uses
fixed_phy_del() to free resources.

Modify DSA to use this new API function, so we don't leak phys.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/fixed_phy.c | 11 +++++++++--
 include/linux/phy_fixed.h   |  5 ++---
 net/dsa/dsa.c               |  4 +---
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index ab9c473d75ea..fc07a8866020 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -285,7 +285,7 @@ err_regs:
 }
 EXPORT_SYMBOL_GPL(fixed_phy_add);
 
-void fixed_phy_del(int phy_addr)
+static void fixed_phy_del(int phy_addr)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
 	struct fixed_phy *fp, *tmp;
@@ -300,7 +300,6 @@ void fixed_phy_del(int phy_addr)
 		}
 	}
 }
-EXPORT_SYMBOL_GPL(fixed_phy_del);
 
 static int phy_fixed_addr;
 static DEFINE_SPINLOCK(phy_fixed_addr_lock);
@@ -371,6 +370,14 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 }
 EXPORT_SYMBOL_GPL(fixed_phy_register);
 
+void fixed_phy_unregister(struct phy_device *phy)
+{
+	phy_device_remove(phy);
+	/* Device is unregistered first; fixed_phy_del() then frees the entry */
+	fixed_phy_del(phy->mdio.addr);
+}
+EXPORT_SYMBOL_GPL(fixed_phy_unregister);
+
 static int __init fixed_mdio_bus_init(void)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 2400d2ea4f34..1d41ec44e39d 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -19,7 +19,7 @@ extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
 					     int link_gpio,
 					     struct device_node *np);
-extern void fixed_phy_del(int phy_addr);
+extern void fixed_phy_unregister(struct phy_device *phydev);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
 					   struct fixed_phy_status *));
@@ -40,9 +40,8 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq,
 {
 	return ERR_PTR(-ENODEV);
 }
-static inline int fixed_phy_del(int phy_addr)
+static inline void fixed_phy_unregister(struct phy_device *phydev)
 {
-	return -ENODEV;
 }
 static inline int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index f100f340d93f..c28c47463b7e 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -447,11 +447,9 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		if (of_phy_is_fixed_link(port_dn)) {
 			phydev = of_phy_find_device(port_dn);
 			if (phydev) {
-				int addr = phydev->mdio.addr;
-
 				phy_device_free(phydev);
 				of_node_put(port_dn);
-				fixed_phy_del(addr);
+				fixed_phy_unregister(phydev);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 71327a4e7d997276d49db92fd3d30008389ee6d5 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 13 Mar 2016 16:21:32 -0400
Subject: net: dsa: rename port_*_bridge routines

Rename DSA port_join_bridge and port_leave_bridge routines to
respectively port_bridge_join and port_bridge_leave in order to respect
an implicit Port::Bridge namespace.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt | 4 ++--
 drivers/net/dsa/bcm_sf2.c            | 4 ++--
 drivers/net/dsa/mv88e6171.c          | 4 ++--
 drivers/net/dsa/mv88e6352.c          | 4 ++--
 include/net/dsa.h                    | 4 ++--
 net/dsa/slave.c                      | 8 ++++----
 6 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 974e9c387d1e..3b196c304b73 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -521,12 +521,12 @@ See Documentation/hwmon/sysfs-interface for details.
 Bridge layer
 ------------
 
-- port_join_bridge: bridge layer function invoked when a given switch port is
+- port_bridge_join: bridge layer function invoked when a given switch port is
   added to a bridge, this function should be doing the necessary at the switch
   level to permit the joining port from being added to the relevant logical
   domain for it to ingress/egress traffic with other members of the bridge.
 
-- port_leave_bridge: bridge layer function invoked when a given switch port is
+- port_bridge_leave: bridge layer function invoked when a given switch port is
   removed from a bridge, this function should be doing the necessary at the
   switch level to deny the leaving port from ingress/egress traffic from the
   remaining bridge members. When the port leaves the bridge, it should be aged
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 3f627598f277..4bcc9ebf5e06 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1387,8 +1387,8 @@ static struct dsa_switch_driver bcm_sf2_switch_driver = {
 	.port_disable		= bcm_sf2_port_disable,
 	.get_eee		= bcm_sf2_sw_get_eee,
 	.set_eee		= bcm_sf2_sw_set_eee,
-	.port_join_bridge	= bcm_sf2_sw_br_join,
-	.port_leave_bridge	= bcm_sf2_sw_br_leave,
+	.port_bridge_join	= bcm_sf2_sw_br_join,
+	.port_bridge_leave	= bcm_sf2_sw_br_leave,
 	.port_stp_update	= bcm_sf2_sw_br_set_stp_state,
 	.port_fdb_prepare	= bcm_sf2_sw_fdb_prepare,
 	.port_fdb_add		= bcm_sf2_sw_fdb_add,
diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index d72ccbdf53ec..c0164b98fc08 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -103,8 +103,8 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 #endif
 	.get_regs_len		= mv88e6xxx_get_regs_len,
 	.get_regs		= mv88e6xxx_get_regs,
-	.port_join_bridge	= mv88e6xxx_port_bridge_join,
-	.port_leave_bridge	= mv88e6xxx_port_bridge_leave,
+	.port_bridge_join	= mv88e6xxx_port_bridge_join,
+	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
 	.port_stp_update        = mv88e6xxx_port_stp_update,
 	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
 	.port_vlan_prepare	= mv88e6xxx_port_vlan_prepare,
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index a41fa5043d77..5f528abc8af1 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -324,8 +324,8 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
 	.set_eeprom		= mv88e6352_set_eeprom,
 	.get_regs_len		= mv88e6xxx_get_regs_len,
 	.get_regs		= mv88e6xxx_get_regs,
-	.port_join_bridge	= mv88e6xxx_port_bridge_join,
-	.port_leave_bridge	= mv88e6xxx_port_bridge_leave,
+	.port_bridge_join	= mv88e6xxx_port_bridge_join,
+	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
 	.port_stp_update	= mv88e6xxx_port_stp_update,
 	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
 	.port_vlan_prepare	= mv88e6xxx_port_vlan_prepare,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 26c0a3fa009a..004e034184c1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,9 +296,9 @@ struct dsa_switch_driver {
 	/*
 	 * Bridge integration
 	 */
-	int	(*port_join_bridge)(struct dsa_switch *ds, int port,
+	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
-	int	(*port_leave_bridge)(struct dsa_switch *ds, int port);
+	int	(*port_bridge_leave)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 49056d90b179..52653d715f64 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -448,8 +448,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 
 	p->bridge_dev = br;
 
-	if (ds->drv->port_join_bridge)
-		ret = ds->drv->port_join_bridge(ds, p->port, br);
+	if (ds->drv->port_bridge_join)
+		ret = ds->drv->port_bridge_join(ds, p->port, br);
 
 	return ret;
 }
@@ -461,8 +461,8 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 	int ret = -EOPNOTSUPP;
 
 
-	if (ds->drv->port_leave_bridge)
-		ret = ds->drv->port_leave_bridge(ds, p->port);
+	if (ds->drv->port_bridge_leave)
+		ret = ds->drv->port_bridge_leave(ds, p->port);
 
 	p->bridge_dev = NULL;
 
-- 
cgit v1.2.3


From 16bfa7024eba5e36aff38ba62086b9027373007d Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 13 Mar 2016 16:21:33 -0400
Subject: net: dsa: make port_bridge_leave return void

netdev_upper_dev_unlink() which notifies NETDEV_CHANGEUPPER, returns
void, as well as del_nbp(). So there's no advantage to catch an eventual
error from the port_bridge_leave routine at the DSA level.

Make this routine void for the DSA layer and its existing drivers.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c   |  4 +---
 drivers/net/dsa/mv88e6xxx.c | 28 +++++++++-------------------
 drivers/net/dsa/mv88e6xxx.h |  2 +-
 include/net/dsa.h           |  2 +-
 net/dsa/slave.c             |  9 +++------
 5 files changed, 15 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 4bcc9ebf5e06..95944d5e3e22 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -516,7 +516,7 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
 	return 0;
 }
 
-static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
+static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
 	struct net_device *bridge = priv->port_sts[port].bridge_dev;
@@ -543,8 +543,6 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 	core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(port));
 	priv->port_sts[port].vlan_ctl_mask = p_ctl;
 	priv->port_sts[port].bridge_dev = NULL;
-
-	return 0;
 }
 
 static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 5309c738ff00..fa086e09d6b7 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2219,39 +2219,29 @@ unlock:
 	return err;
 }
 
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 	struct net_device *bridge = ps->ports[port].bridge_dev;
 	u16 fid;
-	int i, err;
+	int i;
 
 	mutex_lock(&ps->smi_mutex);
 
 	/* Give the port a fresh Filtering Information Database */
-	err = _mv88e6xxx_fid_new(ds, &fid);
-	if (err)
-		goto unlock;
-
-	err = _mv88e6xxx_port_fid_set(ds, port, fid);
-	if (err)
-		goto unlock;
+	if (_mv88e6xxx_fid_new(ds, &fid) ||
+	    _mv88e6xxx_port_fid_set(ds, port, fid))
+		netdev_warn(ds->ports[port], "failed to assign a new FID\n");
 
 	/* Unassign the bridge and remap each port's VLANTable */
 	ps->ports[port].bridge_dev = NULL;
 
-	for (i = 0; i < ps->num_ports; ++i) {
-		if (i == port || ps->ports[i].bridge_dev == bridge) {
-			err = _mv88e6xxx_port_based_vlan_map(ds, i);
-			if (err)
-				break;
-		}
-	}
+	for (i = 0; i < ps->num_ports; ++i)
+		if (i == port || ps->ports[i].bridge_dev == bridge)
+			if (_mv88e6xxx_port_based_vlan_map(ds, i))
+				netdev_warn(ds->ports[i], "failed to remap\n");
 
-unlock:
 	mutex_unlock(&ps->smi_mutex);
-
-	return err;
 }
 
 static void mv88e6xxx_bridge_work(struct work_struct *work)
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 281cefe86afd..9a038aba48fb 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -488,7 +488,7 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
 		      struct phy_device *phydev, struct ethtool_eee *e);
 int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 			       struct net_device *bridge);
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
 int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 				  bool vlan_filtering);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 004e034184c1..6463bb2863ac 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -298,7 +298,7 @@ struct dsa_switch_driver {
 	 */
 	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
-	int	(*port_bridge_leave)(struct dsa_switch *ds, int port);
+	void	(*port_bridge_leave)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 52653d715f64..8e00f1d83eb8 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -454,15 +454,14 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	return ret;
 }
 
-static int dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	int ret = -EOPNOTSUPP;
 
 
 	if (ds->drv->port_bridge_leave)
-		ret = ds->drv->port_bridge_leave(ds, p->port);
+		ds->drv->port_bridge_leave(ds, p->port);
 
 	p->bridge_dev = NULL;
 
@@ -470,8 +469,6 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
 	dsa_slave_stp_update(dev, BR_STATE_FORWARDING);
-
-	return ret;
 }
 
 static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -1156,7 +1153,7 @@ static int dsa_slave_master_changed(struct net_device *dev)
 	    !strcmp(master->rtnl_link_ops->kind, "bridge"))
 		err = dsa_slave_bridge_port_join(dev, master);
 	else if (dsa_port_is_bridged(p))
-		err = dsa_slave_bridge_port_leave(dev);
+		dsa_slave_bridge_port_leave(dev);
 
 	return err;
 }
-- 
cgit v1.2.3


From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:16 -0800
Subject: netfilter: Remove IP_CT_NEW_REPLY definition.

Remove the definition of IP_CT_NEW_REPLY from the kernel as it does
not make sense.  This allows the definition of IP_CT_NUMBER to be
simplified as well.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++---
 net/openvswitch/conntrack.c                        |  2 --
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f47128db8..6d074d14ee27 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
 
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No NEW in reply direction. */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER,
+
+	/* only for userspace compatibility */
+#ifndef __KERNEL__
+	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..304529015744 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -75,7 +75,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 	switch (ctinfo) {
 	case IP_CT_ESTABLISHED_REPLY:
 	case IP_CT_RELATED_REPLY:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_REPLY_DIR;
 		break;
 	default:
@@ -92,7 +91,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_RELATED;
 		break;
 	case IP_CT_NEW:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_NEW;
 		break;
 	default:
-- 
cgit v1.2.3


From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:23 -0800
Subject: openvswitch: Interface with NAT.

Extend OVS conntrack interface to cover NAT.  New nested
OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action.
A bare OVS_CT_ATTR_NAT only mangles existing and expected connections.
If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.

The corresponding OVS userspace patch series includes test cases (in
tests/system-traffic.at) that also serve as example uses.

This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/openvswitch.h |  49 ++++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 524 +++++++++++++++++++++++++++++++++++++--
 net/openvswitch/conntrack.h      |   3 +-
 4 files changed, 551 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a27222d5b413..616d04761730 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					 * mangled by NAT.
+					 */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					 * was mangled by NAT.
+					 */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -641,11 +651,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABELS,     /* labels to associate with this connection. */
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
 /**
  * enum ovs_action_attr - Action types.
  *
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d728a7..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
 	tristate "Open vSwitch"
 	depends on INET
 	depends on !NF_CONNTRACK || \
-		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+				     (!NF_NAT || NF_NAT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f718b724e650..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
 
 #include <linux/module.h>
 #include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
 #include <net/ip.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_netlink.h"
 
 struct ovs_ct_len_tbl {
-	size_t maxlen;
-	size_t minlen;
+	int maxlen;
+	int minlen;
 };
 
 /* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
 	struct ovs_key_ct_labels mask;
 };
 
+enum ovs_ct_nat {
+	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
+	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
 	struct nf_conntrack_helper *helper;
 	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	u8 commit : 1;
+	u8 nat : 3;                 /* enum ovs_ct_nat */
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+#endif
 };
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_labels(ct, &key->ct.labels);
 }
 
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are
+ * initialized from the connection status.
  */
 static void ovs_ct_update_key(const struct sk_buff *skb,
 			      const struct ovs_conntrack_info *info,
-			      struct sw_flow_key *key, bool post_ct)
+			      struct sw_flow_key *key, bool post_ct,
+			      bool keep_nat_flags)
 {
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
@@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		 */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
+		if (keep_nat_flags) {
+			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+		} else {
+			if (ct->status & IPS_SRC_NAT)
+				state |= OVS_CS_F_SRC_NAT;
+			if (ct->status & IPS_DST_NAT)
+				state |= OVS_CS_F_DST_NAT;
+		}
 		zone = nf_ct_zone(ct);
 	} else if (post_ct) {
 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
  */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
-	ovs_ct_update_key(skb, NULL, key, false);
+	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 	enum ip_conntrack_info ctinfo;
 	unsigned int protoff;
 	struct nf_conn *ct;
+	int err;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 		return NF_DROP;
 	}
 
-	return helper->help(skb, protoff, ct, ctinfo);
+	err = helper->help(skb, protoff, ct, ctinfo);
+	if (err != NF_ACCEPT)
+		return err;
+
+	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+	 * FTP with NAT) adjusting the TCP payload size when mangling IP
+	 * addresses and/or port numbers in the text-based control connection.
+	 */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+	return NF_ACCEPT;
 }
 
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net,
 	return true;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype)
+{
+	int hooknum, nh_off, err = NF_ACCEPT;
+
+	nh_off = skb_network_offset(skb);
+	skb_pull(skb, nh_off);
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto push;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto push;
+			}
+#endif
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping.
+				 */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+			if (err != NF_ACCEPT)
+				goto push;
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto push;
+	}
+
+	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+	skb_push(skb, nh_off);
+
+	return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+			       const struct sk_buff *skb,
+			       enum nf_nat_manip_type maniptype)
+{
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		__be16 src;
+
+		key->ct.state |= OVS_CS_F_SRC_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.src = ip_hdr(skb)->saddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+			       sizeof(key->ipv6.addr.src));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			src = udp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_TCP)
+			src = tcp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			src = sctp_hdr(skb)->source;
+		else
+			return;
+
+		key->tp.src = src;
+	} else {
+		__be16 dst;
+
+		key->ct.state |= OVS_CS_F_DST_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+			       sizeof(key->ipv6.addr.dst));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			dst = udp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_TCP)
+			dst = tcp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			dst = sctp_hdr(skb)->dest;
+		else
+			return;
+
+		key->tp.dst = dst;
+	}
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum nf_nat_manip_type maniptype;
+	int err;
+
+	if (nf_ct_is_untracked(ct)) {
+		/* A NAT action may only be performed on tracked packets. */
+		return NF_ACCEPT;
+	}
+
+	/* Add NAT extension if not confirmed yet. */
+	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+		return NF_ACCEPT;   /* Can't NAT. */
+
+	/* Determine NAT type.
+	 * Check if the NAT type can be deduced from the tracked connection.
+	 * Make sure expected traffic is NATted only when committing.
+	 */
+	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+	    ct->status & IPS_NAT_MASK &&
+	    (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT.
+			 */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	} else if (info->nat & OVS_CT_SRC_NAT) {
+		maniptype = NF_NAT_MANIP_SRC;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		maniptype = NF_NAT_MANIP_DST;
+	} else {
+		return NF_ACCEPT; /* Connection is not NATed. */
+	}
+	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+	/* Mark NAT done if successful and update the flow key. */
+	if (err == NF_ACCEPT)
+		ovs_nat_update_key(key, skb, maniptype);
+
+	return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	return NF_ACCEPT;
+}
+#endif
+
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
@@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
-		ovs_ct_update_key(skb, info, key, true);
+		/* Clear CT state NAT flags to mark that we have not yet done
+		 * NAT after the nf_conntrack_in() call.  We can actually clear
+		 * the whole state, as it will be re-initialized below.
+		 */
+		key->ct.state = 0;
+
+		/* Update the key, but keep the NAT flags. */
+		ovs_ct_update_key(skb, info, key, true, true);
 	}
 
-	/* Call the helper only if:
-	 * - nf_conntrack_in() was executed above ("!cached") for a confirmed
-	 *   connection, or
-	 * - When committing an unconfirmed connection.
-	 */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
-	    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-		WARN_ONCE(1, "helper rejected packet");
-		return -EINVAL;
+	if (ct) {
+		/* Packets starting a new connection must be NATted before the
+		 * helper, so that the helper knows about the NAT.  We enforce
+		 * this by delaying both NAT and helper calls for unconfirmed
+		 * connections until the committing CT action.  For later
+		 * packets NAT and Helper may be called in either order.
+		 *
+		 * NAT will be done only if the CT action has NAT, and only
+		 * once per packet (per zone), as guarded by the NAT bits in
+		 * the key->ct.state.
+		 */
+		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		    (nf_ct_is_confirmed(ct) || info->commit) &&
+		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+			return -EINVAL;
+		}
+
+		/* Call the helper only if:
+		 * - nf_conntrack_in() was executed above ("!cached") for a
+		 *   confirmed connection, or
+		 * - When committing an unconfirmed connection.
+		 */
+		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (exp) {
 		u8 state;
 
+		/* NOTE: New connections are NATted and Helped only when
+		 * committed, so we are not calling into NAT here.
+		 */
 		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
 		__ovs_ct_update_key(key, state, &info->zone, exp->master);
-	} else {
-		int err;
-
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
-	}
+	} else
+		return __ovs_ct_lookup(net, key, info, skb);
 
 	return 0;
 }
@@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+		     struct ovs_conntrack_info *info, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool have_ip_max = false;
+	bool have_proto_max = false;
+	bool ip_vers = (info->family == NFPROTO_IPV6);
+
+	nla_for_each_nested(a, attr, rem) {
+		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+			[OVS_NAT_ATTR_SRC] = {0, 0},
+			[OVS_NAT_ATTR_DST] = {0, 0},
+			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+		};
+		int type = nla_type(a);
+
+		if (type > OVS_NAT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown NAT attribute (type=%d, max=%d).\n",
+				  type, OVS_NAT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+			OVS_NLERR(log,
+				  "NAT attribute type %d has unexpected length (%d != %d).\n",
+				  type, nla_len(a),
+				  ovs_nat_attr_lens[type][ip_vers]);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NAT_ATTR_SRC:
+		case OVS_NAT_ATTR_DST:
+			if (info->nat) {
+				OVS_NLERR(log,
+					  "Only one type of NAT may be specified.\n"
+					  );
+				return -ERANGE;
+			}
+			info->nat |= OVS_CT_NAT;
+			info->nat |= ((type == OVS_NAT_ATTR_SRC)
+					? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+			break;
+
+		case OVS_NAT_ATTR_IP_MIN:
+			nla_memcpy(&info->range.min_addr, a, nla_len(a));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_IP_MAX:
+			have_ip_max = true;
+			nla_memcpy(&info->range.max_addr, a,
+				   sizeof(info->range.max_addr));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MIN:
+			info->range.min_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MAX:
+			have_proto_max = true;
+			info->range.max_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PERSISTENT:
+			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_HASH:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_RANDOM:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+			break;
+
+		default:
+			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+		return -EINVAL;
+	}
+	if (!info->nat) {
+		/* Do not allow flags if no type is given. */
+		if (info->range.flags) {
+			OVS_NLERR(log,
+				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+				  );
+			return -EINVAL;
+		}
+		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
+	} else if (!info->commit) {
+		OVS_NLERR(log,
+			  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+			  );
+		return -EINVAL;
+	}
+	/* Allow missing IP_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+		memcpy(&info->range.max_addr, &info->range.min_addr,
+		       sizeof(info->range.max_addr));
+	}
+	/* Allow missing PROTO_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    !have_proto_max) {
+		info->range.max_proto.all = info->range.min_proto.all;
+	}
+	return 0;
+}
+#endif
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
@@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
 				    .maxlen = sizeof(struct md_labels) },
 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
-				    .maxlen = NF_CT_HELPER_NAME_LEN }
+				    .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* NAT length is checked when parsing the nested attributes. */
+	[OVS_CT_ATTR_NAT]	= { .minlen = 0, .maxlen = INT_MAX },
+#endif
 };
 
 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 				return -EINVAL;
 			}
 			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		case OVS_CT_ATTR_NAT: {
+			int err = parse_nat(a, info, log);
+
+			if (err)
+				return err;
+			break;
+		}
+#endif
 		default:
 			OVS_NLERR(log, "Unknown conntrack attr (%d)",
 				  type);
@@ -816,6 +1217,74 @@ err_free_ct:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+			       struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+	if (!start)
+		return false;
+
+	if (info->nat & OVS_CT_SRC_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+			return false;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+			return false;
+	} else {
+		goto out;
+	}
+
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+		if (info->family == NFPROTO_IPV4) {
+			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					    info->range.min_addr.ip) ||
+			    (info->range.max_addr.ip
+			     != info->range.min_addr.ip &&
+			     (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					      info->range.max_addr.ip))))
+				return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (info->family == NFPROTO_IPV6) {
+			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					     &info->range.min_addr.in6) ||
+			    (memcmp(&info->range.max_addr.in6,
+				    &info->range.min_addr.in6,
+				    sizeof(info->range.max_addr.in6)) &&
+			     (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					       &info->range.max_addr.in6))))
+				return false;
+#endif
+		} else {
+			return false;
+		}
+	}
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+			 ntohs(info->range.min_proto.all)) ||
+	     (info->range.max_proto.all != info->range.min_proto.all &&
+	      nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+			  ntohs(info->range.max_proto.all)))))
+		return false;
+
+	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+		return false;
+out:
+	nla_nest_end(skb, start);
+
+	return true;
+}
+#endif
+
 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 			  struct sk_buff *skb)
 {
@@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 				   ct_info->helper->name))
 			return -EMSGSIZE;
 	}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+		return -EMSGSIZE;
+#endif
 	nla_nest_end(skb, start);
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
 			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+			   OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 #else
 #include <linux/errno.h>
 
-- 
cgit v1.2.3


From fb781c8e2a370d67acf7b8a8826e6f5e3ae1d7c6 Mon Sep 17 00:00:00 2001
From: Xing Zheng <zhengxing@rock-chips.com>
Date: Mon, 14 Mar 2016 16:01:56 +0800
Subject: clk: rockchip: add node-id for rk3036 emac hclk

Add the node-id for the emac hclk to the binding header.

Signed-off-by: Xing Zheng <zhengxing@rock-chips.com>
Signed-off-by: Caesar Wang <wxt@rock-chips.com>
Cc: Xing Zheng <zhengxing@rock-chips.com>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: linux-clk@vger.kernel.org
Cc: linux-rockchip@lists.infradead.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/clock/rk3036-cru.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3036-cru.h b/include/dt-bindings/clock/rk3036-cru.h
index ebc7a7b43f52..339659115695 100644
--- a/include/dt-bindings/clock/rk3036-cru.h
+++ b/include/dt-bindings/clock/rk3036-cru.h
@@ -92,6 +92,7 @@
 #define HCLK_SDMMC		456
 #define HCLK_SDIO		457
 #define HCLK_EMMC		459
+#define HCLK_MAC		460
 #define HCLK_I2S		462
 #define HCLK_LCDC		465
 #define HCLK_ROM		467
-- 
cgit v1.2.3


From f7e180222b973a0b363564b281a314276cb2b594 Mon Sep 17 00:00:00 2001
From: Xing Zheng <zhengxing@rock-chips.com>
Date: Mon, 14 Mar 2016 16:01:58 +0800
Subject: clk: rockchip: add clock-id for rk3036 emac pll source clock

Suitable PLLs for the emac on the rk3036 are difficult to find, and
one of the candidates is the (continuously changing) APLL. In most
cases it will therefore be necessary to select a PLL manually, so add
a clock-id for the emac PLL source clock.

Signed-off-by: Xing Zheng <zhengxing@rock-chips.com>
Signed-off-by: Caesar Wang <wxt@rock-chips.com>
Cc: Xing Zheng <zhengxing@rock-chips.com>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: linux-clk@vger.kernel.org
Cc: linux-rockchip@lists.infradead.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/clock/rk3036-cru.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3036-cru.h b/include/dt-bindings/clock/rk3036-cru.h
index 339659115695..de44109a3a04 100644
--- a/include/dt-bindings/clock/rk3036-cru.h
+++ b/include/dt-bindings/clock/rk3036-cru.h
@@ -54,6 +54,7 @@
 #define SCLK_PVTM_VIDEO		125
 #define SCLK_MAC		151
 #define SCLK_MACREF		152
+#define SCLK_MACPLL		153
 #define SCLK_SFC		160
 
 /* aclk gates */
-- 
cgit v1.2.3


From 808c1b697c3c4dd2a7132882424c390b0d0acfb9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Mar 2016 01:42:50 +0100
Subject: bpf, dst: add and use dst_tclassid helper

We can just add a small helper dst_tclassid() for retrieving the
dst->tclassid value. It makes the code a bit better in that we can
get rid of the ifdef from filter.c by moving this into the header.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 12 ++++++++++++
 net/core/filter.c |  9 +--------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index c7329dcd90cc..5c98443c1c9e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -398,6 +398,18 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
 	__skb_tunnel_rx(skb, dev, net);
 }
 
+static inline u32 dst_tclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	const struct dst_entry *dst;
+
+	dst = skb_dst(skb);
+	if (dst)
+		return dst->tclassid;
+#endif
+	return 0;
+}
+
 int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 static inline int dst_discard(struct sk_buff *skb)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 69c7b2fecf44..4c35d8325c34 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1682,14 +1682,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
 
 static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-#ifdef CONFIG_IP_ROUTE_CLASSID
-	const struct dst_entry *dst;
-
-	dst = skb_dst((struct sk_buff *) (unsigned long) r1);
-	if (dst)
-		return dst->tclassid;
-#endif
-	return 0;
+	return dst_tclassid((struct sk_buff *) (unsigned long) r1);
 }
 
 static const struct bpf_func_proto bpf_get_route_realm_proto = {
-- 
cgit v1.2.3


From fca5fdf67de9e092fda23c9eb059ba968e7b5267 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Mar 2016 01:42:51 +0100
Subject: ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it

eBPF defines the maximum tunnel options length as BPF_TUNLEN_MAX, and OVS
just uses the hard-coded value (255) inside struct sw_flow_key. Thus, add
and use IP_TUNNEL_OPTS_MAX for this, which makes the code a bit more generic
and allows removing BPF_TUNLEN_MAX from the eBPF code.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  | 7 +++++++
 net/core/filter.c         | 9 ++-------
 net/ipv4/ip_tunnel_core.c | 6 ++++++
 net/openvswitch/flow.h    | 2 +-
 4 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5dc2e454f866..c35dda9ec991 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -7,6 +7,8 @@
 #include <linux/socket.h>
 #include <linux/types.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/bitops.h>
+
 #include <net/dsfield.h>
 #include <net/gro_cells.h>
 #include <net/inet_ecn.h>
@@ -57,6 +59,11 @@ struct ip_tunnel_key {
 #define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
 #define IP_TUNNEL_INFO_IPV6	0x02	/* key contains IPv6 addresses */
 
+/* Maximum tunnel options length. */
+#define IP_TUNNEL_OPTS_MAX					\
+	GENMASK((FIELD_SIZEOF(struct ip_tunnel_info,		\
+			      options_len) * BITS_PER_BYTE) - 1, 0)
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 #ifdef CONFIG_DST_CACHE
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c35d8325c34..b7177d01ecb0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1904,8 +1904,6 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-#define BPF_TUNLEN_MAX	255
-
 static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1915,7 +1913,7 @@ static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
 
 	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
 		return -EINVAL;
-	if (unlikely(size > BPF_TUNLEN_MAX))
+	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
 		return -ENOMEM;
 
 	ip_tunnel_info_opts_set(info, from, size);
@@ -1936,13 +1934,10 @@ static const struct bpf_func_proto *
 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
-					  options_len) != 1);
-
 		/* Race is not possible, since it's called from verifier
 		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
 						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index eaca2449a09a..d27276f6f8dd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -398,6 +398,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
 
 void __init ip_tunnel_core_init(void)
 {
+	/* If you land here, make sure whether increasing ip_tunnel_info's
+	 * options_len is a reasonable choice with its usage in front ends
+	 * (f.e., it's part of flow keys, etc).
+	 */
+	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
 	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
 	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1d055c559eaf..03378e75a67c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -55,7 +55,7 @@ struct ovs_tunnel_info {
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
 struct sw_flow_key {
-	u8 tun_opts[255];
+	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
 	struct ip_tunnel_key tun_key;	/* Encapsulating tunnel key. */
 	struct {
-- 
cgit v1.2.3


From 93e68cd6115f67d8363c94dae8206af36f6d3b00 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed, 16 Mar 2016 09:12:46 +0000
Subject: net: fix a comment typo

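Fix a comment typo: remove the duplicated word "always" from the
net_device flags description in include/uapi/linux/if.h.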

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 9cf2394f0bcf..f80277569f24 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -37,7 +37,7 @@
  * are shared for all types of net_devices. The sysfs entries are available
  * via /sys/class/net/<dev>/flags. Flags which can be toggled through sysfs
  * are annotated below, note that only a few flags can be toggled and some
- * other flags are always always preserved from the original net_device flags
+ * other flags are always preserved from the original net_device flags
  * even if you try to set them via sysfs. Flags which are always preserved
  * are kept under the flag grouping @IFF_VOLATILE. Flags which are volatile
  * are annotated below as such.
-- 
cgit v1.2.3


From fe30937b65354c7fec244caebbdaae68e28ca797 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 Mar 2016 17:23:36 -0700
Subject: bonding: fix bond_get_stats()

bond_get_stats() can be called from rtnetlink (with RTNL held)
or from the /proc/net/dev seq handler (with RCU held).

The logic added in commit 5f0c5f73e5ef ("bonding: make global bonding
stats more reliable") kind of assumed only one cpu could run there.

If multiple threads are reading /proc/net/dev, stats can be really
messed up after a while.

A second problem is that some fields are 32bit, so we need to properly
handle the wrap around problem.

Given that RTNL is not always held, we need to use
bond_for_each_slave_rcu().

Fixes: 5f0c5f73e5ef ("bonding: make global bonding stats more reliable")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Andy Gospodarek <gospo@cumulusnetworks.com>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 66 ++++++++++++++++++++++-------------------
 include/net/bonding.h           |  1 +
 2 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 270b39c8357f..941ec99cd3b6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3301,6 +3301,30 @@ static int bond_close(struct net_device *bond_dev)
 	return 0;
 }
 
+/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
+ * that some drivers can provide 32bit values only.
+ */
+static void bond_fold_stats(struct rtnl_link_stats64 *_res,
+			    const struct rtnl_link_stats64 *_new,
+			    const struct rtnl_link_stats64 *_old)
+{
+	const u64 *new = (const u64 *)_new;
+	const u64 *old = (const u64 *)_old;
+	u64 *res = (u64 *)_res;
+	int i;
+
+	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
+		u64 nv = new[i];
+		u64 ov = old[i];
+
+		/* detects if this particular field is 32bit only */
+		if (((nv | ov) >> 32) == 0)
+			res[i] += (u32)nv - (u32)ov;
+		else
+			res[i] += nv - ov;
+	}
+}
+
 static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
 						struct rtnl_link_stats64 *stats)
 {
@@ -3309,44 +3333,23 @@ static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
 	struct list_head *iter;
 	struct slave *slave;
 
+	spin_lock(&bond->stats_lock);
 	memcpy(stats, &bond->bond_stats, sizeof(*stats));
 
-	bond_for_each_slave(bond, slave, iter) {
-		const struct rtnl_link_stats64 *sstats =
+	rcu_read_lock();
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		const struct rtnl_link_stats64 *new =
 			dev_get_stats(slave->dev, &temp);
-		struct rtnl_link_stats64 *pstats = &slave->slave_stats;
-
-		stats->rx_packets +=  sstats->rx_packets - pstats->rx_packets;
-		stats->rx_bytes += sstats->rx_bytes - pstats->rx_bytes;
-		stats->rx_errors += sstats->rx_errors - pstats->rx_errors;
-		stats->rx_dropped += sstats->rx_dropped - pstats->rx_dropped;
-		stats->rx_nohandler += sstats->rx_nohandler - pstats->rx_nohandler;
-
-		stats->tx_packets += sstats->tx_packets - pstats->tx_packets;;
-		stats->tx_bytes += sstats->tx_bytes - pstats->tx_bytes;
-		stats->tx_errors += sstats->tx_errors - pstats->tx_errors;
-		stats->tx_dropped += sstats->tx_dropped - pstats->tx_dropped;
-
-		stats->multicast += sstats->multicast - pstats->multicast;
-		stats->collisions += sstats->collisions - pstats->collisions;
-
-		stats->rx_length_errors += sstats->rx_length_errors - pstats->rx_length_errors;
-		stats->rx_over_errors += sstats->rx_over_errors - pstats->rx_over_errors;
-		stats->rx_crc_errors += sstats->rx_crc_errors - pstats->rx_crc_errors;
-		stats->rx_frame_errors += sstats->rx_frame_errors - pstats->rx_frame_errors;
-		stats->rx_fifo_errors += sstats->rx_fifo_errors - pstats->rx_fifo_errors;
-		stats->rx_missed_errors += sstats->rx_missed_errors - pstats->rx_missed_errors;
-
-		stats->tx_aborted_errors += sstats->tx_aborted_errors - pstats->tx_aborted_errors;
-		stats->tx_carrier_errors += sstats->tx_carrier_errors - pstats->tx_carrier_errors;
-		stats->tx_fifo_errors += sstats->tx_fifo_errors - pstats->tx_fifo_errors;
-		stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors - pstats->tx_heartbeat_errors;
-		stats->tx_window_errors += sstats->tx_window_errors - pstats->tx_window_errors;
+
+		bond_fold_stats(stats, new, &slave->slave_stats);
 
 		/* save off the slave stats for the next run */
-		memcpy(pstats, sstats, sizeof(*sstats));
+		memcpy(&slave->slave_stats, new, sizeof(*new));
 	}
+	rcu_read_unlock();
+
 	memcpy(&bond->bond_stats, stats, sizeof(*stats));
+	spin_unlock(&bond->stats_lock);
 
 	return stats;
 }
@@ -4160,6 +4163,7 @@ void bond_setup(struct net_device *bond_dev)
 	struct bonding *bond = netdev_priv(bond_dev);
 
 	spin_lock_init(&bond->mode_lock);
+	spin_lock_init(&bond->stats_lock);
 	bond->params = bonding_defaults;
 
 	/* Initialize pointers */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index ee6c52053aa3..791800ddd6d9 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -215,6 +215,7 @@ struct bonding {
 	 * ALB mode (6) - to sync the use and modifications of its hash table
 	 */
 	spinlock_t mode_lock;
+	spinlock_t stats_lock;
 	u8	 send_peer_notif;
 	u8       igmp_retrans;
 #ifdef CONFIG_PROC_FS
-- 
cgit v1.2.3