From c8770bcf5cefa8cbfae21c07c4fe3428f5a9d42a Mon Sep 17 00:00:00 2001
From: Miaoqing Pan <miaoqing@codeaurora.org>
Date: Mon, 7 Mar 2016 10:38:18 +0800
Subject: ath9k: Allow platform override BTCoex pin

Add new platform data to allow override BTCoex default pin.

Signed-off-by: Miaoqing Pan <miaoqing@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 include/linux/ath9k_platform.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index 33eb274cd0e6..e66153d60bd5 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -31,6 +31,10 @@ struct ath9k_platform_data {
 	u32 gpio_mask;
 	u32 gpio_val;
 
+	u32 bt_active_pin;
+	u32 bt_priority_pin;
+	u32 wlan_active_pin;
+
 	bool endian_check;
 	bool is_clk_25mhz;
 	bool tx_gain_buffalo;
-- 
cgit v1.2.3


From 2da62906b1e298695e1bb725927041cd59942c98 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 14 Mar 2015 21:13:46 -0400
Subject: [net] drop 'size' argument of sock_recvmsg()

all callers have it equal to msg_data_left(msg).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/target/iscsi/iscsi_target_util.c |  5 ++---
 include/linux/net.h                      |  3 +--
 net/socket.c                             | 23 ++++++++++-------------
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 428b0d9e3dba..57720385a751 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1283,9 +1283,8 @@ static int iscsit_do_rx_data(
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC,
 		      count->iov, count->iov_count, data);
 
-	while (total_rx < data) {
-		rx_loop = sock_recvmsg(conn->sock, &msg,
-				      (data - total_rx), MSG_WAITALL);
+	while (msg_data_left(&msg)) {
+		rx_loop = sock_recvmsg(conn->sock, &msg, MSG_WAITALL);
 		if (rx_loop <= 0) {
 			pr_debug("rx_loop: %d total_rx: %d\n",
 				rx_loop, total_rx);
diff --git a/include/linux/net.h b/include/linux/net.h
index 49175e4ced11..72c1e0622ce2 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -218,8 +218,7 @@ int sock_create_lite(int family, int type, int proto, struct socket **res);
 struct socket *sock_alloc(void);
 void sock_release(struct socket *sock);
 int sock_sendmsg(struct socket *sock, struct msghdr *msg);
-int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-		 int flags);
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);
 struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname);
 struct socket *sockfd_lookup(int fd, int *err);
 struct socket *sock_from_file(struct file *file, int *err);
diff --git a/net/socket.c b/net/socket.c
index 5f77a8e93830..956426e347af 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -709,17 +709,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
-				     size_t size, int flags)
+				     int flags)
 {
-	return sock->ops->recvmsg(sock, msg, size, flags);
+	return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
 }
 
-int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-		 int flags)
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
 {
-	int err = security_socket_recvmsg(sock, msg, size, flags);
+	int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
 
-	return err ?: sock_recvmsg_nosec(sock, msg, size, flags);
+	return err ?: sock_recvmsg_nosec(sock, msg, flags);
 }
 EXPORT_SYMBOL(sock_recvmsg);
 
@@ -746,7 +745,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
 	set_fs(KERNEL_DS);
-	result = sock_recvmsg(sock, msg, size, flags);
+	result = sock_recvmsg(sock, msg, flags);
 	set_fs(oldfs);
 	return result;
 }
@@ -796,7 +795,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
 		return 0;
 
-	res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags);
+	res = sock_recvmsg(sock, &msg, msg.msg_flags);
 	*to = msg.msg_iter;
 	return res;
 }
@@ -1696,7 +1695,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 	msg.msg_iocb = NULL;
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
+	err = sock_recvmsg(sock, &msg, flags);
 
 	if (err >= 0 && addr != NULL) {
 		err2 = move_addr_to_user(&address,
@@ -2073,7 +2072,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	unsigned long cmsg_ptr;
-	int total_len, len;
+	int len;
 	ssize_t err;
 
 	/* kernel mode address */
@@ -2091,7 +2090,6 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 		err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
 	if (err < 0)
 		return err;
-	total_len = iov_iter_count(&msg_sys->msg_iter);
 
 	cmsg_ptr = (unsigned long)msg_sys->msg_control;
 	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
@@ -2101,8 +2099,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
-							  total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
-- 
cgit v1.2.3


From e8b123e6008480b2b8d80dae060315d84b79f4bb Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn@kryo.se>
Date: Thu, 24 Dec 2015 00:28:38 -0800
Subject: soc: qcom: smem_state: Add stubs for disabled smem_state

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 include/linux/soc/qcom/smem_state.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smem_state.h b/include/linux/soc/qcom/smem_state.h
index f35e1512fcaa..7b88697929e9 100644
--- a/include/linux/soc/qcom/smem_state.h
+++ b/include/linux/soc/qcom/smem_state.h
@@ -1,12 +1,17 @@
 #ifndef __QCOM_SMEM_STATE__
 #define __QCOM_SMEM_STATE__
 
+#include <linux/errno.h>
+
+struct device_node;
 struct qcom_smem_state;
 
 struct qcom_smem_state_ops {
 	int (*update_bits)(void *, u32, u32);
 };
 
+#ifdef CONFIG_QCOM_SMEM_STATE
+
 struct qcom_smem_state *qcom_smem_state_get(struct device *dev, const char *con_id, unsigned *bit);
 void qcom_smem_state_put(struct qcom_smem_state *);
 
@@ -15,4 +20,34 @@ int qcom_smem_state_update_bits(struct qcom_smem_state *state, u32 mask, u32 val
 struct qcom_smem_state *qcom_smem_state_register(struct device_node *of_node, const struct qcom_smem_state_ops *ops, void *data);
 void qcom_smem_state_unregister(struct qcom_smem_state *state);
 
+#else
+
+static inline struct qcom_smem_state *qcom_smem_state_get(struct device *dev,
+	const char *con_id, unsigned *bit)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void qcom_smem_state_put(struct qcom_smem_state *state)
+{
+}
+
+static inline int qcom_smem_state_update_bits(struct qcom_smem_state *state,
+	u32 mask, u32 value)
+{
+	return -EINVAL;
+}
+
+static inline struct qcom_smem_state *qcom_smem_state_register(struct device_node *of_node,
+	const struct qcom_smem_state_ops *ops, void *data)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void qcom_smem_state_unregister(struct qcom_smem_state *state)
+{
+}
+
+#endif
+
 #endif
-- 
cgit v1.2.3


From 39f0db298e7c02a29371fb39cabdd5d76e6b726c Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Wed, 17 Feb 2016 22:39:02 -0800
Subject: soc: qcom: smd: Introduce callback setter

Introduce a setter for the callback function pointer to clarify the
locking around the operation and to reduce some duplication.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd.c       | 25 +++++++++++++++++--------
 include/linux/soc/qcom/smd.h |  4 +++-
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index 498fd0581a45..c357842b92e1 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -186,7 +186,7 @@ struct qcom_smd_channel {
 	int fifo_size;
 
 	void *bounce_buffer;
-	int (*cb)(struct qcom_smd_device *, const void *, size_t);
+	qcom_smd_cb_t cb;
 
 	spinlock_t recv_lock;
 
@@ -377,6 +377,19 @@ static void qcom_smd_channel_reset(struct qcom_smd_channel *channel)
 	channel->pkt_size = 0;
 }
 
+/*
+ * Set the callback for a channel, with appropriate locking
+ */
+static void qcom_smd_channel_set_callback(struct qcom_smd_channel *channel,
+					  qcom_smd_cb_t cb)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&channel->recv_lock, flags);
+	channel->cb = cb;
+	spin_unlock_irqrestore(&channel->recv_lock, flags);
+};
+
 /*
  * Calculate the amount of data available in the rx fifo
  */
@@ -814,8 +827,7 @@ static int qcom_smd_dev_probe(struct device *dev)
 	if (!channel->bounce_buffer)
 		return -ENOMEM;
 
-	channel->cb = qsdrv->callback;
-
+	qcom_smd_channel_set_callback(channel, qsdrv->callback);
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENING);
 
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENED);
@@ -831,7 +843,7 @@ static int qcom_smd_dev_probe(struct device *dev)
 err:
 	dev_err(&qsdev->dev, "probe failed\n");
 
-	channel->cb = NULL;
+	qcom_smd_channel_set_callback(channel, NULL);
 	kfree(channel->bounce_buffer);
 	channel->bounce_buffer = NULL;
 
@@ -850,16 +862,13 @@ static int qcom_smd_dev_remove(struct device *dev)
 	struct qcom_smd_device *qsdev = to_smd_device(dev);
 	struct qcom_smd_driver *qsdrv = to_smd_driver(dev);
 	struct qcom_smd_channel *channel = qsdev->channel;
-	unsigned long flags;
 
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSING);
 
 	/*
 	 * Make sure we don't race with the code receiving data.
 	 */
-	spin_lock_irqsave(&channel->recv_lock, flags);
-	channel->cb = NULL;
-	spin_unlock_irqrestore(&channel->recv_lock, flags);
+	qcom_smd_channel_set_callback(channel, NULL);
 
 	/* Wake up any sleepers in qcom_smd_send() */
 	wake_up_interruptible(&channel->fblockread_event);
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index d0cb6d189a0a..65a64fcdb1aa 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -26,6 +26,8 @@ struct qcom_smd_device {
 	struct qcom_smd_channel *channel;
 };
 
+typedef int (*qcom_smd_cb_t)(struct qcom_smd_device *, const void *, size_t);
+
 /**
  * struct qcom_smd_driver - smd driver struct
  * @driver:	underlying device driver
@@ -42,7 +44,7 @@ struct qcom_smd_driver {
 
 	int (*probe)(struct qcom_smd_device *dev);
 	void (*remove)(struct qcom_smd_device *dev);
-	int (*callback)(struct qcom_smd_device *, const void *, size_t);
+	qcom_smd_cb_t callback;
 };
 
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
-- 
cgit v1.2.3


From 028021d29ea069390e1f60c6aa5b3511d218454b Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Wed, 17 Feb 2016 22:39:06 -0800
Subject: soc: qcom: smd: Support opening additional channels

With the qcom_smd_open_channel() API we allow SMD devices to open
additional SMD channels, to allow implementation of multi-channel SMD
devices - like Bluetooth.

Channels are opened from the same edge as the calling SMD device is tied
to.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd.c       | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smd.h |  4 +++
 2 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index c3fa0fd724f7..b6434c4be86a 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -129,6 +129,8 @@ struct qcom_smd_edge {
 
 	unsigned smem_available;
 
+	wait_queue_head_t new_channel_event;
+
 	struct work_struct scan_work;
 	struct work_struct state_work;
 };
@@ -1042,6 +1044,77 @@ void qcom_smd_driver_unregister(struct qcom_smd_driver *qsdrv)
 }
 EXPORT_SYMBOL(qcom_smd_driver_unregister);
 
+static struct qcom_smd_channel *
+qcom_smd_find_channel(struct qcom_smd_edge *edge, const char *name)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_channel *ret = NULL;
+	unsigned long flags;
+	unsigned state;
+
+	spin_lock_irqsave(&edge->channels_lock, flags);
+	list_for_each_entry(channel, &edge->channels, list) {
+		if (strcmp(channel->name, name))
+			continue;
+
+		state = GET_RX_CHANNEL_INFO(channel, state);
+		if (state != SMD_CHANNEL_OPENING &&
+		    state != SMD_CHANNEL_OPENED)
+			continue;
+
+		ret = channel;
+		break;
+	}
+	spin_unlock_irqrestore(&edge->channels_lock, flags);
+
+	return ret;
+}
+
+/**
+ * qcom_smd_open_channel() - claim additional channels on the same edge
+ * @sdev:	smd_device handle
+ * @name:	channel name
+ * @cb:		callback method to use for incoming data
+ *
+ * Returns a channel handle on success, or -EPROBE_DEFER if the channel isn't
+ * ready.
+ */
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+					       const char *name,
+					       qcom_smd_cb_t cb)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_edge *edge = sdev->channel->edge;
+	int ret;
+
+	/* Wait up to HZ for the channel to appear */
+	ret = wait_event_interruptible_timeout(edge->new_channel_event,
+			(channel = qcom_smd_find_channel(edge, name)) != NULL,
+			HZ);
+	if (!ret)
+		return ERR_PTR(-ETIMEDOUT);
+
+	if (channel->state != SMD_CHANNEL_CLOSED) {
+		dev_err(&sdev->dev, "channel %s is busy\n", channel->name);
+		return ERR_PTR(-EBUSY);
+	}
+
+	channel->qsdev = sdev;
+	ret = qcom_smd_channel_open(channel, cb);
+	if (ret) {
+		channel->qsdev = NULL;
+		return ERR_PTR(ret);
+	}
+
+	/*
+	 * Append the list of channel to the channels associated with the sdev
+	 */
+	list_add_tail(&channel->dev_list, &sdev->channel->dev_list);
+
+	return channel;
+}
+EXPORT_SYMBOL(qcom_smd_open_channel);
+
 /*
  * Allocate the qcom_smd_channel object for a newly found smd channel,
  * retrieving and validating the smem items involved.
@@ -1178,6 +1251,8 @@ static void qcom_channel_scan_worker(struct work_struct *work)
 
 			dev_dbg(smd->dev, "new channel found: '%s'\n", channel->name);
 			set_bit(i, edge->allocated[tbl]);
+
+			wake_up_interruptible(&edge->new_channel_event);
 		}
 	}
 
@@ -1341,6 +1416,7 @@ static int qcom_smd_probe(struct platform_device *pdev)
 	for_each_available_child_of_node(pdev->dev.of_node, node) {
 		edge = &smd->edges[i++];
 		edge->smd = smd;
+		init_waitqueue_head(&edge->new_channel_event);
 
 		ret = qcom_smd_parse_edge(&pdev->dev, node, edge);
 		if (ret)
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index 65a64fcdb1aa..bd51c8a9d807 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -56,4 +56,8 @@ void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
 int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+					       const char *name,
+					       qcom_smd_cb_t cb);
+
 #endif
-- 
cgit v1.2.3


From ee2ae1ed46251dcbdcc2c59b5e30f664ddfbacb1 Mon Sep 17 00:00:00 2001
From: Alexandre TORGUE <alexandre.torgue@st.com>
Date: Fri, 1 Apr 2016 11:37:33 +0200
Subject: stmmac: add new DT platform entries for GMAC4

This is to support the snps,dwmac-4.00 and snps,dwmac-4.10a
and related features on the platform driver.
See binding doc for further details.

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 7 +++++++
 include/linux/stmmac.h                                | 2 ++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 6605d19601c2..4d302db657c0 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -59,6 +59,8 @@ Optional properties:
 	- snps,fb: fixed-burst
 	- snps,mb: mixed-burst
 	- snps,rb: rebuild INCRx Burst
+	- snps,tso: this enables the TSO feature otherwise it will be managed by
+	    MAC HW capability register.
 - mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
 
 Examples:
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index cf37ea558ecc..effaa4ff5ab7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -284,6 +284,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		plat->pmt = 1;
 	}
 
+	if (of_device_is_compatible(np, "snps,dwmac-4.00") ||
+	    of_device_is_compatible(np, "snps,dwmac-4.10a")) {
+		plat->has_gmac4 = 1;
+		plat->pmt = 1;
+		plat->tso_en = of_property_read_bool(np, "snps,tso");
+	}
+
 	if (of_device_is_compatible(np, "snps,dwmac-3.610") ||
 		of_device_is_compatible(np, "snps,dwmac-3.710")) {
 		plat->enh_desc = 1;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e6bc30a42a74..ffdaca9c01af 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -137,5 +137,7 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 	struct stmmac_axi *axi;
+	int has_gmac4;
+	bool tso_en;
 };
 #endif
-- 
cgit v1.2.3


From ca065d0cf80fa547724440a8bf37f1e674d917c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Apr 2016 08:52:13 -0700
Subject: udp: no longer use SLAB_DESTROY_BY_RCU

Tom Herbert would like not touching UDP socket refcnt for encapsulated
traffic. For this to happen, we need to use normal RCU rules, with a grace
period before freeing a socket. UDP sockets are not short lived in the
high usage case, so the added cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in UDP stack.

Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the socket.

Same remark for functions used by xt_socket and xt_PROXY netfilter modules,
but this might be changed later.

Performance for a single UDP socket receiving flood traffic from
many RX queues/cpus.

Simple udp_rx using simple recvfrom() loop :
438 kpps instead of 374 kpps : 17 % increase of the peak rate.

v2: Addressed Willem de Bruijn feedback in multicast handling
 - keep early demux break in __udp4_lib_demux_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |   8 +-
 include/net/sock.h  |  12 +--
 include/net/udp.h   |   2 +-
 net/ipv4/udp.c      | 293 ++++++++++++++++------------------------------------
 net/ipv4/udp_diag.c |  18 ++--
 net/ipv6/udp.c      | 196 ++++++++++++-----------------------
 6 files changed, 171 insertions(+), 358 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 87c094961bd5..32342754643a 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -98,11 +98,11 @@ static inline bool udp_get_no_check6_rx(struct sock *sk)
 	return udp_sk(sk)->no_check6_rx;
 }
 
-#define udp_portaddr_for_each_entry(__sk, node, list) \
-	hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry(__sk, list) \
+	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
-#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
-	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry_rcu(__sk, list) \
+	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
 #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag)
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9e77353a92ae..7ad73db9dde2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -178,7 +178,7 @@ struct sock_common {
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
-		struct hlist_nulls_node skc_portaddr_node;
+		struct hlist_node	skc_portaddr_node;
 	};
 	struct proto		*skc_prot;
 	possible_net_t		skc_net;
@@ -670,18 +670,18 @@ static inline void sk_add_bind_node(struct sock *sk,
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
 /**
- * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @offset:	offset of hlist_node within the struct.
  *
  */
-#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
-	for (pos = (head)->first;					       \
-	     (!is_a_nulls(pos)) &&					       \
+#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)		       \
+	for (pos = rcu_dereference((head)->first);			       \
+	     pos != NULL &&						       \
 		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
-	     pos = pos->next)
+	     pos = rcu_dereference(pos->next))
 
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
diff --git a/include/net/udp.h b/include/net/udp.h
index 92927f729ac8..d870ec1611c4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -59,7 +59,7 @@ struct udp_skb_cb {
  *	@lock:	spinlock protecting changes to head/count
  */
 struct udp_hslot {
-	struct hlist_nulls_head	head;
+	struct hlist_head	head;
 	int			count;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 45ff590661f4..355bdb221057 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       unsigned int log)
 {
 	struct sock *sk2;
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 
-	sk_nulls_for_each(sk2, node, &hslot->head) {
+	sk_for_each(sk2, &hslot->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 						  bool match_wildcard))
 {
 	struct sock *sk2;
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 	int res = 0;
 
 	spin_lock(&hslot2->lock);
-	udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    (udp_sk(sk2)->udp_port_hash == num) &&
@@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
 						    bool match_wildcard))
 {
 	struct net *net = sock_net(sk);
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 	struct sock *sk2;
 
-	sk_nulls_for_each(sk2, node, &hslot->head) {
+	sk_for_each(sk2, &hslot->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    sk2->sk_family == sk->sk_family &&
@@ -333,17 +330,18 @@ found:
 			goto fail_unlock;
 		}
 
-		sk_nulls_add_node_rcu(sk, &hslot->head);
+		sk_add_node_rcu(sk, &hslot->head);
 		hslot->count++;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 		spin_lock(&hslot2->lock);
-		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+		hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
 					 &hslot2->head);
 		hslot2->count++;
 		spin_unlock(&hslot2->lock);
 	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	error = 0;
 fail_unlock:
 	spin_unlock_bh(&hslot->lock);
@@ -497,37 +495,27 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-begin:
 	result = NULL;
 	badness = 0;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score2(sk, net, saddr, sport,
 				      daddr, hnum, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			badness = score;
+			result = sk;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -535,23 +523,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot2)
-		goto begin;
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score2(result, net, saddr, sport,
-				  daddr, hnum, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
 	return result;
 }
 
@@ -563,15 +534,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable, struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
@@ -593,35 +561,27 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 						  htonl(INADDR_ANY), hnum, dif,
 						  hslot2, slot2, skb);
 		}
-		rcu_read_unlock();
 		return result;
 	}
 begin:
 	result = NULL;
 	badness = 0;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -629,25 +589,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
-				  daddr, dport, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
-	rcu_read_unlock();
 	return result;
 }
 EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -663,13 +604,24 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
-	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
-				 &udp_table, NULL);
+	struct sock *sk;
+
+	sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+			       dif, &udp_table, NULL);
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	return sk;
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
 
 static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 				       __be16 loc_port, __be32 loc_addr,
@@ -771,7 +723,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	sock_put(sk);
+	return;
 }
 
 void udp_err(struct sk_buff *skb, u32 info)
@@ -1474,13 +1426,13 @@ void udp_lib_unhash(struct sock *sk)
 		spin_lock_bh(&hslot->lock);
 		if (rcu_access_pointer(sk->sk_reuseport_cb))
 			reuseport_detach_sock(sk);
-		if (sk_nulls_del_node_init_rcu(sk)) {
+		if (sk_del_node_init_rcu(sk)) {
 			hslot->count--;
 			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
 			spin_lock(&hslot2->lock);
-			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
 			hslot2->count--;
 			spin_unlock(&hslot2->lock);
 		}
@@ -1513,12 +1465,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
 
 			if (hslot2 != nhslot2) {
 				spin_lock(&hslot2->lock);
-				hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+				hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
 				hslot2->count--;
 				spin_unlock(&hslot2->lock);
 
 				spin_lock(&nhslot2->lock);
-				hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+				hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
 							 &nhslot2->head);
 				nhslot2->count++;
 				spin_unlock(&nhslot2->lock);
@@ -1697,35 +1649,6 @@ drop:
 	return -1;
 }
 
-static void flush_stack(struct sock **stack, unsigned int count,
-			struct sk_buff *skb, unsigned int final)
-{
-	unsigned int i;
-	struct sk_buff *skb1 = NULL;
-	struct sock *sk;
-
-	for (i = 0; i < count; i++) {
-		sk = stack[i];
-		if (likely(!skb1))
-			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-
-		if (!skb1) {
-			atomic_inc(&sk->sk_drops);
-			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
-					 IS_UDPLITE(sk));
-			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
-					 IS_UDPLITE(sk));
-		}
-
-		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
-			skb1 = NULL;
-
-		sock_put(sk);
-	}
-	if (unlikely(skb1))
-		kfree_skb(skb1);
-}
-
 /* For TCP sockets, sk_rx_dst is protected by socket lock
  * For UDP, we use xchg() to guard against concurrent changes.
  */
@@ -1749,14 +1672,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable,
 				    int proto)
 {
-	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct hlist_nulls_node *node;
+	struct sock *sk, *first = NULL;
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
-	int dif = skb->dev->ifindex;
-	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
 	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
-	bool inner_flushed = false;
+	unsigned int offset = offsetof(typeof(*sk), sk_node);
+	int dif = skb->dev->ifindex;
+	struct hlist_node *node;
+	struct sk_buff *nskb;
 
 	if (use_hash2) {
 		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1767,23 +1690,28 @@ start_lookup:
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-	spin_lock(&hslot->lock);
-	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
-		if (__udp_is_mcast_sock(net, sk,
-					uh->dest, daddr,
-					uh->source, saddr,
-					dif, hnum)) {
-			if (unlikely(count == ARRAY_SIZE(stack))) {
-				flush_stack(stack, count, skb, ~0);
-				inner_flushed = true;
-				count = 0;
-			}
-			stack[count++] = sk;
-			sock_hold(sk);
+	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+					 uh->source, saddr, dif, hnum))
+			continue;
+
+		if (!first) {
+			first = sk;
+			continue;
 		}
-	}
+		nskb = skb_clone(skb, GFP_ATOMIC);
 
-	spin_unlock(&hslot->lock);
+		if (unlikely(!nskb)) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(sk));
+			UDP_INC_STATS_BH(net, UDP_MIB_INERRORS,
+					 IS_UDPLITE(sk));
+			continue;
+		}
+		if (udp_queue_rcv_skb(sk, nskb) > 0)
+			consume_skb(nskb);
+	}
 
 	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
 	if (use_hash2 && hash2 != hash2_any) {
@@ -1791,16 +1719,13 @@ start_lookup:
 		goto start_lookup;
 	}
 
-	/*
-	 * do the slow work with no lock held
-	 */
-	if (count) {
-		flush_stack(stack, count, skb, count - 1);
+	if (first) {
+		if (udp_queue_rcv_skb(first, skb) > 0)
+			consume_skb(skb);
 	} else {
-		if (!inner_flushed)
-			UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
-					 proto == IPPROTO_UDPLITE);
-		consume_skb(skb);
+		kfree_skb(skb);
+		UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+				 proto == IPPROTO_UDPLITE);
 	}
 	return 0;
 }
@@ -1897,7 +1822,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 						 inet_compute_pseudo);
 
 		ret = udp_queue_rcv_skb(sk, skb);
-		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input, but
 		 * it wants the return to be -protocol, or 0
@@ -1958,49 +1882,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
 						  int dif)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+	unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
 	struct udp_hslot *hslot = &udp_table.hash[slot];
 
 	/* Do not bother scanning a too big list */
 	if (hslot->count > 10)
 		return NULL;
 
-	rcu_read_lock();
-begin:
-	count = 0;
 	result = NULL;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum)) {
+	sk_for_each_rcu(sk, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+					rmt_port, rmt_addr, dif, hnum)) {
+			if (result)
+				return NULL;
 			result = sk;
-			++count;
-		}
-	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-		if (count != 1 ||
-		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(!__udp_is_mcast_sock(net, result,
-						       loc_port, loc_addr,
-						       rmt_port, rmt_addr,
-						       dif, hnum))) {
-			sock_put(result);
-			result = NULL;
 		}
 	}
-	rcu_read_unlock();
+
 	return result;
 }
 
@@ -2013,37 +1912,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 					    __be16 rmt_port, __be32 rmt_addr,
 					    int dif)
 {
-	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(loc_port);
 	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
 	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+	struct sock *sk;
 
-	rcu_read_lock();
-	result = NULL;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
-		if (INET_MATCH(sk, net, acookie,
-			       rmt_addr, loc_addr, ports, dif))
-			result = sk;
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+		if (INET_MATCH(sk, net, acookie, rmt_addr,
+			       loc_addr, ports, dif))
+			return sk;
 		/* Only check first socket in chain */
 		break;
 	}
-
-	if (result) {
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(!INET_MATCH(sk, net, acookie,
-					      rmt_addr, loc_addr,
-					      ports, dif))) {
-			sock_put(result);
-			result = NULL;
-		}
-	}
-	rcu_read_unlock();
-	return result;
+	return NULL;
 }
 
 void udp_v4_early_demux(struct sk_buff *skb)
@@ -2051,7 +1935,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct udphdr *uh;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct dst_entry *dst;
 	int dif = skb->dev->ifindex;
 	int ours;
@@ -2083,11 +1967,9 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	} else if (skb->pkt_type == PACKET_HOST) {
 		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
 					     uh->source, iph->saddr, dif);
-	} else {
-		return;
 	}
 
-	if (!sk)
+	if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
 		return;
 
 	skb->sk = sk;
@@ -2387,14 +2269,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 
 	for (state->bucket = start; state->bucket <= state->udp_table->mask;
 	     ++state->bucket) {
-		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 
-		if (hlist_nulls_empty(&hslot->head))
+		if (hlist_empty(&hslot->head))
 			continue;
 
 		spin_lock_bh(&hslot->lock);
-		sk_nulls_for_each(sk, node, &hslot->head) {
+		sk_for_each(sk, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -2413,7 +2294,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_nulls_next(sk);
+		sk = sk_next(sk);
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
@@ -2622,12 +2503,12 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 
 	table->hash2 = table->hash + (table->mask + 1);
 	for (i = 0; i <= table->mask; i++) {
-		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+		INIT_HLIST_HEAD(&table->hash[i].head);
 		table->hash[i].count = 0;
 		spin_lock_init(&table->hash[i].lock);
 	}
 	for (i = 0; i <= table->mask; i++) {
-		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		INIT_HLIST_HEAD(&table->hash2[i].head);
 		table->hash2[i].count = 0;
 		spin_lock_init(&table->hash2[i].lock);
 	}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index df1966f3b6ec..3d5ccf4b1412 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 			const struct inet_diag_req_v2 *req)
 {
 	int err = -EINVAL;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct sk_buff *rep;
 	struct net *net = sock_net(in_skb->sk);
 
+	rcu_read_lock();
 	if (req->sdiag_family == AF_INET)
 		sk = __udp4_lib_lookup(net,
 				req->id.idiag_src[0], req->id.idiag_sport,
@@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 				req->id.idiag_dport,
 				req->id.idiag_if, tbl, NULL);
 #endif
-	else
-		goto out_nosk;
-
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	rcu_read_unlock();
 	err = -ENOENT;
 	if (!sk)
 		goto out_nosk;
@@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 		     struct netlink_callback *cb,
 		     const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
-	int num, s_num, slot, s_slot;
 	struct net *net = sock_net(skb->sk);
+	int num, s_num, slot, s_slot;
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
 	for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
-		struct sock *sk;
-		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &table->hash[slot];
+		struct sock *sk;
 
 		num = 0;
 
-		if (hlist_nulls_empty(&hslot->head))
+		if (hlist_empty(&hslot->head))
 			continue;
 
 		spin_lock_bh(&hslot->lock);
-		sk_nulls_for_each(sk, node, &hslot->head) {
+		sk_for_each(sk, &hslot->head) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b772a7641fbd..78a7dfd12707 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -213,37 +213,28 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-begin:
 	result = NULL;
 	badness = -1;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score2(sk, net, saddr, sport,
 				      daddr, hnum, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
 
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -251,27 +242,10 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot2)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score2(result, net, saddr, sport,
-				  daddr, hnum, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
 	return result;
 }
 
+/* rcu_read_lock() must be held */
 struct sock *__udp6_lib_lookup(struct net *net,
 				      const struct in6_addr *saddr, __be16 sport,
 				      const struct in6_addr *daddr, __be16 dport,
@@ -279,15 +253,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
 				      struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp6_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
@@ -309,34 +280,26 @@ struct sock *__udp6_lib_lookup(struct net *net,
 						  &in6addr_any, hnum, dif,
 						  hslot2, slot2, skb);
 		}
-		rcu_read_unlock();
 		return result;
 	}
 begin:
 	result = NULL;
 	badness = -1;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -344,25 +307,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
-					daddr, dport, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
-	rcu_read_unlock();
 	return result;
 }
 EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -382,12 +326,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
 struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
 			     const struct in6_addr *daddr, __be16 dport, int dif)
 {
-	return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
+	struct sock *sk;
+
+	sk =  __udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				dif, &udp_table, NULL);
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	return sk;
 }
 EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+#endif
 
 /*
  *	This should be easy, if there is something there we
@@ -585,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	sock_put(sk);
+	return;
 }
 
 static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -747,33 +703,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static void flush_stack(struct sock **stack, unsigned int count,
-			struct sk_buff *skb, unsigned int final)
-{
-	struct sk_buff *skb1 = NULL;
-	struct sock *sk;
-	unsigned int i;
-
-	for (i = 0; i < count; i++) {
-		sk = stack[i];
-		if (likely(!skb1))
-			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-		if (!skb1) {
-			atomic_inc(&sk->sk_drops);
-			UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
-					  IS_UDPLITE(sk));
-			UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
-					  IS_UDPLITE(sk));
-		}
-
-		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
-			skb1 = NULL;
-		sock_put(sk);
-	}
-	if (unlikely(skb1))
-		kfree_skb(skb1);
-}
-
 static void udp6_csum_zero_error(struct sk_buff *skb)
 {
 	/* RFC 2460 section 8.1 says that we SHOULD log
@@ -792,15 +721,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		struct udp_table *udptable, int proto)
 {
-	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	struct sock *sk, *first = NULL;
 	const struct udphdr *uh = udp_hdr(skb);
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
-	int dif = inet6_iif(skb);
-	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int offset = offsetof(typeof(*sk), sk_node);
 	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
-	bool inner_flushed = false;
+	int dif = inet6_iif(skb);
+	struct hlist_node *node;
+	struct sk_buff *nskb;
 
 	if (use_hash2) {
 		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
@@ -811,27 +740,32 @@ start_lookup:
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-	spin_lock(&hslot->lock);
-	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
-		if (__udp_v6_is_mcast_sock(net, sk,
-					   uh->dest, daddr,
-					   uh->source, saddr,
-					   dif, hnum) &&
-		    /* If zero checksum and no_check is not on for
-		     * the socket then skip it.
-		     */
-		    (uh->check || udp_sk(sk)->no_check6_rx)) {
-			if (unlikely(count == ARRAY_SIZE(stack))) {
-				flush_stack(stack, count, skb, ~0);
-				inner_flushed = true;
-				count = 0;
-			}
-			stack[count++] = sk;
-			sock_hold(sk);
+	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+		if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
+					    uh->source, saddr, dif, hnum))
+			continue;
+		/* If zero checksum and no_check is not on for
+		 * the socket then skip it.
+		 */
+		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+			continue;
+		if (!first) {
+			first = sk;
+			continue;
+		}
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (unlikely(!nskb)) {
+			atomic_inc(&sk->sk_drops);
+			UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
+					  IS_UDPLITE(sk));
+			UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS,
+					  IS_UDPLITE(sk));
+			continue;
 		}
-	}
 
-	spin_unlock(&hslot->lock);
+		if (udpv6_queue_rcv_skb(sk, nskb) > 0)
+			consume_skb(nskb);
+	}
 
 	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
 	if (use_hash2 && hash2 != hash2_any) {
@@ -839,13 +773,13 @@ start_lookup:
 		goto start_lookup;
 	}
 
-	if (count) {
-		flush_stack(stack, count, skb, count - 1);
+	if (first) {
+		if (udpv6_queue_rcv_skb(first, skb) > 0)
+			consume_skb(skb);
 	} else {
-		if (!inner_flushed)
-			UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
-					  proto == IPPROTO_UDPLITE);
-		consume_skb(skb);
+		kfree_skb(skb);
+		UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+				  proto == IPPROTO_UDPLITE);
 	}
 	return 0;
 }
@@ -853,10 +787,10 @@ start_lookup:
 int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
+	const struct in6_addr *saddr, *daddr;
 	struct net *net = dev_net(skb->dev);
-	struct sock *sk;
 	struct udphdr *uh;
-	const struct in6_addr *saddr, *daddr;
+	struct sock *sk;
 	u32 ulen = 0;
 
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -910,7 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		int ret;
 
 		if (!uh->check && !udp_sk(sk)->no_check6_rx) {
-			sock_put(sk);
 			udp6_csum_zero_error(skb);
 			goto csum_error;
 		}
@@ -920,7 +853,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 						 ip6_compute_pseudo);
 
 		ret = udpv6_queue_rcv_skb(sk, skb);
-		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input */
 		if (ret > 0)
-- 
cgit v1.2.3


From 8f6fd83c6c5ec66a4a70c728535ddcdfef4f3697 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Wed, 2 Mar 2016 10:09:19 -0500
Subject: rhashtable: accept GFP flags in rhashtable_walk_init

In certain cases, the 802.11 mesh pathtable code wants to
iterate over all of the entries in the forwarding table from
the receive path, which is inside an RCU read-side critical
section.  Enable walks inside atomic sections by allowing
GFP_ATOMIC allocations for the walker state.

Change all existing callsites to pass in GFP_KERNEL.

Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
[also adjust gfs2/glock.c and rhashtable tests]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 fs/gfs2/glock.c            | 4 ++--
 include/linux/rhashtable.h | 3 ++-
 lib/rhashtable.c           | 6 ++++--
 lib/test_rhashtable.c      | 2 +-
 net/ipv6/ila/ila_xlat.c    | 3 ++-
 net/netfilter/nft_hash.c   | 4 ++--
 net/netlink/af_netlink.c   | 3 ++-
 net/sctp/proc.c            | 3 ++-
 8 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6539131c52a2..4b73bd101bdc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1913,7 +1913,7 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
 	}
 	return ret;
 }
@@ -1941,7 +1941,7 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
 	}
 	return ret;
 }
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 63bd7601b6de..3eef0802a0cd 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -346,7 +346,8 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
 					    struct bucket_table *old_tbl);
 int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
 
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+			 gfp_t gfp);
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
 int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cc808707d1cf..5d845ffd7982 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -487,6 +487,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * rhashtable_walk_init - Initialise an iterator
  * @ht:		Table to walk over
  * @iter:	Hash table Iterator
+ * @gfp:	GFP flags for allocations
  *
  * This function prepares a hash table walk.
  *
@@ -504,14 +505,15 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * You must call rhashtable_walk_exit if this function returns
  * successfully.
  */
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+			 gfp_t gfp)
 {
 	iter->ht = ht;
 	iter->p = NULL;
 	iter->slot = 0;
 	iter->skip = 0;
 
-	iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
+	iter->walker = kmalloc(sizeof(*iter->walker), gfp);
 	if (!iter->walker)
 		return -ENOMEM;
 
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 270bf7289b1e..297fdb5e74bd 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -143,7 +143,7 @@ static void test_bucket_stats(struct rhashtable *ht)
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
 
-	err = rhashtable_walk_init(ht, &hti);
+	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
 	if (err) {
 		pr_warn("Test failed: allocation error");
 		return;
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 295ca29a23c3..0b03533453e4 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -501,7 +501,8 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
 	struct ila_net *ilan = net_generic(net, ila_net_id);
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
 
-	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter);
+	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+				    GFP_KERNEL);
 }
 
 static int ila_nl_dump_done(struct netlink_callback *cb)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3f9d45d3d9b7..6fa016564f90 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -192,7 +192,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
 	u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
 	int err;
 
-	err = rhashtable_walk_init(&priv->ht, &hti);
+	err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
 	iter->err = err;
 	if (err)
 		return;
@@ -248,7 +248,7 @@ static void nft_hash_gc(struct work_struct *work)
 	priv = container_of(work, struct nft_hash, gc_work.work);
 	set  = nft_set_container_of(priv);
 
-	err = rhashtable_walk_init(&priv->ht, &hti);
+	err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
 	if (err)
 		goto schedule;
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 215fc08c02ab..0f16bf635480 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2343,7 +2343,8 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
 {
 	int err;
 
-	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
+	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
+				   GFP_KERNEL);
 	if (err) {
 		iter->link = MAX_LINKS;
 		return err;
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5cfac8d5d3b3..6d45d53321e6 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -319,7 +319,8 @@ static int sctp_transport_walk_start(struct seq_file *seq)
 	struct sctp_ht_iter *iter = seq->private;
 	int err;
 
-	err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
+	err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti,
+				   GFP_KERNEL);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 3c5bcb2e1930bdbccd14a49660895f014349b51d Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Thu, 17 Mar 2016 15:02:53 +0200
Subject: ieee80211: support parsing Fine Timing Measurement action frame

Add definition for Fine Timing Measurement (FTM) frame format
as defined in IEEE802.11-REVmcD5.0 section 9.6.8.33

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3b1f6cef9513..bf9706c5b0bd 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -7,6 +7,7 @@
  * Copyright (c) 2005, Devicescape Software, Inc.
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1011,6 +1012,16 @@ struct ieee80211_mgmt {
 					u8 tpc_elem_length;
 					struct ieee80211_tpc_report_ie tpc;
 				} __packed tpc_report;
+				struct {
+					u8 action_code;
+					u8 dialog_token;
+					u8 follow_up;
+					u8 tod[6];
+					u8 toa[6];
+					__le16 tod_error;
+					__le16 toa_error;
+					u8 variable[0];
+				} __packed ftm;
 			} u;
 		} __packed action;
 	} u;
-- 
cgit v1.2.3


From 627d2d6b550094d88f9e518e15967e7bf906ebbf Mon Sep 17 00:00:00 2001
From: samanthakumar <samanthakumar@google.com>
Date: Tue, 5 Apr 2016 12:41:16 -0400
Subject: udp: enable MSG_PEEK at non-zero offset

Enable peeking at UDP datagrams at the offset specified with socket
option SOL_SOCKET/SO_PEEK_OFF. Peek at any datagram in the queue, up
to the end of the given datagram.

Implement the SO_PEEK_OFF semantics introduced in commit ef64a54f6e55
("sock: Introduce the SO_PEEK_OFF sock option"). Increase the offset
on peek, decrease it on regular reads.

When peeking, always checksum the packet immediately, to avoid
recomputation on subsequent peeks and final read.

The socket lock is not held for the duration of udp_recvmsg, so
peek and read operations can run concurrently. Only the last store
to sk_peek_off is preserved.

Signed-off-by: Sam Kumar <samanthakumar@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  7 ++++++-
 include/net/sock.h     |  2 ++
 net/core/datagram.c    |  9 ++++++---
 net/core/sock.c        |  9 +++++++++
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/udp.c         | 22 +++++++++++-----------
 net/ipv6/af_inet6.c    |  1 +
 net/ipv6/udp.c         | 22 +++++++++++-----------
 8 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 15d0df943466..007381270ff8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2949,7 +2949,12 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
+static inline void skb_free_datagram_locked(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	__skb_free_datagram_locked(sk, skb, 0);
+}
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index b75998952482..1decb7a22261 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -457,6 +457,8 @@ struct sock {
 #define SK_CAN_REUSE	1
 #define SK_FORCE_REUSE	2
 
+int sk_set_peek_off(struct sock *sk, int val);
+
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fa9dc6450b08..b7de71f8d5d3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 {
 	bool slow;
 
 	if (likely(atomic_read(&skb->users) == 1))
 		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	else if (likely(!atomic_dec_and_test(&skb->users))) {
+		sk_peek_offset_bwd(sk, len);
 		return;
+	}
 
 	slow = lock_sock_fast(sk);
+	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
 	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);
@@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	/* skb is now orphaned, can be freed outside of locked section */
 	__kfree_skb(skb);
 }
-EXPORT_SYMBOL(skb_free_datagram_locked);
+EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 /**
  *	skb_kill_datagram - Free a datagram skbuff forcibly
diff --git a/net/core/sock.c b/net/core/sock.c
index e12197b359fd..2ce76e82857f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2187,6 +2187,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	if (val < 0)
+		return -EINVAL;
+
+	sk->sk_peek_off = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
 
 /*
  * Set of default routines for initialising struct proto_ops when
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e481992dbae..a38b9910af60 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf747e86ce52..d80312ddbb8a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1294,7 +1294,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -1304,15 +1304,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -1322,16 +1323,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 
 		if (err == -EINVAL)
 			goto csum_copy_err;
@@ -1344,7 +1345,8 @@ try_again:
 			UDP_INC_STATS_USER(sock_net(sk),
 					   UDP_MIB_INERRORS, is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 
 	if (!peeked)
@@ -1368,9 +1370,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37cfd67c..2b78aad0d52f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -561,6 +561,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 84c8d7b66820..87bd7aff88b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -357,7 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -371,15 +371,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -391,16 +392,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
@@ -417,7 +418,8 @@ try_again:
 						    UDP_MIB_INERRORS,
 						    is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 	if (!peeked) {
 		if (is_udp4)
@@ -465,9 +467,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
-- 
cgit v1.2.3


From 49ddf8e6e2347cffdcf83d1ca2d04ff929820178 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 31 Mar 2016 20:02:10 +0300
Subject: mac80211: add fast-rx path

The regular RX path has a lot of code, but with a few
assumptions on the hardware it's possible to reduce the
amount of code significantly. Currently the assumptions
on the driver are the following:
 * hardware/driver reordering buffer (if supporting aggregation)
 * hardware/driver decryption & PN checking (if using encryption)
 * hardware/driver did de-duplication
 * hardware/driver did A-MSDU deaggregation
 * AP_LINK_PS is used (in AP mode)
 * no client powersave handling in mac80211 (in client mode)

of which some are actually checked per packet:
 * de-duplication
 * PN checking
 * decryption
and additionally packets must
 * not be A-MSDU (have been deaggregated by driver/device)
 * be data packets
 * not be fragmented
 * be unicast
 * have RFC 1042 header

Additionally dynamically we assume:
 * no encryption or CCMP/GCMP, TKIP/WEP/other not allowed
 * station must be authorized
 * 4-addr format not enabled

Some data needed for the RX path is cached in a new per-station
"fast_rx" structure, so that we only need to look at this and
the packet, no other memory when processing packets on the fast
RX path.

After doing the above per-packet checks, the data path collapses
down to a pretty simple conversion function taking advantage of
the data cached in the small fast_rx struct.

This should speed up the RX processing, and will make it easier
to reason about parallelizing RX (for which statistics will need
to be per-CPU still.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  10 ++
 net/mac80211/cfg.c         |  10 +-
 net/mac80211/ieee80211_i.h |   5 +
 net/mac80211/key.c         |   1 +
 net/mac80211/mlme.c        |   9 ++
 net/mac80211/rx.c          | 351 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/sta_info.c    |   2 +
 net/mac80211/sta_info.h    |  34 +++++
 8 files changed, 419 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bf9706c5b0bd..113bfc468a4d 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -638,6 +638,16 @@ static inline bool ieee80211_is_first_frag(__le16 seq_ctrl)
 	return (seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG)) == 0;
 }
 
+/**
+ * ieee80211_is_frag - check if a frame is a fragment
+ * @hdr: 802.11 header of the frame
+ */
+static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr)
+{
+	return ieee80211_has_morefrags(hdr->frame_control) ||
+	       hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);
+}
+
 struct ieee80211s_hdr {
 	u8 flags;
 	u8 ttl;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 62a90f270f03..fc4730b938d0 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -65,11 +65,13 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
 		return ret;
 
 	if (type == NL80211_IFTYPE_AP_VLAN &&
-	    params && params->use_4addr == 0)
+	    params && params->use_4addr == 0) {
 		RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
-	else if (type == NL80211_IFTYPE_STATION &&
-		 params && params->use_4addr >= 0)
+		ieee80211_check_fast_rx_iface(sdata);
+	} else if (type == NL80211_IFTYPE_STATION &&
+		   params && params->use_4addr >= 0) {
 		sdata->u.mgd.use_4addr = params->use_4addr;
+	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
 		struct ieee80211_local *local = sdata->local;
@@ -1367,6 +1369,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 
 			rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
 			new_4addr = true;
+			__ieee80211_check_fast_rx_iface(vlansdata);
 		}
 
 		if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -1889,6 +1892,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 			sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
 		else
 			sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+		ieee80211_check_fast_rx_iface(sdata);
 	}
 
 	if (params->ht_opmode >= 0) {
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c8945e2d8a86..6243109979ed 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1494,6 +1494,11 @@ u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
 int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
 			     u64 *cookie, gfp_t gfp);
 
+void ieee80211_check_fast_rx(struct sta_info *sta);
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_clear_fast_rx(struct sta_info *sta);
+
 /* STA code */
 void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
 int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 3df7b0392d30..edd6f2945f69 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -338,6 +338,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 		} else {
 			rcu_assign_pointer(sta->gtk[idx], new);
 		}
+		ieee80211_check_fast_rx(sta);
 	} else {
 		defunikey = old &&
 			old == key_mtx_dereference(sdata->local,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2112df4ffb7b..d3c75ac8a029 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2217,6 +2217,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 	const u8 *ssid;
 	u8 *dst = ifmgd->associated->bssid;
 	u8 unicast_limit = max(1, max_probe_tries - 3);
+	struct sta_info *sta;
 
 	/*
 	 * Try sending broadcast probe requests for the last three
@@ -2235,6 +2236,14 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 	 */
 	ifmgd->probe_send_count++;
 
+	if (dst) {
+		mutex_lock(&sdata->local->sta_mtx);
+		sta = sta_info_get(sdata, dst);
+		if (!WARN_ON(!sta))
+			ieee80211_check_fast_rx(sta);
+		mutex_unlock(&sdata->local->sta_mtx);
+	}
+
 	if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
 		ifmgd->nullfunc_failed = false;
 		ieee80211_send_nullfunc(sdata->local, sdata, false);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 2863832b0db4..96f8bbf21649 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3508,6 +3508,342 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
 	return false;
 }
 
+void ieee80211_check_fast_rx(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_key *key;
+	struct ieee80211_fast_rx fastrx = {
+		.dev = sdata->dev,
+		.vif_type = sdata->vif.type,
+		.control_port_protocol = sdata->control_port_protocol,
+	}, *old, *new = NULL;
+	bool assign = false;
+
+	/* use sparse to check that we don't return without updating */
+	__acquire(check_fast_rx);
+
+	BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != sizeof(rfc1042_header));
+	BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != ETH_ALEN);
+	ether_addr_copy(fastrx.rfc1042_hdr, rfc1042_header);
+	ether_addr_copy(fastrx.vif_addr, sdata->vif.addr);
+
+	/* fast-rx doesn't do reordering */
+	if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) &&
+	    !ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER))
+		goto clear;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		/* 4-addr is harder to deal with, later maybe */
+		if (sdata->u.mgd.use_4addr)
+			goto clear;
+		/* software powersave is a huge mess, avoid all of it */
+		if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
+			goto clear;
+		if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
+		    !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
+			goto clear;
+		if (sta->sta.tdls) {
+			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+			fastrx.expected_ds_bits = 0;
+		} else {
+			fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0;
+			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
+			fastrx.expected_ds_bits =
+				cpu_to_le16(IEEE80211_FCTL_FROMDS);
+		}
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_AP:
+		/* parallel-rx requires this, at least with calls to
+		 * ieee80211_sta_ps_transition()
+		 */
+		if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
+			goto clear;
+		fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
+		fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+		fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_TODS);
+
+		fastrx.internal_forward =
+			!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
+			(sdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
+			 !sdata->u.vlan.sta);
+		break;
+	default:
+		goto clear;
+	}
+
+	if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+		goto clear;
+
+	rcu_read_lock();
+	key = rcu_dereference(sta->ptk[sta->ptk_idx]);
+	if (key) {
+		switch (key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_TKIP:
+			/* we don't want to deal with MMIC in fast-rx */
+			goto clear_rcu;
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP_256:
+		case WLAN_CIPHER_SUITE_GCMP:
+		case WLAN_CIPHER_SUITE_GCMP_256:
+			break;
+		default:
+			/* we also don't want to deal with WEP or cipher scheme
+			 * since those require looking up the key idx in the
+			 * frame, rather than assuming the PTK is used
+			 * (we need to revisit this once we implement the real
+			 * PTK index, which is now valid in the spec, but we
+			 * haven't implemented that part yet)
+			 */
+			goto clear_rcu;
+		}
+
+		fastrx.key = true;
+		fastrx.icv_len = key->conf.icv_len;
+	}
+
+	assign = true;
+ clear_rcu:
+	rcu_read_unlock();
+ clear:
+	__release(check_fast_rx);
+
+	if (assign)
+		new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL);
+
+	spin_lock_bh(&sta->lock);
+	old = rcu_dereference_protected(sta->fast_rx, true);
+	rcu_assign_pointer(sta->fast_rx, new);
+	spin_unlock_bh(&sta->lock);
+
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+void ieee80211_clear_fast_rx(struct sta_info *sta)
+{
+	struct ieee80211_fast_rx *old;
+
+	spin_lock_bh(&sta->lock);
+	old = rcu_dereference_protected(sta->fast_rx, true);
+	RCU_INIT_POINTER(sta->fast_rx, NULL);
+	spin_unlock_bh(&sta->lock);
+
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	lockdep_assert_held(&local->sta_mtx);
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sdata != sta->sdata &&
+		    (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
+			continue;
+		ieee80211_check_fast_rx(sta);
+	}
+}
+
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	mutex_lock(&local->sta_mtx);
+	__ieee80211_check_fast_rx_iface(sdata);
+	mutex_unlock(&local->sta_mtx);
+}
+
+static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
+				     struct ieee80211_fast_rx *fast_rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct sta_info *sta = rx->sta;
+	int orig_len = skb->len;
+	int snap_offs = ieee80211_hdrlen(hdr->frame_control);
+	struct {
+		u8 snap[sizeof(rfc1042_header)];
+		__be16 proto;
+	} *payload __aligned(2);
+	struct {
+		u8 da[ETH_ALEN];
+		u8 sa[ETH_ALEN];
+	} addrs __aligned(2);
+
+	/* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write
+	 * to a common data structure; drivers can implement that per queue
+	 * but we don't have that information in mac80211
+	 */
+	if (!(status->flag & RX_FLAG_DUP_VALIDATED))
+		return false;
+
+#define FAST_RX_CRYPT_FLAGS	(RX_FLAG_PN_VALIDATED | RX_FLAG_DECRYPTED)
+
+	/* If using encryption, we also need to have:
+	 *  - PN_VALIDATED: similar, but the implementation is tricky
+	 *  - DECRYPTED: necessary for PN_VALIDATED
+	 */
+	if (fast_rx->key &&
+	    (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS)
+		return false;
+
+	/* we don't deal with A-MSDU deaggregation here */
+	if (status->rx_flags & IEEE80211_RX_AMSDU)
+		return false;
+
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+		return false;
+
+	if (unlikely(ieee80211_is_frag(hdr)))
+		return false;
+
+	/* Since our interface address cannot be multicast, this
+	 * implicitly also rejects multicast frames without the
+	 * explicit check.
+	 *
+	 * We shouldn't get any *data* frames not addressed to us
+	 * (AP mode will accept multicast *management* frames), but
+	 * punting here will make it go through the full checks in
+	 * ieee80211_accept_frame().
+	 */
+	if (!ether_addr_equal(fast_rx->vif_addr, hdr->addr1))
+		return false;
+
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS |
+					      IEEE80211_FCTL_TODS)) !=
+	    fast_rx->expected_ds_bits)
+		goto drop;
+
+	/* assign the key to drop unencrypted frames (later)
+	 * and strip the IV/MIC if necessary
+	 */
+	if (fast_rx->key && !(status->flag & RX_FLAG_IV_STRIPPED)) {
+		/* GCMP header length is the same */
+		snap_offs += IEEE80211_CCMP_HDR_LEN;
+	}
+
+	if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
+		goto drop;
+	payload = (void *)(skb->data + snap_offs);
+
+	if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
+		return false;
+
+	/* Don't handle these here since they require special code.
+	 * Accept AARP and IPX even though they should come with a
+	 * bridge-tunnel header - but if we get them this way then
+	 * there's little point in discarding them.
+	 */
+	if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
+		     payload->proto == fast_rx->control_port_protocol))
+		return false;
+
+	/* after this point, don't punt to the slowpath! */
+
+	if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) &&
+	    pskb_trim(skb, skb->len - fast_rx->icv_len))
+		goto drop;
+
+	if (unlikely(fast_rx->sta_notify)) {
+		ieee80211_sta_rx_notify(rx->sdata, hdr);
+		fast_rx->sta_notify = false;
+	}
+
+	/* statistics part of ieee80211_rx_h_sta_process() */
+	sta->rx_stats.last_rx = jiffies;
+	sta->rx_stats.last_rate = sta_stats_encode_rate(status);
+
+	sta->rx_stats.fragments++;
+
+	if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
+		sta->rx_stats.last_signal = status->signal;
+		ewma_signal_add(&sta->rx_stats_avg.signal, -status->signal);
+	}
+
+	if (status->chains) {
+		int i;
+
+		sta->rx_stats.chains = status->chains;
+		for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
+			int signal = status->chain_signal[i];
+
+			if (!(status->chains & BIT(i)))
+				continue;
+
+			sta->rx_stats.chain_signal_last[i] = signal;
+			ewma_signal_add(&sta->rx_stats_avg.chain_signal[i],
+					-signal);
+		}
+	}
+	/* end of statistics */
+
+	if (rx->key && !ieee80211_has_protected(hdr->frame_control))
+		goto drop;
+
+	/* do the header conversion - first grab the addresses */
+	ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
+	ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
+	/* remove the SNAP but leave the ethertype */
+	skb_pull(skb, snap_offs + sizeof(rfc1042_header));
+	/* push the addresses in front */
+	memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs));
+
+	skb->dev = fast_rx->dev;
+
+	ieee80211_rx_stats(fast_rx->dev, skb->len);
+
+	/* The seqno index has the same property as needed
+	 * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
+	 * for non-QoS-data frames. Here we know it's a data
+	 * frame, so count MSDUs.
+	 */
+	u64_stats_update_begin(&sta->rx_stats.syncp);
+	sta->rx_stats.msdu[rx->seqno_idx]++;
+	sta->rx_stats.bytes += orig_len;
+	u64_stats_update_end(&sta->rx_stats.syncp);
+
+	if (fast_rx->internal_forward) {
+		struct sta_info *dsta = sta_info_get(rx->sdata, skb->data);
+
+		if (dsta) {
+			/*
+			 * Send to wireless media and increase priority by 256
+			 * to keep the received priority instead of
+			 * reclassifying the frame (see cfg80211_classify8021d).
+			 */
+			skb->priority += 256;
+			skb->protocol = htons(ETH_P_802_3);
+			skb_reset_network_header(skb);
+			skb_reset_mac_header(skb);
+			dev_queue_xmit(skb);
+			return true;
+		}
+	}
+
+	/* deliver to local stack */
+	skb->protocol = eth_type_trans(skb, fast_rx->dev);
+	memset(skb->cb, 0, sizeof(skb->cb));
+	if (rx->napi)
+		napi_gro_receive(rx->napi, skb);
+	else
+		netif_receive_skb(skb);
+
+	return true;
+ drop:
+	dev_kfree_skb(skb);
+	sta->rx_stats.dropped++;
+	return true;
+}
+
 /*
  * This function returns whether or not the SKB
  * was destined for RX processing or not, which,
@@ -3522,6 +3858,21 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
 
 	rx->skb = skb;
 
+	/* See if we can do fast-rx; if we have to copy we already lost,
+	 * so punt in that case. We should never have to deliver a data
+	 * frame to multiple interfaces anyway.
+	 *
+	 * We skip the ieee80211_accept_frame() call and do the necessary
+	 * checking inside ieee80211_invoke_fast_rx().
+	 */
+	if (consume && rx->sta) {
+		struct ieee80211_fast_rx *fast_rx;
+
+		fast_rx = rcu_dereference(rx->sta->fast_rx);
+		if (fast_rx && ieee80211_invoke_fast_rx(rx, fast_rx))
+			return true;
+	}
+
 	if (!ieee80211_accept_frame(rx))
 		return false;
 
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index bdd303e8b577..a0ce7e40f420 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1874,6 +1874,7 @@ int sta_info_move_state(struct sta_info *sta,
 				atomic_dec(&sta->sdata->bss->num_mcast_sta);
 			clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
 			ieee80211_clear_fast_xmit(sta);
+			ieee80211_clear_fast_rx(sta);
 		}
 		break;
 	case IEEE80211_STA_AUTHORIZED:
@@ -1884,6 +1885,7 @@ int sta_info_move_state(struct sta_info *sta,
 				atomic_inc(&sta->sdata->bss->num_mcast_sta);
 			set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
 			ieee80211_check_fast_xmit(sta);
+			ieee80211_check_fast_rx(sta);
 		}
 		break;
 	default:
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 7c23b575672e..a0a06609338d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -285,6 +285,38 @@ struct ieee80211_fast_tx {
 	struct rcu_head rcu_head;
 };
 
+/**
+ * struct ieee80211_fast_rx - RX fastpath information
+ * @dev: netdevice for reporting the SKB
+ * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type)
+ * @vif_addr: interface address
+ * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache)
+ * @control_port_protocol: control port protocol copied from sdata
+ * @expected_ds_bits: from/to DS bits expected
+ * @icv_len: length of the MIC if present
+ * @key: bool indicating encryption is expected (key is set)
+ * @sta_notify: notify the MLME code (once)
+ * @internal_forward: forward froms internally on AP/VLAN type interfaces
+ * @da_offs: offset of the DA in the header (for header conversion)
+ * @sa_offs: offset of the SA in the header (for header conversion)
+ * @rcu_head: RCU head for freeing this structure
+ */
+struct ieee80211_fast_rx {
+	struct net_device *dev;
+	enum nl80211_iftype vif_type;
+	u8 vif_addr[ETH_ALEN] __aligned(2);
+	u8 rfc1042_hdr[6] __aligned(2);
+	__be16 control_port_protocol;
+	__le16 expected_ds_bits;
+	u8 icv_len;
+	u8 key:1,
+	   sta_notify:1,
+	   internal_forward:1;
+	u8 da_offs, sa_offs;
+
+	struct rcu_head rcu_head;
+};
+
 /**
  * struct mesh_sta - mesh STA information
  * @plink_lock: serialize access to plink fields
@@ -391,6 +423,7 @@ DECLARE_EWMA(signal, 1024, 8)
  * @cipher_scheme: optional cipher scheme for this station
  * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
  * @fast_tx: TX fastpath information
+ * @fast_rx: RX fastpath information
  * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
  *	the BSS one.
  * @tx_stats: TX statistics
@@ -414,6 +447,7 @@ struct sta_info {
 	spinlock_t lock;
 
 	struct ieee80211_fast_tx __rcu *fast_tx;
+	struct ieee80211_fast_rx __rcu *fast_rx;
 
 #ifdef CONFIG_MAC80211_MESH
 	struct mesh_sta *mesh;
-- 
cgit v1.2.3


From 6e0456b5454561c4e9fa9e8a4acea405e6d56c80 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Thu, 3 Mar 2016 22:59:00 +0100
Subject: mac80211: add A-MSDU tx support

Requires software tx queueing and fast-xmit support. For good
performance, drivers need frag_list support as well. This avoids the
need for copying data of aggregated frames. Running without it is only
supported for debugging purposes.

To avoid performance and packet size issues, the rate control module or
driver needs to limit the maximum A-MSDU size by setting
max_rc_amsdu_len in struct ieee80211_sta.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
[fix locking issue]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |   3 +
 include/net/mac80211.h     |  19 ++++++
 net/mac80211/agg-tx.c      |   5 ++
 net/mac80211/debugfs.c     |   2 +
 net/mac80211/ieee80211_i.h |   1 +
 net/mac80211/sta_info.c    |   2 +
 net/mac80211/tx.c          | 156 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 188 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 113bfc468a4d..b118744d3382 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -164,6 +164,9 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
+/* Maximal size of an A-MSDU that can be transported in a HT BA session */
+#define IEEE80211_MAX_MPDU_LEN_HT_BA		4095
+
 /* Maximal size of an A-MSDU */
 #define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
 #define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5f4b4c773a92..a3ee76559791 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -713,6 +713,7 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -720,6 +721,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
+	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 };
 
 /*
@@ -1746,6 +1748,7 @@ struct ieee80211_sta_rates {
  *	Both additional HT limits must be enforced by the low level driver.
  *	This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2).
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
+ * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1767,6 +1770,7 @@ struct ieee80211_sta {
 	u8 max_amsdu_subframes;
 	u16 max_amsdu_len;
 	bool support_p2p_ps;
+	u16 max_rc_amsdu_len;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
@@ -1983,6 +1987,15 @@ struct ieee80211_txq {
  * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX,
  *	which implies using per-CPU station statistics.
  *
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ *	A-MSDU frames. Requires software tx queueing and fast-xmit support.
+ *	When not using minstrel/minstrel_ht rate control, the driver must
+ *	limit the maximum A-MSDU size based on the current tx rate by setting
+ *	max_rc_amsdu_len in struct ieee80211_sta.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ *	skbs, needed for zero-copy software A-MSDU.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2021,6 +2034,8 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
 	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
 	IEEE80211_HW_USES_RSS,
+	IEEE80211_HW_TX_AMSDU,
+	IEEE80211_HW_TX_FRAG_LIST,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -2093,6 +2108,9 @@ enum ieee80211_hw_flags {
  *	size is smaller (an example is LinkSys WRT120N with FW v1.0.07
  *	build 002 Jun 18 2012).
  *
+ * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum
+ *	of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list.
+ *
  * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
  *	(if %IEEE80211_HW_QUEUE_CONTROL is set)
  *
@@ -2147,6 +2165,7 @@ struct ieee80211_hw {
 	u8 max_rate_tries;
 	u8 max_rx_aggregation_subframes;
 	u8 max_tx_aggregation_subframes;
+	u8 max_tx_fragments;
 	u8 offchannel_tx_hw_queue;
 	u8 radiotap_mcs_details;
 	u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f243a2..42fa81031dfa 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 				  size_t len)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_txq *txq;
 	u16 capab, tid;
 	u8 buf_size;
 	bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
 	buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
 
+	txq = sta->sta.txq[tid];
+	if (!amsdu && txq)
+		set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 52ed2afc408d..b251b2f7f8dd 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -128,6 +128,8 @@ static const char *hw_flag_names[] = {
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
 	FLAG(SUPPORTS_REORDERING_BUFFER),
 	FLAG(USES_RSS),
+	FLAG(TX_AMSDU),
+	FLAG(TX_FRAG_LIST),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6243109979ed..40c1d343992c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -802,6 +802,7 @@ struct mac80211_qos_map {
 enum txq_info_flags {
 	IEEE80211_TXQ_STOP,
 	IEEE80211_TXQ_AMPDU,
+	IEEE80211_TXQ_NO_AMSDU,
 };
 
 struct txq_info {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cf2aca0cc200..960e13d8ed30 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -416,6 +416,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
+
 	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
 
 	return sta;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 597c8fe672a3..4fa2842ddb25 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 out:
 	spin_unlock_bh(&txqi->queue.lock);
 
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
 	return skb;
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2802,6 +2806,154 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
 		kfree_rcu(fast_tx, rcu_head);
 }
 
+static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
+					struct sk_buff *skb, int headroom,
+					int *subframe_len)
+{
+	int amsdu_len = *subframe_len + sizeof(struct ethhdr);
+	int padding = (4 - amsdu_len) & 3;
+
+	if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) {
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+
+		if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) {
+			wiphy_debug(local->hw.wiphy,
+				    "failed to reallocate TX buffer\n");
+			return false;
+		}
+	}
+
+	if (padding) {
+		*subframe_len += padding;
+		memset(skb_put(skb, padding), 0, padding);
+	}
+
+	return true;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_fast_tx *fast_tx,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr;
+	struct ethhdr amsdu_hdr;
+	int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+	int subframe_len = skb->len - hdr_len;
+	void *data;
+	u8 *qc;
+
+	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+		return false;
+
+	if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+		return true;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+					 &subframe_len))
+		return false;
+
+	amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+	memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+	memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+	data = skb_push(skb, sizeof(amsdu_hdr));
+	memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+	memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+	hdr = data;
+	qc = ieee80211_get_qos_ctl(hdr);
+	*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+	info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+	return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+				      struct sta_info *sta,
+				      struct ieee80211_fast_tx *fast_tx,
+				      struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+	struct ieee80211_txq *txq = sta->sta.txq[tid];
+	struct txq_info *txqi;
+	struct sk_buff **frag_tail, *head;
+	int subframe_len = skb->len - ETH_ALEN;
+	u8 max_subframes = sta->sta.max_amsdu_subframes;
+	int max_frags = local->hw.max_tx_fragments;
+	int max_amsdu_len = sta->sta.max_amsdu_len;
+	__be16 len;
+	void *data;
+	bool ret = false;
+	int n = 1, nfrags;
+
+	if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+		return false;
+
+	if (!txq)
+		return false;
+
+	txqi = to_txq_info(txq);
+	if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+		return false;
+
+	if (sta->sta.max_rc_amsdu_len)
+		max_amsdu_len = min_t(int, max_amsdu_len,
+				      sta->sta.max_rc_amsdu_len);
+
+	spin_lock_bh(&txqi->queue.lock);
+
+	head = skb_peek_tail(&txqi->queue);
+	if (!head)
+		goto out;
+
+	if (skb->len + head->len > max_amsdu_len)
+		goto out;
+
+	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+		goto out;
+
+	nfrags = 1 + skb_shinfo(skb)->nr_frags;
+	nfrags += 1 + skb_shinfo(head)->nr_frags;
+	frag_tail = &skb_shinfo(head)->frag_list;
+	while (*frag_tail) {
+		nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+		frag_tail = &(*frag_tail)->next;
+		n++;
+	}
+
+	if (max_subframes && n > max_subframes)
+		goto out;
+
+	if (max_frags && nfrags > max_frags)
+		goto out;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2,
+					 &subframe_len))
+		goto out;
+
+	ret = true;
+	data = skb_push(skb, ETH_ALEN + 2);
+	memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+	data += 2 * ETH_ALEN;
+	len = cpu_to_be16(subframe_len);
+	memcpy(data, &len, 2);
+	memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));
+
+	head->len += skb->len;
+	head->data_len += skb->len;
+	*frag_tail = skb;
+
+out:
+	spin_unlock_bh(&txqi->queue.lock);
+
+	return ret;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 				struct net_device *dev, struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
@@ -2856,6 +3008,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_tx_stats(dev, skb->len + extra_head);
 
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+		return true;
+
 	/* will not be crypto-handled beyond what we do here, so use false
 	 * as the may-encrypt argument for the resize to not account for
 	 * more room than we already have in 'extra_head'
-- 
cgit v1.2.3


From 8ced425ee630c03beea06c1dfa35190bf8395d07 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 5 Apr 2016 17:10:16 +0200
Subject: tun: use socket locks for sk_{attach,detatch}_filter

This reverts commit 5a5abb1fa3b05dd ("tun, bpf: fix suspicious RCU usage
in tun_{attach, detach}_filter") and replaces it to use lock_sock around
sk_{attach,detach}_filter. The checks inside filter.c are updated with
lockdep_sock_is_held to check for proper socket locks.

It keeps the code cleaner by ensuring that only one lock governs the
socket filter instead of two independent locks.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 14 +++++++++-----
 include/linux/filter.h |  4 ----
 net/core/filter.c      | 35 +++++++++++++----------------------
 3 files changed, 22 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9abc36bf77ea..64bc143eddd9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -622,8 +622,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	/* Re-attach the filter to persist device */
 	if (!skip_filter && (tun->filter_attached == true)) {
-		err = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (!err)
 			goto out;
 	}
@@ -1824,7 +1825,9 @@ static void tun_detach_filter(struct tun_struct *tun, int n)
 
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		__sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		sk_detach_filter(tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 	}
 
 	tun->filter_attached = false;
@@ -1837,8 +1840,9 @@ static int tun_attach_filter(struct tun_struct *tun)
 
 	for (i = 0; i < tun->numqueues; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (ret) {
 			tun_detach_filter(tun, i);
 			return ret;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a51a5361695f..43aa1f8855c7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -465,14 +465,10 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
-int __sk_detach_filter(struct sock *sk, bool locked);
-
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ca7f832b2980..e8486ba601ea 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1149,8 +1149,7 @@ void bpf_prog_destroy(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
 
-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
-			    bool locked)
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 {
 	struct sk_filter *fp, *old_fp;
 
@@ -1166,8 +1165,10 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
 		return -ENOMEM;
 	}
 
-	old_fp = rcu_dereference_protected(sk->sk_filter, locked);
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	rcu_assign_pointer(sk->sk_filter, fp);
+
 	if (old_fp)
 		sk_filter_uncharge(sk, old_fp);
 
@@ -1246,8 +1247,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
  * occurs or there is insufficient memory for the filter a negative
  * errno code is returned. On success the return is zero.
  */
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked)
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
 	struct bpf_prog *prog = __get_filter(fprog, sk);
 	int err;
@@ -1255,7 +1255,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, locked);
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		__bpf_prog_release(prog);
 		return err;
@@ -1263,12 +1263,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__sk_attach_filter);
-
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
-{
-	return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
 
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
@@ -1314,7 +1309,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		bpf_prog_put(prog);
 		return err;
@@ -2255,7 +2250,7 @@ static int __init register_sk_filter_ops(void)
 }
 late_initcall(register_sk_filter_ops);
 
-int __sk_detach_filter(struct sock *sk, bool locked)
+int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
 	struct sk_filter *filter;
@@ -2263,7 +2258,8 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return -EPERM;
 
-	filter = rcu_dereference_protected(sk->sk_filter, locked);
+	filter = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	if (filter) {
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 		sk_filter_uncharge(sk, filter);
@@ -2272,12 +2268,7 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__sk_detach_filter);
-
-int sk_detach_filter(struct sock *sk)
-{
-	return __sk_detach_filter(sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
 
 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 		  unsigned int len)
@@ -2288,7 +2279,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 
 	lock_sock(sk);
 	filter = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	if (!filter)
 		goto out;
 
-- 
cgit v1.2.3


From a6024562ffd7e0f31bc6671817840ad1e91de7b4 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 5 Apr 2016 08:22:51 -0700
Subject: udp: Add GRO functions to UDP socket

This patch adds GRO functions (gro_receive and gro_complete) to UDP
sockets. udp_gro_receive is changed to perform socket lookup on a
packet. If a socket is found the related GRO functions are called.

This features obsoletes using UDP offload infrastructure for GRO
(udp_offload). This has the advantage of not being limited to provide
offload on a per port basis, GRO is now applied to whatever individual
UDP sockets are bound to.  This also allows the possbility of
"application defined GRO"-- that is we can attach something like
a BPF program to a UDP socket to perfrom GRO on an application
layer protocol.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h    |  8 ++++++++
 include/net/udp.h      |  7 +++++--
 net/ipv4/udp_offload.c | 52 +++++++++++++++++++-------------------------------
 net/ipv6/Makefile      |  5 +++--
 net/ipv6/af_inet6.c    |  8 ++++++++
 net/ipv6/ip6_offload.c |  2 --
 net/ipv6/ip6_offload.h |  3 ++-
 net/ipv6/udp_offload.c | 11 ++++++++---
 8 files changed, 54 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 32342754643a..d1fd8cd39478 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,14 @@ struct udp_sock {
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
+
+	/* GRO functions for UDP socket */
+	struct sk_buff **	(*gro_receive)(struct sock *sk,
+					       struct sk_buff **head,
+					       struct sk_buff *skb);
+	int			(*gro_complete)(struct sock *sk,
+						struct sk_buff *skb,
+						int nhoff);
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/include/net/udp.h b/include/net/udp.h
index 3aa0b3ec1fb0..3c5a65e0946d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -167,9 +167,12 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
 	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
 }
 
+typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
+				     __be16 dport);
+
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh);
-int udp_gro_complete(struct sk_buff *skb, int nhoff);
+				 struct udphdr *uh, udp_lookup_t lookup);
+int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0ed2dafb7cc4..65c3fd34b363 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -179,6 +179,7 @@ out_unlock:
 
 	return segs;
 }
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
@@ -304,13 +305,13 @@ unlock:
 EXPORT_SYMBOL(udp_del_offload);
 
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh)
+				 struct udphdr *uh, udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	struct sk_buff *p, **pp = NULL;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
+	struct sock *sk;
 
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +323,11 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
 	rcu_read_lock();
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_receive)
-			goto unflush;
-	}
+	sk = (*lookup)(skb, uh->source, uh->dest);
+
+	if (sk && udp_sk(sk)->gro_receive)
+		goto unflush;
+
 	goto out_unlock;
 
 unflush:
@@ -352,9 +351,7 @@ unflush:
 
 	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
 	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
-	NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-	pp = uo_priv->offload->callbacks.gro_receive(head, skb,
-						     uo_priv->offload);
+	pp = udp_sk(sk)->gro_receive(sk, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
@@ -362,6 +359,7 @@ out:
 	NAPI_GRO_CB(skb)->flush |= flush;
 	return pp;
 }
+EXPORT_SYMBOL(udp_gro_receive);
 
 static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
@@ -383,39 +381,28 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
 	return NULL;
 }
 
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+		     udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	__be16 newlen = htons(skb->len - nhoff);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 	int err = -ENOSYS;
+	struct sock *sk;
 
 	uh->len = newlen;
 
 	rcu_read_lock();
-
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_complete)
-			break;
-	}
-
-	if (uo_priv) {
-		NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-		err = uo_priv->offload->callbacks.gro_complete(skb,
-				nhoff + sizeof(struct udphdr),
-				uo_priv->offload);
-	}
-
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (sk && udp_sk(sk)->gro_complete)
+		err = udp_sk(sk)->gro_complete(sk, skb,
+				nhoff + sizeof(struct udphdr));
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -426,6 +413,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 
 	return err;
 }
+EXPORT_SYMBOL(udp_gro_complete);
 
 static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 {
@@ -440,7 +428,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
 static const struct net_offload udpv4_offload = {
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fbd90bf8d33..5e9d6bf4aaca 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,9 +8,10 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		addrlabel.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
-		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
+		udp_offload.o
 
-ipv6-offload :=	ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
 ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
 ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b78aad0d52f..bfa86f040c16 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,8 @@
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
 
+#include "ip6_offload.h"
+
 MODULE_AUTHOR("Cast of dozens");
 MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
 MODULE_LICENSE("GPL");
@@ -959,6 +961,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto udplitev6_fail;
 
+	err = udpv6_offload_init();
+	if (err)
+		goto udpv6_offload_fail;
+
 	err = tcpv6_init();
 	if (err)
 		goto tcpv6_fail;
@@ -988,6 +994,8 @@ pingv6_fail:
 ipv6_packet_fail:
 	tcpv6_exit();
 tcpv6_fail:
+	udpv6_offload_exit();
+udpv6_offload_fail:
 	udplitev6_exit();
 udplitev6_fail:
 	udpv6_exit();
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 82e9f3076028..204af2219471 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -325,8 +325,6 @@ static int __init ipv6_offload_init(void)
 
 	if (tcpv6_offload_init() < 0)
 		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
-	if (udp_offload_init() < 0)
-		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
 	if (ipv6_exthdrs_offload_init() < 0)
 		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
 
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
index 2e155c651b35..96b40e41ac53 100644
--- a/net/ipv6/ip6_offload.h
+++ b/net/ipv6/ip6_offload.h
@@ -12,7 +12,8 @@
 #define __ip6_offload_h
 
 int ipv6_exthdrs_offload_init(void);
-int udp_offload_init(void);
+int udpv6_offload_init(void);
+int udpv6_offload_exit(void);
 int tcpv6_offload_init(void);
 
 #endif
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2b0fbe6929e8..5429f6bcf047 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -153,7 +153,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 1;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
@@ -173,7 +173,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
 static const struct net_offload udpv6_offload = {
@@ -184,7 +184,12 @@ static const struct net_offload udpv6_offload = {
 	},
 };
 
-int __init udp_offload_init(void)
+int udpv6_offload_init(void)
 {
 	return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
 }
+
+int udpv6_offload_exit(void)
+{
+	return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+}
-- 
cgit v1.2.3


From 46aa2f30aa7fe03a4dcd732b009284c02ff4f093 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 5 Apr 2016 08:22:56 -0700
Subject: udp: Remove udp_offloads

Now that the UDP encapsulation GRO functions have been moved to the UDP
socket we not longer need the udp_offload insfrastructure so removing it.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 -------------
 include/net/protocol.h    |  3 ---
 net/ipv4/udp_offload.c    | 63 -----------------------------------------------
 3 files changed, 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb0d5d09c2e4..cb4e508b3f38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2159,23 +2159,6 @@ struct packet_offload {
 	struct list_head	 list;
 };
 
-struct udp_offload;
-
-struct udp_offload_callbacks {
-	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
-						 struct sk_buff *skb,
-						 struct udp_offload *uoff);
-	int			(*gro_complete)(struct sk_buff *skb,
-						int nhoff,
-						struct udp_offload *uoff);
-};
-
-struct udp_offload {
-	__be16			 port;
-	u8			 ipproto;
-	struct udp_offload_callbacks callbacks;
-};
-
 /* often modified stats are per-CPU, other are shared (netdev->stats) */
 struct pcpu_sw_netstats {
 	u64     rx_packets;
diff --git a/include/net/protocol.h b/include/net/protocol.h
index da689f5432de..bf36ca34af7a 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -107,9 +107,6 @@ int inet_del_offload(const struct net_offload *prot, unsigned char num);
 void inet_register_protosw(struct inet_protosw *p);
 void inet_unregister_protosw(struct inet_protosw *p);
 
-int  udp_add_offload(struct net *net, struct udp_offload *prot);
-void udp_del_offload(struct udp_offload *prot);
-
 #if IS_ENABLED(CONFIG_IPV6)
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char num);
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char num);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 65c3fd34b363..6230cf4b0d2d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,18 +14,6 @@
 #include <net/udp.h>
 #include <net/protocol.h>
 
-static DEFINE_SPINLOCK(udp_offload_lock);
-static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
-
-#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
-
-struct udp_offload_priv {
-	struct udp_offload	*offload;
-	possible_net_t	net;
-	struct rcu_head		rcu;
-	struct udp_offload_priv __rcu *next;
-};
-
 static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	netdev_features_t features,
 	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -254,56 +242,6 @@ out:
 	return segs;
 }
 
-int udp_add_offload(struct net *net, struct udp_offload *uo)
-{
-	struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
-
-	if (!new_offload)
-		return -ENOMEM;
-
-	write_pnet(&new_offload->net, net);
-	new_offload->offload = uo;
-
-	spin_lock(&udp_offload_lock);
-	new_offload->next = udp_offload_base;
-	rcu_assign_pointer(udp_offload_base, new_offload);
-	spin_unlock(&udp_offload_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(udp_add_offload);
-
-static void udp_offload_free_routine(struct rcu_head *head)
-{
-	struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
-	kfree(ou_priv);
-}
-
-void udp_del_offload(struct udp_offload *uo)
-{
-	struct udp_offload_priv __rcu **head = &udp_offload_base;
-	struct udp_offload_priv *uo_priv;
-
-	spin_lock(&udp_offload_lock);
-
-	uo_priv = udp_deref_protected(*head);
-	for (; uo_priv != NULL;
-	     uo_priv = udp_deref_protected(*head)) {
-		if (uo_priv->offload == uo) {
-			rcu_assign_pointer(*head,
-					   udp_deref_protected(uo_priv->next));
-			goto unlock;
-		}
-		head = &uo_priv->next;
-	}
-	pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
-unlock:
-	spin_unlock(&udp_offload_lock);
-	if (uo_priv)
-		call_rcu(&uo_priv->rcu, udp_offload_free_routine);
-}
-EXPORT_SYMBOL(udp_del_offload);
-
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -327,7 +265,6 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 
 	if (sk && udp_sk(sk)->gro_receive)
 		goto unflush;
-
 	goto out_unlock;
 
 unflush:
-- 
cgit v1.2.3


From ec5e099d6e941668d121ea9ca7057f4fa00830b0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:22 -0700
Subject: perf: optimize perf_fetch_caller_regs

avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu init logic and
subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
so we can safely drop memset from all of the above cases and move it into
perf_ftrace_function_call that calls it with stack allocated pt_regs.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      | 2 --
 kernel/trace/trace_event_perf.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..e89f7199c223 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..7a68afca8249 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -316,6 +316,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
 	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
-- 
cgit v1.2.3


From 1e1dcd93b468901e114f279c94a0b356adc5e7cd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:24 -0700
Subject: perf: split perf_trace_buf_prepare into alloc and update parts

split allows to move expensive update of 'struct trace_entry' to later phase.
Repurpose unused 1st argument of perf_tp_event() to indicate event type.

While splitting use temp variable 'rctx' instead of '*rctx' to avoid
unnecessary loads done by the compiler due to -fno-strict-aliasing

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  8 ++++----
 include/trace/perf.h            |  8 ++++----
 kernel/events/core.c            |  6 ++++--
 kernel/trace/trace_event_perf.c | 39 ++++++++++++++++++++-------------------
 kernel/trace/trace_kprobe.c     | 10 ++++++----
 kernel/trace/trace_syscalls.c   | 13 +++++++------
 kernel/trace/trace_uprobe.c     |  5 +++--
 8 files changed, 49 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e89f7199c223..eb41b535ef38 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1016,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 extern void perf_event_init(void);
-extern void perf_tp_event(u64 addr, u64 count, void *record,
+extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
 			  struct task_struct *task);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..56f795e6a093 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -605,15 +605,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
+void perf_trace_buf_update(void *record, u16 type);
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
 		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 6f7e37869065..77cd9043b7e4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -53,8 +53,7 @@ perf_trace_##call(void *__data, proto)					\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = perf_trace_buf_prepare(__entry_size,			\
-			event_call->event.type, &__regs, &rctx);	\
+	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);	\
 	if (!entry)							\
 		return;							\
 									\
@@ -64,8 +63,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, 0,		\
-		__count, __regs, head, __task);				\
+	perf_trace_buf_submit(entry, __entry_size, rctx,		\
+			      event_call->event.type, __count, __regs,	\
+			      head, __task);				\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..d8512883c0a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6987,7 +6987,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
 {
@@ -6999,9 +6999,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr, 0);
+	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	perf_trace_buf_update(record, event_type);
+
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7a68afca8249..5a927075977f 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
 {
-	struct trace_entry *entry;
-	unsigned long flags;
 	char *raw_data;
-	int pc;
+	int rctx;
 
 	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-			"perf buffer not large enough"))
+		      "perf buffer not large enough"))
 		return NULL;
 
-	pc = preempt_count();
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
+	*rctxp = rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		return NULL;
 
 	if (regs)
-		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
-	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+		*regs = this_cpu_ptr(&__perf_regs[rctx]);
+	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+	return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+	struct trace_entry *entry = record;
+	int pc = preempt_count();
+	unsigned long flags;
 
-	entry = (struct trace_entry *)raw_data;
 	local_save_flags(flags);
 	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
-
-	return raw_data;
 }
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
@@ -319,13 +320,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
-	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
 			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 919e0ddd8fcc..5546eec0505f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e78f364cc192..b2b6efc083a4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-				sys_data->exit_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7915142c89e4..c53485441c88 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		goto out;
 
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 		memset(data + len, 0, size - esize - len);
 	}
 
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:27 -0700
Subject: bpf: support bpf_get_stackid() and bpf_perf_event_output() in
 tracepoint programs

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..198f6ace70ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -160,6 +160,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e5ebe3254d2..413ec5614180 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -340,12 +340,52 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func		= bpf_perf_event_output_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func		= bpf_get_stackid_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return NULL;
+		return &bpf_get_stackid_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:28 -0700
Subject: bpf: sanitize bpf tracepoint access

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 +
 include/linux/trace_events.h |  1 +
 kernel/bpf/verifier.c        |  6 +++++-
 kernel/events/core.c         |  8 ++++++++
 kernel/trace/trace_events.c  | 18 ++++++++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 198f6ace70ec..b2365a6eba3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 56f795e6a093..fe6441203b59 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ffe97d6166..9a01019ff7c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7133,6 +7133,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..ced963049e0a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
-- 
cgit v1.2.3


From 9a6f2b0113c8fce815db7c9d23754bdea4b428a0 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 11 Apr 2016 21:40:05 +0200
Subject: net: mdio: Fix lockdep falls positive splat

MDIO devices can be stacked upon each other. The current code supports
two levels, which until recently has been enough for a DSA mdio bus on
top of another bus. Now we have hardware which has an MDIO mux in the
middle.

Define an MDIO MUTEX class with three levels.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-mux.c | 10 ++--------
 drivers/net/phy/mdio_bus.c |  4 ++--
 include/linux/mdio.h       | 11 +++++++++++
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c
index 308ade0eb1b6..5c81d6faf304 100644
--- a/drivers/net/phy/mdio-mux.c
+++ b/drivers/net/phy/mdio-mux.c
@@ -45,13 +45,7 @@ static int mdio_mux_read(struct mii_bus *bus, int phy_id, int regnum)
 	struct mdio_mux_parent_bus *pb = cb->parent;
 	int r;
 
-	/* In theory multiple mdio_mux could be stacked, thus creating
-	 * more than a single level of nesting.  But in practice,
-	 * SINGLE_DEPTH_NESTING will cover the vast majority of use
-	 * cases.  We use it, instead of trying to handle the general
-	 * case.
-	 */
-	mutex_lock_nested(&pb->mii_bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&pb->mii_bus->mdio_lock, MDIO_MUTEX_MUX);
 	r = pb->switch_fn(pb->current_child, cb->bus_number, pb->switch_data);
 	if (r)
 		goto out;
@@ -76,7 +70,7 @@ static int mdio_mux_write(struct mii_bus *bus, int phy_id,
 
 	int r;
 
-	mutex_lock_nested(&pb->mii_bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&pb->mii_bus->mdio_lock, MDIO_MUTEX_MUX);
 	r = pb->switch_fn(pb->current_child, cb->bus_number, pb->switch_data);
 	if (r)
 		goto out;
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 0cba64f1ecf4..751202a285a6 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -457,7 +457,7 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum)
 
 	BUG_ON(in_interrupt());
 
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 	retval = bus->read(bus, addr, regnum);
 	mutex_unlock(&bus->mdio_lock);
 
@@ -509,7 +509,7 @@ int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val)
 
 	BUG_ON(in_interrupt());
 
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 	err = bus->write(bus, addr, regnum, val);
 	mutex_unlock(&bus->mdio_lock);
 
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 5bfd99d1a40a..bf9d1d750693 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -13,6 +13,17 @@
 
 struct mii_bus;
 
+/* Multiple levels of nesting are possible. However typically this is
+ * limited to nested DSA like layer, a MUX layer, and the normal
+ * user. Instead of trying to handle the general case, just define
+ * these cases.
+ */
+enum mdio_mutex_lock_class {
+	MDIO_MUTEX_NORMAL,
+	MDIO_MUTEX_MUX,
+	MDIO_MUTEX_NESTED,
+};
+
 struct mdio_device {
 	struct device dev;
 
-- 
cgit v1.2.3


From bc405cd69a728d0a82bae8395fe43ec7b0afd1c6 Mon Sep 17 00:00:00 2001
From: Alexandre Macabies <web+oss@zopieux.com>
Date: Tue, 12 Apr 2016 18:53:00 +0200
Subject: ieee802154: add security bit check function

ieee802154_is_secen checks if the 802.15.4 security bit is set in the
frame control field.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Alexandre Macabies <web+oss@zopieux.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Acked-by: Alan Ott <alan@signal11.us>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index d3e415674dac..56090f195339 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -218,6 +218,7 @@ enum {
 /* frame control handling */
 #define IEEE802154_FCTL_FTYPE		0x0003
 #define IEEE802154_FCTL_ACKREQ		0x0020
+#define IEEE802154_FCTL_SECEN		0x0004
 #define IEEE802154_FCTL_INTRA_PAN	0x0040
 
 #define IEEE802154_FTYPE_DATA		0x0001
@@ -232,6 +233,15 @@ static inline int ieee802154_is_data(__le16 fc)
 		cpu_to_le16(IEEE802154_FTYPE_DATA);
 }
 
+/**
+ * ieee802154_is_secen - check if Security bit is set
+ * @fc: frame control bytes in little-endian byteorder
+ */
+static inline bool ieee802154_is_secen(__le16 fc)
+{
+	return fc & cpu_to_le16(IEEE802154_FCTL_SECEN);
+}
+
 /**
  * ieee802154_is_ackreq - check if acknowledgment request bit is set
  * @fc: frame control bytes in little-endian byteorder
-- 
cgit v1.2.3


From b7594148c73cb506487b5f00a6574beceea0e3a0 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 11 Apr 2016 11:04:14 +0200
Subject: ieee802154: cleanups for ieee802154.h

This patch removes some const from non-pointer types and fixes the
function name for the ieee802154_is_valid_extended_unicast_addr
comment.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Acked-by: Jukka Rissanen <jukka.rissanen@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 56090f195339..9d84a924b747 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -270,17 +270,17 @@ static inline bool ieee802154_is_intra_pan(__le16 fc)
  *
  * @len: psdu len with (MHR + payload + MFR)
  */
-static inline bool ieee802154_is_valid_psdu_len(const u8 len)
+static inline bool ieee802154_is_valid_psdu_len(u8 len)
 {
 	return (len == IEEE802154_ACK_PSDU_LEN ||
 		(len >= IEEE802154_MIN_PSDU_LEN && len <= IEEE802154_MTU));
 }
 
 /**
- * ieee802154_is_valid_psdu_len - check if extended addr is valid
+ * ieee802154_is_valid_extended_unicast_addr - check if extended addr is valid
  * @addr: extended addr to check
  */
-static inline bool ieee802154_is_valid_extended_unicast_addr(const __le64 addr)
+static inline bool ieee802154_is_valid_extended_unicast_addr(__le64 addr)
 {
 	/* Bail out if the address is all zero, or if the group
 	 * address bit is set.
-- 
cgit v1.2.3


From 118a5cf8ae236cdfa1eb4f21530843a8494722ef Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 11 Apr 2016 11:04:15 +0200
Subject: ieee802154: add short address helpers

This patch introduce some short address handling functionality into
ieee802154 headers.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Acked-by: Jukka Rissanen <jukka.rissanen@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 29 +++++++++++++++++++++++++++++
 include/net/mac802154.h    | 10 ++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 9d84a924b747..acedbb68a5a3 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -47,6 +47,7 @@
 #define IEEE802154_ADDR_SHORT_UNSPEC	0xfffe
 
 #define IEEE802154_EXTENDED_ADDR_LEN	8
+#define IEEE802154_SHORT_ADDR_LEN	2
 
 #define IEEE802154_LIFS_PERIOD		40
 #define IEEE802154_SIFS_PERIOD		12
@@ -289,6 +290,34 @@ static inline bool ieee802154_is_valid_extended_unicast_addr(__le64 addr)
 		!(addr & cpu_to_le64(0x0100000000000000ULL)));
 }
 
+/**
+ * ieee802154_is_broadcast_short_addr - check if short addr is broadcast
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_broadcast_short_addr(__le16 addr)
+{
+	return (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST));
+}
+
+/**
+ * ieee802154_is_unspec_short_addr - check if short addr is unspecified
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_unspec_short_addr(__le16 addr)
+{
+	return (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC));
+}
+
+/**
+ * ieee802154_is_valid_src_short_addr - check if source short address is valid
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_valid_src_short_addr(__le16 addr)
+{
+	return !(ieee802154_is_broadcast_short_addr(addr) ||
+		 ieee802154_is_unspec_short_addr(addr));
+}
+
 /**
  * ieee802154_random_extended_addr - generates a random extended address
  * @addr: extended addr pointer to place the random address
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 6cd7a70706a9..e465c8551ac3 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -287,6 +287,16 @@ static inline void ieee802154_le16_to_be16(void *be16_dst, const void *le16_src)
 	put_unaligned_be16(get_unaligned_le16(le16_src), be16_dst);
 }
 
+/**
+ * ieee802154_be16_to_le16 - copies and convert be16 to le16
+ * @le16_dst: le16 destination pointer
+ * @be16_src: be16 source pointer
+ */
+static inline void ieee802154_be16_to_le16(void *le16_dst, const void *be16_src)
+{
+	put_unaligned_le16(get_unaligned_be16(be16_src), le16_dst);
+}
+
 /**
  * ieee802154_alloc_hw - Allocate a new hardware device
  *
-- 
cgit v1.2.3


From 7d35812c3214afa5b37a675113555259cfd67b98 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:23 +0200
Subject: netfilter: x_tables: add and use xt_check_entry_offsets

Currently arp/ip and ip6tables each implement a short helper to check that
the target offset is large enough to hold one xt_entry_target struct and
that t->u.target_size fits within the current rule.

Unfortunately these checks are not sufficient.

To avoid adding new tests to all of ip/ip6/arptables move the current
checks into a helper, then extend this helper in followup patches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++++
 net/ipv4/netfilter/arp_tables.c    | 11 +----------
 net/ipv4/netfilter/ip_tables.c     | 12 +-----------
 net/ipv6/netfilter/ip6_tables.c    | 12 +-----------
 net/netfilter/x_tables.c           | 34 ++++++++++++++++++++++++++++++++++
 5 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 80a305b85323..0de0862897a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,6 +242,10 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset);
+
 int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 		   bool inv_proto);
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ec37f7c3a033..74668c1d3243 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -496,19 +496,10 @@ next:
 
 static inline int check_entry(const struct arpt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
-		return -EINVAL;
-
-	t = arpt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static inline int check_target(struct arpt_entry *e, const char *name)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 503038ea7735..71c204d4ca5f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -590,20 +590,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ipt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 126f2a0f006a..24ae7f458b3b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -602,20 +602,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ip6t_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 582c9cfd6567..1f44bfa8dd94 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -541,6 +541,40 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 #endif /* CONFIG_COMPAT */
 
+/**
+ * xt_check_entry_offsets - validate arp/ip/ip6t_entry
+ *
+ * @base: pointer to arp/ip/ip6t_entry
+ * @target_offset: the arp/ip/ip6_t->target_offset
+ * @next_offset: the arp/ip/ip6_t->next_offset
+ *
+ * validates that target_offset and next_offset are sane.
+ *
+ * The arp/ip/ip6t_entry structure @base must have passed following tests:
+ * - it must point to a valid memory location
+ * - base to base + next_offset must be accessible, i.e. not exceed allocated
+ *   length.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset)
+{
+	const struct xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_entry_offsets);
+
 int xt_check_target(struct xt_tgchk_param *par,
 		    unsigned int size, u_int8_t proto, bool inv_proto)
 {
-- 
cgit v1.2.3


From fc1221b3a163d1386d1052184202d5dc50d302d1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:26 +0200
Subject: netfilter: x_tables: add compat version of xt_check_entry_offsets

32bit rulesets have different layout and alignment requirements, so once
more integrity checks get added to xt_check_entry_offsets it will reject
well-formed 32bit rulesets.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  3 +++
 net/ipv4/netfilter/arp_tables.c    |  3 ++-
 net/ipv4/netfilter/ip_tables.c     |  3 ++-
 net/ipv6/netfilter/ip6_tables.c    |  3 ++-
 net/netfilter/x_tables.c           | 22 ++++++++++++++++++++++
 5 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 0de0862897a4..08de48bbe92e 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -494,6 +494,9 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset);
 
 #endif /* CONFIG_COMPAT */
 #endif /* _X_TABLES_H */
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 24ad92a60b7a..ab8952a49bfa 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1254,7 +1254,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+					    e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cdf18502a047..7d24c872723f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1513,7 +1513,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e3783116ed48..73eee7b5fd60 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1525,7 +1525,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ec1b7183fff9..fa206ceb269f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -539,6 +539,27 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset)
+{
+	const struct compat_xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (t->u.target_size < sizeof(*t))
+		return -EINVAL;
+
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_compat_check_entry_offsets);
 #endif /* CONFIG_COMPAT */
 
 /**
@@ -549,6 +570,7 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
+ * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
-- 
cgit v1.2.3


From ce683e5f9d045e5d67d1312a42b359cb2ab2a13c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:28 +0200
Subject: netfilter: x_tables: check for bogus target offset

We're currently asserting that targetoff + targetsize <= nextoff.

Extend it to also check that targetoff is >= sizeof(xt_entry).
Since this is generic code, add an argument pointing to the start of the
match/target, we can then derive the base structure size from the delta.

We also need the e->elems pointer in a followup change to validate matches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++--
 net/ipv4/netfilter/arp_tables.c    |  5 +++--
 net/ipv4/netfilter/ip_tables.c     |  5 +++--
 net/ipv6/netfilter/ip6_tables.c    |  5 +++--
 net/netfilter/x_tables.c           | 17 +++++++++++++++--
 5 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 08de48bbe92e..30cfb1e943fb 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,7 +242,7 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
-int xt_check_entry_offsets(const void *base,
+int xt_check_entry_offsets(const void *base, const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset);
 
@@ -494,7 +494,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ab8952a49bfa..95ed4e454c60 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -592,7 +592,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1254,7 +1255,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+	ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset,
 					    e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7d24c872723f..baab033d74e0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -754,7 +754,8 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1513,7 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73eee7b5fd60..6957627c7931 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -766,7 +766,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1525,7 +1526,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 1cb7a271c024..e2a6f2a9051b 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -546,14 +546,17 @@ struct compat_xt_standard_target {
 	compat_uint_t verdict;
 };
 
-/* see xt_check_entry_offsets */
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct compat_xt_entry_target *t;
 	const char *e = base;
 
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
@@ -577,12 +580,16 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * xt_check_entry_offsets - validate arp/ip/ip6t_entry
  *
  * @base: pointer to arp/ip/ip6t_entry
+ * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems
  * @target_offset: the arp/ip/ip6_t->target_offset
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
  * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
+ * This function does not validate the targets or matches themselves, it
+ * only tests that all the offsets and sizes are correct.
+ *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
  * - base to base + next_offset must be accessible, i.e. not exceed allocated
@@ -591,12 +598,18 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * Return: 0 on success, negative errno on failure.
  */
 int xt_check_entry_offsets(const void *base,
+			   const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct xt_entry_target *t;
 	const char *e = base;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 0188346f21e6546498c2a0f84888797ad4063fc5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:33 +0200
Subject: netfilter: x_tables: xt_compat_match_from_user doesn't need a retval

Always returned 0.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  2 +-
 net/ipv4/netfilter/arp_tables.c    | 17 +++++------------
 net/ipv4/netfilter/ip_tables.c     | 26 +++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    | 27 +++++++++------------------
 net/netfilter/x_tables.c           |  5 ++---
 5 files changed, 26 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 30cfb1e943fb..e2da9b90f1b8 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -484,7 +484,7 @@ void xt_compat_init_offsets(u_int8_t af, unsigned int number);
 int xt_compat_calc_jump(u_int8_t af, unsigned int offset);
 
 int xt_compat_match_offset(const struct xt_match *match);
-int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 			      unsigned int *size);
 int xt_compat_match_to_user(const struct xt_entry_match *m,
 			    void __user **dstptr, unsigned int *size);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1d1386dc159b..be514c676fad 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1310,7 +1310,7 @@ out:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1319,9 +1319,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 	struct xt_target *target;
 	struct arpt_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct arpt_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct arpt_entry));
@@ -1342,7 +1341,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int translate_compat_table(struct xt_table_info **pinfo,
@@ -1421,16 +1419,11 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	entry1 = newinfo->entries;
 	pos = entry1;
 	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
 	xt_compat_flush_offsets(NFPROTO_ARP);
 	xt_compat_unlock(NFPROTO_ARP);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d70418604503..5c20eef980c1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1568,7 +1568,7 @@ release_matches:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1577,10 +1577,9 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	struct xt_target *target;
 	struct ipt_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 	struct xt_entry_match *ematch;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct ipt_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct ipt_entry));
@@ -1589,11 +1588,9 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	*dstptr += sizeof(struct ipt_entry);
 	*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 
-	xt_ematch_foreach(ematch, e) {
-		ret = xt_compat_match_from_user(ematch, dstptr, size);
-		if (ret != 0)
-			return ret;
-	}
+	xt_ematch_foreach(ematch, e)
+		xt_compat_match_from_user(ematch, dstptr, size);
+
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = compat_ipt_get_target(e);
 	target = t->u.kernel.target;
@@ -1606,7 +1603,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int
@@ -1729,16 +1725,12 @@ translate_compat_table(struct net *net,
 	entry1 = newinfo->entries;
 	pos = entry1;
 	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
 	xt_compat_flush_offsets(AF_INET);
 	xt_compat_unlock(AF_INET);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 8d082c557771..620d54c1c119 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1580,7 +1580,7 @@ release_matches:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1588,10 +1588,9 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 	struct xt_entry_target *t;
 	struct ip6t_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 	struct xt_entry_match *ematch;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct ip6t_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct ip6t_entry));
@@ -1600,11 +1599,9 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 	*dstptr += sizeof(struct ip6t_entry);
 	*size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
 
-	xt_ematch_foreach(ematch, e) {
-		ret = xt_compat_match_from_user(ematch, dstptr, size);
-		if (ret != 0)
-			return ret;
-	}
+	xt_ematch_foreach(ematch, e)
+		xt_compat_match_from_user(ematch, dstptr, size);
+
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = compat_ip6t_get_target(e);
 	xt_compat_target_from_user(t, dstptr, size);
@@ -1616,7 +1613,6 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int compat_check_entry(struct ip6t_entry *e, struct net *net,
@@ -1737,17 +1733,12 @@ translate_compat_table(struct net *net,
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
-	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
 	xt_compat_flush_offsets(AF_INET6);
 	xt_compat_unlock(AF_INET6);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f9aa9715c32e..7e7173b68344 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -526,8 +526,8 @@ int xt_compat_match_offset(const struct xt_match *match)
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_offset);
 
-int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
-			      unsigned int *size)
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+			       unsigned int *size)
 {
 	const struct xt_match *match = m->u.kernel.match;
 	struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
@@ -549,7 +549,6 @@ int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 
 	*size += off;
 	*dstptr += msize;
-	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
 
-- 
cgit v1.2.3


From d7591f0c41ce3e67600a982bab6989ef0f07b3ce Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 15:37:59 +0200
Subject: netfilter: x_tables: introduce and use xt_copy_counters_from_user

The three variants use same copy&pasted code, condense this into a
helper and use that.

Make sure info.name is 0-terminated.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  3 ++
 net/ipv4/netfilter/arp_tables.c    | 48 +++----------------------
 net/ipv4/netfilter/ip_tables.c     | 48 +++----------------------
 net/ipv6/netfilter/ip6_tables.c    | 49 +++----------------------
 net/netfilter/x_tables.c           | 74 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index e2da9b90f1b8..4dd9306c9d56 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -251,6 +251,9 @@ int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
 		    bool inv_proto);
 
+void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
+				 struct xt_counters_info *info, bool compat);
+
 struct xt_table *xt_register_table(struct net *net,
 				   const struct xt_table *table,
 				   struct xt_table_info *bootstrap,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8cefb7a2606b..60f5161abcb4 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1123,55 +1123,17 @@ static int do_add_counters(struct net *net, const void __user *user,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	const char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct arpt_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
 
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
 
-	t = xt_find_table_lock(net, NFPROTO_ARP, name);
+	t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1179,7 +1141,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 9340ce0a7549..735d1ee8c1ab 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1307,55 +1307,17 @@ do_add_counters(struct net *net, const void __user *user,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	const char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct ipt_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
 
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
 
-	t = xt_find_table_lock(net, AF_INET, name);
+	t = xt_find_table_lock(net, AF_INET, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1363,7 +1325,7 @@ do_add_counters(struct net *net, const void __user *user,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index aa010856a255..73e606c719ef 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1319,55 +1319,16 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct ip6t_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
-
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
 
-	t = xt_find_table_lock(net, AF_INET6, name);
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
+	t = xt_find_table_lock(net, AF_INET6, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1375,7 +1336,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 9ec23ffa43b4..c69c892231d7 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -752,6 +752,80 @@ int xt_check_target(struct xt_tgchk_param *par,
 }
 EXPORT_SYMBOL_GPL(xt_check_target);
 
+/**
+ * xt_copy_counters_from_user - copy counters and metadata from userspace
+ *
+ * @user: src pointer to userspace memory
+ * @len: alleged size of userspace memory
+ * @info: where to store the xt_counters_info metadata
+ * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
+ *
+ * Copies counter meta data from @user and stores it in @info.
+ *
+ * vmallocs memory to hold the counters, then copies the counter data
+ * from @user to the new memory and returns a pointer to it.
+ *
+ * If @compat is true, @info gets converted automatically to the 64bit
+ * representation.
+ *
+ * The metadata associated with the counters is stored in @info.
+ *
+ * Return: returns pointer that caller has to test via IS_ERR().
+ * If IS_ERR is false, caller has to vfree the pointer.
+ */
+void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
+				 struct xt_counters_info *info, bool compat)
+{
+	void *mem;
+	u64 size;
+
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		/* structures only differ in size due to alignment */
+		struct compat_xt_counters_info compat_tmp;
+
+		if (len <= sizeof(compat_tmp))
+			return ERR_PTR(-EINVAL);
+
+		len -= sizeof(compat_tmp);
+		if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+			return ERR_PTR(-EFAULT);
+
+		strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+		info->num_counters = compat_tmp.num_counters;
+		user += sizeof(compat_tmp);
+	} else
+#endif
+	{
+		if (len <= sizeof(*info))
+			return ERR_PTR(-EINVAL);
+
+		len -= sizeof(*info);
+		if (copy_from_user(info, user, sizeof(*info)) != 0)
+			return ERR_PTR(-EFAULT);
+
+		info->name[sizeof(info->name) - 1] = '\0';
+		user += sizeof(*info);
+	}
+
+	size = sizeof(struct xt_counters);
+	size *= info->num_counters;
+
+	if (size != (u64)len)
+		return ERR_PTR(-EINVAL);
+
+	mem = vmalloc(len);
+	if (!mem)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(mem, user, len) == 0)
+		return mem;
+
+	vfree(mem);
+	return ERR_PTR(-EFAULT);
+}
+EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+
 #ifdef CONFIG_COMPAT
 int xt_compat_target_offset(const struct xt_target *target)
 {
-- 
cgit v1.2.3


From f9a7cbbf18f1640907d6ca345b8337e4b50ea56f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Fri, 8 Apr 2016 17:51:54 +0200
Subject: net: force inlining of netif_tx_start/stop_queue, sock_hold,
 __sock_put

Sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122
Arguably, gcc should do better, but gcc people aren't willing
to invest time into it, asking to use __always_inline instead.

With this .config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os,
the following functions get deinlined many times.

netif_tx_stop_queue: 207 copies, 590 calls:
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 80 8f e0 01 00 00 01 lock orb $0x1,0x1e0(%rdi)
	5d                      pop    %rbp
	c3                      retq

netif_tx_start_queue: 47 copies, 111 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 80 a7 e0 01 00 00 fe lock andb $0xfe,0x1e0(%rdi)
	5d                      pop    %rbp
	c3                      retq

sock_hold: 39 copies, 124 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 ff 87 80 00 00 00    lock incl 0x80(%rdi)
	5d                      pop    %rbp
	c3                      retq

__sock_put: 6 copies, 13 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 ff 8f 80 00 00 00    lock decl 0x80(%rdi)
	5d                      pop    %rbp
	c3                      retq

This patch fixes this via s/inline/__always_inline/.

Code size decrease after the patch is ~2.5k:

    text      data      bss       dec     hex filename
56719876  56364551 36196352 149280779 8e5d80b vmlinux_before
56717440  56364551 36196352 149278343 8e5ce87 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: David S. Miller <davem@davemloft.net>
CC: linux-kernel@vger.kernel.org
CC: netdev@vger.kernel.org
CC: netfilter-devel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 include/net/sock.h        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 166402ae3324..e906c6570b38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2787,7 +2787,7 @@ static inline void netif_tx_schedule_all(struct net_device *dev)
 		netif_schedule_queue(netdev_get_tx_queue(dev, i));
 }
 
-static inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
+static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
 {
 	clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
@@ -2837,7 +2837,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 	}
 }
 
-static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
+static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
 	set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index baba58770ac5..d997ec13a643 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -569,7 +569,7 @@ static inline bool __sk_del_node_init(struct sock *sk)
    modifications.
  */
 
-static inline void sock_hold(struct sock *sk)
+static __always_inline void sock_hold(struct sock *sk)
 {
 	atomic_inc(&sk->sk_refcnt);
 }
@@ -577,7 +577,7 @@ static inline void sock_hold(struct sock *sk)
 /* Ungrab socket in the context, which assumes that socket refcnt
    cannot hit zero, f.e. it is true in context of any socketcall.
  */
-static inline void __sock_put(struct sock *sk)
+static __always_inline void __sock_put(struct sock *sk)
 {
 	atomic_dec(&sk->sk_refcnt);
 }
-- 
cgit v1.2.3


From 743b03a83297690f0bd38c452a3bbb47d2be300a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 9 Apr 2016 11:29:58 -0700
Subject: net: remove netdevice gso_min_segs

After introduction of ndo_features_check(), we believe that very
specific checks for rare features should not be done in core
networking stack.

No driver uses gso_min_segs yet, so we revert this feature and save
few instructions per tx packet in fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 +---
 net/core/dev.c            | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e906c6570b38..9884fe9a6552 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1586,8 +1586,6 @@ enum netdev_priv_flags {
  *	@gso_max_size:	Maximum size of generic segmentation offload
  *	@gso_max_segs:	Maximum number of segments that can be passed to the
  *			NIC for GSO
- *	@gso_min_segs:	Minimum number of segments that can be passed to the
- *			NIC for GSO
  *
  *	@dcbnl_ops:	Data Center Bridging netlink ops
  *	@num_tc:	Number of traffic classes in the net device
@@ -1858,7 +1856,7 @@ struct net_device {
 	unsigned int		gso_max_size;
 #define GSO_MAX_SEGS		65535
 	u16			gso_max_segs;
-	u16			gso_min_segs;
+
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index d51343a821ed..09fb1ace9dc8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2831,7 +2831,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	netdev_features_t features = dev->features;
 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 
-	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
+	if (gso_segs > dev->gso_max_segs)
 		features &= ~NETIF_F_GSO_MASK;
 
 	/* If encapsulation offload request, verify we are testing
@@ -7429,7 +7429,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
-	dev->gso_min_segs = 0;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
-- 
cgit v1.2.3


From 95114344ea78649b1797d00ab6e88147bef66fa4 Mon Sep 17 00:00:00 2001
From: Rahul Verma <rahul.verma@qlogic.com>
Date: Sun, 10 Apr 2016 12:42:59 +0300
Subject: qed*: remove version dependency

Inbox drivers don't need versioning scheme in order to guarantee
compatibility, as both qed and qede are compiled from same codebase.

Signed-off-by: Rahul Verma <rahul.verma@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |  2 --
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  8 +-------
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 11 -----------
 drivers/net/ethernet/qlogic/qede/qede.h      |  2 --
 drivers/net/ethernet/qlogic/qede/qede_main.c | 11 +----------
 include/linux/qed/qed_eth_if.h               |  2 +-
 include/linux/qed/qed_if.h                   |  9 ---------
 7 files changed, 3 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index fcb8e9ba51d9..a3ee9df16dfe 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -507,6 +507,4 @@ u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
 
 int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
 
-#define QED_ETH_INTERFACE_VERSION       300
-
 #endif /* _QED_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 3f35c6ca9252..e848d5a1f7f6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2043,14 +2043,8 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 	.get_vport_stats = &qed_get_vport_stats,
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version)
+const struct qed_eth_ops *qed_get_eth_ops(void)
 {
-	if (version != QED_ETH_INTERFACE_VERSION) {
-		pr_notice("Cannot supply ethtool operations [%08x != %08x]\n",
-			  version, QED_ETH_INTERFACE_VERSION);
-		return NULL;
-	}
-
 	return &qed_eth_ops_pass;
 }
 EXPORT_SYMBOL(qed_get_eth_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 26d40db07ddd..c31d485f72d6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1172,14 +1172,3 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.chain_free = &qed_chain_free,
 	.set_led = &qed_set_led,
 };
-
-u32 qed_get_protocol_version(enum qed_protocol protocol)
-{
-	switch (protocol) {
-	case QED_PROTOCOL_ETH:
-		return QED_ETH_INTERFACE_VERSION;
-	default:
-		return 0;
-	}
-}
-EXPORT_SYMBOL(qed_get_protocol_version);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index d023251544d9..e0a696a57d4d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -32,8 +32,6 @@
 		__stringify(QEDE_REVISION_VERSION) "."		\
 		__stringify(QEDE_ENGINEERING_VERSION)
 
-#define QEDE_ETH_INTERFACE_VERSION	300
-
 #define DRV_MODULE_SYM		qede
 
 struct qede_stats {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 518af329502d..a55d93eb41fa 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -141,19 +141,10 @@ static
 int __init qede_init(void)
 {
 	int ret;
-	u32 qed_ver;
 
 	pr_notice("qede_init: %s\n", version);
 
-	qed_ver = qed_get_protocol_version(QED_PROTOCOL_ETH);
-	if (qed_ver !=  QEDE_ETH_INTERFACE_VERSION) {
-		pr_notice("Version mismatch [%08x != %08x]\n",
-			  qed_ver,
-			  QEDE_ETH_INTERFACE_VERSION);
-		return -EINVAL;
-	}
-
-	qed_ops = qed_get_eth_ops(QEDE_ETH_INTERFACE_VERSION);
+	qed_ops = qed_get_eth_ops();
 	if (!qed_ops) {
 		pr_notice("Failed to get qed ethtool operations\n");
 		return -EINVAL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e1d69834a11f..e00c8dbfc324 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -167,7 +167,7 @@ struct qed_eth_ops {
 				struct qed_eth_stats *stats);
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version);
+const struct qed_eth_ops *qed_get_eth_ops(void);
 void qed_put_eth_ops(void);
 
 #endif
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 1f7599c77cd4..b007011e1c82 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -271,15 +271,6 @@ struct qed_common_ops {
 		       enum qed_led_mode mode);
 };
 
-/**
- * @brief qed_get_protocol_version
- *
- * @param protocol
- *
- * @return version supported by qed for given protocol driver
- */
-u32 qed_get_protocol_version(enum qed_protocol protocol);
-
 #define MASK_FIELD(_name, _value) \
 	((_value) &= (_name ## _MASK))
 
-- 
cgit v1.2.3


From 8c5ebd0c792a097fcc0e526debbe0887ee378ae5 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Sun, 10 Apr 2016 12:43:00 +0300
Subject: qed: add Rx flow hash/indirection support.

Adds the required API for passing RSS-related configuration from qede.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c | 17 +----------------
 include/linux/qed/qed_eth_if.h           |  1 +
 include/linux/qed/qed_if.h               | 11 +++++++++++
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e848d5a1f7f6..5005497ee23e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -35,19 +35,6 @@
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 
-enum qed_rss_caps {
-	QED_RSS_IPV4		= 0x1,
-	QED_RSS_IPV6		= 0x2,
-	QED_RSS_IPV4_TCP	= 0x4,
-	QED_RSS_IPV6_TCP	= 0x8,
-	QED_RSS_IPV4_UDP	= 0x10,
-	QED_RSS_IPV6_UDP	= 0x20,
-};
-
-/* Should be the same as ETH_RSS_IND_TABLE_ENTRIES_NUM */
-#define QED_RSS_IND_TABLE_SIZE 128
-#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
-
 struct qed_rss_params {
 	u8	update_rss_config;
 	u8	rss_enable;
@@ -1744,9 +1731,7 @@ static int qed_update_vport(struct qed_dev *cdev,
 		sp_rss_params.update_rss_capabilities = 1;
 		sp_rss_params.update_rss_ind_table = 1;
 		sp_rss_params.update_rss_key = 1;
-		sp_rss_params.rss_caps = QED_RSS_IPV4 |
-					 QED_RSS_IPV6 |
-					 QED_RSS_IPV4_TCP | QED_RSS_IPV6_TCP;
+		sp_rss_params.rss_caps = params->rss_params.rss_caps;
 		sp_rss_params.rss_table_size_log = 7; /* 2^7 = 128 */
 		memcpy(sp_rss_params.rss_ind_table,
 		       params->rss_params.rss_ind_table,
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e00c8dbfc324..795c9902e02f 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -27,6 +27,7 @@ struct qed_dev_eth_info {
 struct qed_update_vport_rss_params {
 	u16	rss_ind_table[128];
 	u32	rss_key[10];
+	u8	rss_caps;
 };
 
 struct qed_update_vport_params {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index b007011e1c82..67e8c206b2c1 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -515,4 +515,15 @@ static inline void internal_ram_wr(void __iomem *addr,
 	__internal_ram_wr(NULL, addr, size, data);
 }
 
+enum qed_rss_caps {
+	QED_RSS_IPV4		= 0x1,
+	QED_RSS_IPV6		= 0x2,
+	QED_RSS_IPV4_TCP	= 0x4,
+	QED_RSS_IPV6_TCP	= 0x8,
+	QED_RSS_IPV4_UDP	= 0x10,
+	QED_RSS_IPV6_UDP	= 0x20,
+};
+
+#define QED_RSS_IND_TABLE_SIZE 128
+#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
 #endif
-- 
cgit v1.2.3


From cbc53e08a793b073e79f42ca33f1f3568703540d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:44:51 -0400
Subject: GSO: Add GSO type for fixed IPv4 ID

This patch adds support for TSO using IPv4 headers with a fixed IP ID
field.  This is meant to allow us to do a lossless GRO in the case of TCP
flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
headers.

In addition I am adding a feature that for now I am referring to TSO with
IP ID mangling.  Basically when this flag is enabled the device has the
option to either output the flow with incrementing IP IDs or with a fixed
IP ID regardless of what the original IP ID ordering was.  This is useful
in cases where the DF bit is set and we do not care if the original IP ID
value is maintained.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  3 +++
 include/linux/netdevice.h       |  1 +
 include/linux/skbuff.h          | 20 +++++++++++---------
 net/core/dev.c                  | 34 +++++++++++++++++++++++++++++-----
 net/core/ethtool.c              |  1 +
 net/ipv4/af_inet.c              | 19 +++++++++++--------
 net/ipv4/gre_offload.c          |  1 +
 net/ipv4/tcp_offload.c          |  4 +++-
 net/ipv6/ip6_offload.c          |  3 ++-
 net/mpls/mpls_gso.c             |  1 +
 10 files changed, 63 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..7cf272a4b5c8 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
 	NETIF_F_UFO_BIT,		/* ... UDPv4 fragmentation */
 	NETIF_F_GSO_ROBUST_BIT,		/* ... ->SKB_GSO_DODGY */
 	NETIF_F_TSO_ECN_BIT,		/* ... TCP ECN support */
+	NETIF_F_TSO_MANGLEID_BIT,	/* ... IPV4 ID mangling allowed */
 	NETIF_F_TSO6_BIT,		/* ... TCPv6 segmentation */
 	NETIF_F_FSO_BIT,		/* ... FCoE segmentation */
 	NETIF_F_GSO_GRE_BIT,		/* ... GRE with TSO */
@@ -120,6 +121,7 @@ enum {
 #define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +149,7 @@ enum {
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
+				 NETIF_F_TSO_MANGLEID | \
 				 NETIF_F_TSO6 | NETIF_F_UFO)
 
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9884fe9a6552..8e372d01b3c1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3992,6 +3992,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_UDP     != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..5fba16658f9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,25 @@ enum {
 	/* This indicates the tcp segment has CWR set. */
 	SKB_GSO_TCP_ECN = 1 << 3,
 
-	SKB_GSO_TCPV6 = 1 << 4,
+	SKB_GSO_TCP_FIXEDID = 1 << 4,
 
-	SKB_GSO_FCOE = 1 << 5,
+	SKB_GSO_TCPV6 = 1 << 5,
 
-	SKB_GSO_GRE = 1 << 6,
+	SKB_GSO_FCOE = 1 << 6,
 
-	SKB_GSO_GRE_CSUM = 1 << 7,
+	SKB_GSO_GRE = 1 << 7,
 
-	SKB_GSO_IPIP = 1 << 8,
+	SKB_GSO_GRE_CSUM = 1 << 8,
 
-	SKB_GSO_SIT = 1 << 9,
+	SKB_GSO_IPIP = 1 << 9,
 
-	SKB_GSO_UDP_TUNNEL = 1 << 10,
+	SKB_GSO_SIT = 1 << 10,
 
-	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+	SKB_GSO_UDP_TUNNEL = 1 << 11,
 
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/dev.c b/net/core/dev.c
index 09fb1ace9dc8..e896b1953ab6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2825,14 +2825,36 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb,
 	return vlan_features_check(skb, features);
 }
 
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+					    struct net_device *dev,
+					    netdev_features_t features)
+{
+	u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+	if (gso_segs > dev->gso_max_segs)
+		return features & ~NETIF_F_GSO_MASK;
+
+	/* Make sure to clear the IPv4 ID mangling feature if
+	 * the IPv4 header has the potential to be fragmented.
+	 */
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+		struct iphdr *iph = skb->encapsulation ?
+				    inner_ip_hdr(skb) : ip_hdr(skb);
+
+		if (!(iph->frag_off & htons(IP_DF)))
+			features &= ~NETIF_F_TSO_MANGLEID;
+	}
+
+	return features;
+}
+
 netdev_features_t netif_skb_features(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 	netdev_features_t features = dev->features;
-	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 
-	if (gso_segs > dev->gso_max_segs)
-		features &= ~NETIF_F_GSO_MASK;
+	if (skb_is_gso(skb))
+		features = gso_features_check(skb, dev, features);
 
 	/* If encapsulation offload request, verify we are testing
 	 * hardware encapsulation features instead of standard
@@ -6976,9 +6998,11 @@ int register_netdevice(struct net_device *dev)
 	dev->features |= NETIF_F_SOFT_FEATURES;
 	dev->wanted_features = dev->features & dev->hw_features;
 
-	if (!(dev->flags & IFF_LOOPBACK)) {
+	if (!(dev->flags & IFF_LOOPBACK))
 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
-	}
+
+	if (dev->hw_features & NETIF_F_TSO)
+		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 
 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 	 */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6a7f99661c2f..9494c41cc77c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,6 +79,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_UFO_BIT] =              "tx-udp-fragmentation",
 	[NETIF_F_GSO_ROBUST_BIT] =       "tx-gso-robust",
 	[NETIF_F_TSO_ECN_BIT] =          "tx-tcp-ecn-segmentation",
+	[NETIF_F_TSO_MANGLEID_BIT] =	 "tx-tcp-mangleid-segmentation",
 	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation",
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8217cd22f921..5bbea9a0ce96 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1195,10 +1195,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 					netdev_features_t features)
 {
+	bool udpfrag = false, fixedid = false, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
-	bool udpfrag, encap;
 	struct iphdr *iph;
 	int proto;
 	int nhoff;
@@ -1217,6 +1217,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TCP_FIXEDID |
 		       SKB_GSO_TUNNEL_REMCSUM |
 		       0)))
 		goto out;
@@ -1248,11 +1249,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
-	if (skb->encapsulation &&
-	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
-		udpfrag = proto == IPPROTO_UDP && encap;
-	else
-		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+	if (!skb->encapsulation || encap) {
+		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+		/* fixed ID is invalid if DF bit is not set */
+		if (fixedid && !(iph->frag_off & htons(IP_DF)))
+			goto out;
+	}
 
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment))
@@ -1265,12 +1269,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
 		if (udpfrag) {
-			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else {
+		} else if (!fixedid) {
 			iph->id = htons(id++);
 		}
 		iph->tot_len = htons(skb->len - nhoff);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a5bd4317866..6376b0cdf693 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -32,6 +32,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_UDP |
 				  SKB_GSO_DODGY |
 				  SKB_GSO_TCP_ECN |
+				  SKB_GSO_TCP_FIXEDID |
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
 				  SKB_GSO_IPIP |
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 773083b7f1e9..08dd25d835af 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -89,6 +89,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			     ~(SKB_GSO_TCPV4 |
 			       SKB_GSO_DODGY |
 			       SKB_GSO_TCP_ECN |
+			       SKB_GSO_TCP_FIXEDID |
 			       SKB_GSO_TCPV6 |
 			       SKB_GSO_GRE |
 			       SKB_GSO_GRE_CSUM |
@@ -98,7 +99,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_UDP_TUNNEL_CSUM |
 			       SKB_GSO_TUNNEL_REMCSUM |
 			       0) ||
-			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+			     !(type & (SKB_GSO_TCPV4 |
+				       SKB_GSO_TCPV6))))
 			goto out;
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 204af2219471..b3a779393d71 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -73,6 +73,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP |
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
+		       SKB_GSO_TCP_FIXEDID |
+		       SKB_GSO_TCPV6 |
 		       SKB_GSO_GRE |
 		       SKB_GSO_GRE_CSUM |
 		       SKB_GSO_IPIP |
@@ -80,7 +82,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_TCPV6 |
 		       0)))
 		goto out;
 
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 0183b32da942..bbcf60465e5c 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -31,6 +31,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_TCPV6 |
 				  SKB_GSO_UDP |
 				  SKB_GSO_DODGY |
+				  SKB_GSO_TCP_FIXEDID |
 				  SKB_GSO_TCP_ECN)))
 		goto out;
 
-- 
cgit v1.2.3


From 1530545ed64b42e87acb43c0c16401bd1ebae6bf Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:44:57 -0400
Subject: GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID
 values

This patch does two things.

First it allows TCP to aggregate TCP frames with a fixed IPv4 ID field.  As
a result we should now be able to aggregate flows that were converted from
IPv6 to IPv4.  In addition this allows us more flexibility for future
implementations of segmentation as we may be able to use a fixed IP ID when
segmenting the flow.

The second thing this does is that it places limitations on the outer IPv4
ID header in the case of tunneled frames.  Specifically it forces the IP ID
to be incrementing by 1 unless the DF bit is set in the outer IPv4 header.
This way we can avoid creating overlapping series of IP IDs that could
possibly be fragmented if the frame goes through GRO and is then
resegmented via GSO.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  5 ++++-
 net/core/dev.c            |  1 +
 net/ipv4/af_inet.c        | 35 ++++++++++++++++++++++++++++-------
 net/ipv4/tcp_offload.c    | 16 +++++++++++++++-
 net/ipv6/ip6_offload.c    |  8 ++++++--
 5 files changed, 54 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8e372d01b3c1..2d70c521d516 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2121,7 +2121,10 @@ struct napi_gro_cb {
 	/* Used in GRE, set in fou/gue_gro_receive */
 	u8	is_fou:1;
 
-	/* 6 bit hole */
+	/* Used to determine if flush_id can be ignored */
+	u8	is_atomic:1;
+
+	/* 5 bit hole */
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index e896b1953ab6..b78b586b1856 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4462,6 +4462,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->free = 0;
 		NAPI_GRO_CB(skb)->encap_mark = 0;
 		NAPI_GRO_CB(skb)->is_fou = 0;
+		NAPI_GRO_CB(skb)->is_atomic = 1;
 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
 		/* Setup for GRO checksum validation */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5bbea9a0ce96..8564cab96189 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,6 +1328,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 
 	for (p = *head; p; p = p->next) {
 		struct iphdr *iph2;
+		u16 flush_id;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
@@ -1351,16 +1352,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 			(iph->tos ^ iph2->tos) |
 			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
-		/* Save the IP ID check to be included later when we get to
-		 * the transport layer so only the inner most IP ID is checked.
-		 * This is because some GSO/TSO implementations do not
-		 * correctly increment the IP ID for the outer hdrs.
-		 */
-		NAPI_GRO_CB(p)->flush_id =
-			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 		NAPI_GRO_CB(p)->flush |= flush;
+
+		/* We need to store of the IP ID check to be included later
+		 * when we can verify that this packet does in fact belong
+		 * to a given flow.
+		 */
+		flush_id = (u16)(id - ntohs(iph2->id));
+
+		/* This bit of code makes it much easier for us to identify
+		 * the cases where we are doing atomic vs non-atomic IP ID
+		 * checks.  Specifically an atomic check can return IP ID
+		 * values 0 - 0xFFFF, while a non-atomic check can only
+		 * return 0 or 0xFFFF.
+		 */
+		if (!NAPI_GRO_CB(p)->is_atomic ||
+		    !(iph->frag_off & htons(IP_DF))) {
+			flush_id ^= NAPI_GRO_CB(p)->count;
+			flush_id = flush_id ? 0xFFFF : 0;
+		}
+
+		/* If the previous IP ID value was based on an atomic
+		 * datagram we can overwrite the value and ignore it.
+		 */
+		if (NAPI_GRO_CB(skb)->is_atomic)
+			NAPI_GRO_CB(p)->flush_id = flush_id;
+		else
+			NAPI_GRO_CB(p)->flush_id |= flush_id;
 	}
 
+	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
 	NAPI_GRO_CB(skb)->flush |= flush;
 	skb_set_network_header(skb, off);
 	/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 08dd25d835af..d1ffd55289bd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -239,7 +239,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 found:
 	/* Include the IP ID check below from the inner most IP hdr */
-	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+	flush = NAPI_GRO_CB(p)->flush;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
 		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -248,6 +248,17 @@ found:
 		flush |= *(u32 *)((u8 *)th + i) ^
 			 *(u32 *)((u8 *)th2 + i);
 
+	/* When we receive our second frame we can made a decision on if we
+	 * continue this flow as an atomic flow with a fixed ID or if we use
+	 * an incrementing ID.
+	 */
+	if (NAPI_GRO_CB(p)->flush_id != 1 ||
+	    NAPI_GRO_CB(p)->count != 1 ||
+	    !NAPI_GRO_CB(p)->is_atomic)
+		flush |= NAPI_GRO_CB(p)->flush_id;
+	else
+		NAPI_GRO_CB(p)->is_atomic = false;
+
 	mss = skb_shinfo(p)->gso_size;
 
 	flush |= (len - 1) >= mss;
@@ -316,6 +327,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 				  iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
 
+	if (NAPI_GRO_CB(skb)->is_atomic)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
 	return tcp_gro_complete(skb);
 }
 
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index b3a779393d71..061adcda65f3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -240,10 +240,14 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
 		NAPI_GRO_CB(p)->flush |= flush;
 
-		/* Clear flush_id, there's really no concept of ID in IPv6. */
-		NAPI_GRO_CB(p)->flush_id = 0;
+		/* If the previous IP ID value was based on an atomic
+		 * datagram we can overwrite the value and ignore it.
+		 */
+		if (NAPI_GRO_CB(skb)->is_atomic)
+			NAPI_GRO_CB(p)->flush_id = 0;
 	}
 
+	NAPI_GRO_CB(skb)->is_atomic = true;
 	NAPI_GRO_CB(skb)->flush |= flush;
 
 	skb_gro_postpull_rcsum(skb, iph, nlen);
-- 
cgit v1.2.3


From 802ab55adc39a06940a1b384e9fd0387fc762d7e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:45:03 -0400
Subject: GSO: Support partial segmentation offload

This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  5 +++++
 include/linux/netdevice.h       |  2 ++
 include/linux/skbuff.h          |  9 +++++++--
 net/core/dev.c                  | 36 +++++++++++++++++++++++++++++++++---
 net/core/ethtool.c              |  1 +
 net/core/skbuff.c               | 29 ++++++++++++++++++++++++++++-
 net/ipv4/af_inet.c              | 20 ++++++++++++++++----
 net/ipv4/gre_offload.c          | 26 +++++++++++++++++++++-----
 net/ipv4/tcp_offload.c          | 10 ++++++++--
 net/ipv4/udp_offload.c          | 27 +++++++++++++++++++++------
 net/ipv6/ip6_offload.c          | 10 +++++++++-
 11 files changed, 151 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7cf272a4b5c8..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
 	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+	NETIF_F_GSO_PARTIAL_BIT,	/* ... Only segment inner-most L4
+					 *     in hardware and all other
+					 *     headers in software.
+					 */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
 		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL	 __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d70c521d516..a3bb534576a3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1654,6 +1654,7 @@ struct net_device {
 	netdev_features_t	vlan_features;
 	netdev_features_t	hw_enc_features;
 	netdev_features_t	mpls_features;
+	netdev_features_t	gso_partial_features;
 
 	int			ifindex;
 	int			group;
@@ -4004,6 +4005,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
 	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+	SKB_GSO_PARTIAL = 1 << 13,
+
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-	int	mac_offset;
+	union {
+		int	mac_offset;
+		int	data_offset;
+	};
 	int	encap_level;
 	__wsum	csum;
 	__u16	csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index b78b586b1856..556dd09af3b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 			return ERR_PTR(err);
 	}
 
+	/* Only report GSO partial support if it will enable us to
+	 * support segmentation on this frame without needing additional
+	 * work.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+		struct net_device *dev = skb->dev;
+
+		partial_features |= dev->features & dev->gso_partial_features;
+		if (!skb_gso_ok(skb, features | partial_features))
+			features &= ~NETIF_F_GSO_PARTIAL;
+	}
+
 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 
@@ -2834,8 +2847,17 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
 	if (gso_segs > dev->gso_max_segs)
 		return features & ~NETIF_F_GSO_MASK;
 
-	/* Make sure to clear the IPv4 ID mangling feature if
-	 * the IPv4 header has the potential to be fragmented.
+	/* Support for GSO partial features requires software
+	 * intervention before we can actually process the packets
+	 * so we need to strip support for any partial features now
+	 * and we can pull them back in after we have partially
+	 * segmented the frame.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+		features &= ~dev->gso_partial_features;
+
+	/* Make sure to clear the IPv4 ID mangling feature if the
+	 * IPv4 header has the potential to be fragmented.
 	 */
 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 		struct iphdr *iph = skb->encapsulation ?
@@ -6729,6 +6751,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
+	/* GSO partial features require GSO partial be set */
+	if ((features & dev->gso_partial_features) &&
+	    !(features & NETIF_F_GSO_PARTIAL)) {
+		netdev_dbg(dev,
+			   "Dropping partially supported GSO features since no GSO partial.\n");
+		features &= ~dev->gso_partial_features;
+	}
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	if (dev->netdev_ops->ndo_busy_poll)
 		features |= NETIF_F_BUSY_POLL;
@@ -7011,7 +7041,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* Make NETIF_F_SG inheritable to tunnel devices.
 	 */
-	dev->hw_enc_features |= NETIF_F_SG;
+	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 
 	/* Make NETIF_F_SG inheritable to MPLS.
 	 */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9494c41cc77c..e0cf20a3b3dd 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -88,6 +88,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04c2d1c8c87..4cc594cdaada 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3076,8 +3076,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	struct sk_buff *frag_skb = head_skb;
 	unsigned int offset = doffset;
 	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+	unsigned int partial_segs = 0;
 	unsigned int headroom;
-	unsigned int len;
+	unsigned int len = head_skb->len;
 	__be16 proto;
 	bool csum;
 	int sg = !!(features & NETIF_F_SG);
@@ -3094,6 +3095,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 	csum = !!can_checksum_protocol(features, proto);
 
+	/* GSO partial only requires that we trim off any excess that
+	 * doesn't fit into an MSS sized block, so take care of that
+	 * now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		partial_segs = len / mss;
+		mss *= partial_segs;
+	}
+
 	headroom = skb_headroom(head_skb);
 	pos = skb_headlen(head_skb);
 
@@ -3281,6 +3291,23 @@ perform_csum_check:
 	 */
 	segs->prev = tail;
 
+	/* Update GSO info on first skb in partial sequence. */
+	if (partial_segs) {
+		int type = skb_shinfo(head_skb)->gso_type;
+
+		/* Update type to add partial and then remove dodgy if set */
+		type |= SKB_GSO_PARTIAL;
+		type &= ~SKB_GSO_DODGY;
+
+		/* Update GSO info and prepare to start updating headers on
+		 * our way back down the stack of protocols.
+		 */
+		skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+		skb_shinfo(segs)->gso_segs = partial_segs;
+		skb_shinfo(segs)->gso_type = type;
+		SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+	}
+
 	/* Following permits correct backpressure, for protocols
 	 * using skb_set_owner_w().
 	 * Idea is to tranfert ownership from head_skb to last segment.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8564cab96189..2e6e65fc4d20 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1200,7 +1200,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	const struct net_offload *ops;
 	unsigned int offset = 0;
 	struct iphdr *iph;
-	int proto;
+	int proto, tot_len;
 	int nhoff;
 	int ihl;
 	int id;
@@ -1219,6 +1219,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TCP_FIXEDID |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -1273,10 +1274,21 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else if (!fixedid) {
-			iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
+		} else if (skb_is_gso(skb)) {
+			if (!fixedid) {
+				iph->id = htons(id);
+				id += skb_shinfo(skb)->gso_segs;
+			}
+			tot_len = skb_shinfo(skb)->gso_size +
+				  SKB_GSO_CB(skb)->data_offset +
+				  skb->head - (unsigned char *)iph;
+		} else {
+			if (!fixedid)
+				iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
 		}
-		iph->tot_len = htons(skb->len - nhoff);
+		iph->tot_len = htons(tot_len);
 		ip_send_check(iph);
 		if (encap)
 			skb_reset_inner_headers(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6376b0cdf693..20557f211408 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
 				  SKB_GSO_IPIP |
-				  SKB_GSO_SIT)))
+				  SKB_GSO_SIT |
+				  SKB_GSO_PARTIAL)))
 		goto out;
 
 	if (!skb->encapsulation)
@@ -87,7 +88,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		struct gre_base_hdr *greh;
-		__be32 *pcsum;
+		__sum16 *pcsum;
 
 		/* Set up inner headers if we are offloading inner checksum */
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -107,10 +108,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 			continue;
 
 		greh = (struct gre_base_hdr *)skb_transport_header(skb);
-		pcsum = (__be32 *)(greh + 1);
+		pcsum = (__sum16 *)(greh + 1);
+
+		if (skb_is_gso(skb)) {
+			unsigned int partial_adj;
+
+			/* Adjust checksum to account for the fact that
+			 * the partial checksum is based on actual size
+			 * whereas headers should be based on MSS size.
+			 */
+			partial_adj = skb->len + skb_headroom(skb) -
+				      SKB_GSO_CB(skb)->data_offset -
+				      skb_shinfo(skb)->gso_size;
+			*pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+		} else {
+			*pcsum = 0;
+		}
 
-		*pcsum = 0;
-		*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+		*(pcsum + 1) = 0;
+		*pcsum = gso_make_checksum(skb, 0);
 	} while ((skb = skb->next));
 out:
 	return segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index d1ffd55289bd..02737b607aa7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -109,6 +109,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	/* GSO partial only requires splitting the frame into an MSS
+	 * multiple and possibly a remainder.  So update the mss now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL)
+		mss = skb->len - (skb->len % mss);
+
 	copy_destructor = gso_skb->destructor == tcp_wfree;
 	ooo_okay = gso_skb->ooo_okay;
 	/* All segments but the first should have ooo_okay cleared */
@@ -133,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
 					       (__force u32)delta));
 
-	do {
+	while (skb->next) {
 		th->fin = th->psh = 0;
 		th->check = newcheck;
 
@@ -153,7 +159,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 		th->seq = htonl(seq);
 		th->cwr = 0;
-	} while (skb->next);
+	}
 
 	/* Following permits TCP Small Queues to work well with GSO :
 	 * The callback to TCP stack will be called at the time last frag
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6230cf4b0d2d..097060def7f0 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -39,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	 * 16 bit length field due to the header being added outside of an
 	 * IP or IPv6 frame that was already limited to 64K - 1.
 	 */
-	partial = csum_sub(csum_unfold(uh->check),
-			   (__force __wsum)htonl(skb->len));
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+		partial = (__force __wsum)uh->len;
+	else
+		partial = (__force __wsum)htonl(skb->len);
+	partial = csum_sub(csum_unfold(uh->check), partial);
 
 	/* setup inner skb. */
 	skb->encapsulation = 0;
@@ -89,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
 	do {
-		__be16 len;
+		unsigned int len;
 
 		if (remcsum)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -107,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb_reset_mac_header(skb);
 		skb_set_network_header(skb, mac_len);
 		skb_set_transport_header(skb, udp_offset);
-		len = htons(skb->len - udp_offset);
+		len = skb->len - udp_offset;
 		uh = udp_hdr(skb);
-		uh->len = len;
+
+		/* If we are only performing partial GSO the inner header
+		 * will be using a length value equal to only one MSS sized
+		 * segment instead of the entire frame.
+		 */
+		if (skb_is_gso(skb)) {
+			uh->len = htons(skb_shinfo(skb)->gso_size +
+					SKB_GSO_CB(skb)->data_offset +
+					skb->head - (unsigned char *)uh);
+		} else {
+			uh->len = htons(len);
+		}
 
 		if (!need_csum)
 			continue;
 
-		uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+		uh->check = ~csum_fold(csum_add(partial,
+				       (__force __wsum)htonl(len)));
 
 		if (skb->encapsulation || !offload_csum) {
 			uh->check = gso_make_checksum(skb, ~uh->check);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 061adcda65f3..f5eb184e1093 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,6 +63,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int proto;
 	struct frag_hdr *fptr;
 	unsigned int unfrag_ip6hlen;
+	unsigned int payload_len;
 	u8 *prevhdr;
 	int offset = 0;
 	bool encap, udpfrag;
@@ -82,6 +83,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -118,7 +120,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	for (skb = segs; skb; skb = skb->next) {
 		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-		ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+		if (skb_is_gso(skb))
+			payload_len = skb_shinfo(skb)->gso_size +
+				      SKB_GSO_CB(skb)->data_offset +
+				      skb->head - (unsigned char *)(ipv6h + 1);
+		else
+			payload_len = skb->len - nhoff - sizeof(*ipv6h);
+		ipv6h->payload_len = htons(payload_len);
 		skb->network_header = (u8 *)ipv6h - skb->head;
 
 		if (udpfrag) {
-- 
cgit v1.2.3


From 435faee1aae9c1ac231f89e4faf0437bfe29f425 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:51 +0200
Subject: bpf, verifier: add ARG_PTR_TO_RAW_STACK type

When passing buffers from eBPF stack space into a helper function, we have
ARG_PTR_TO_STACK argument type for helpers available. The verifier makes sure
that such buffers are initialized, within boundaries, etc.

However, the downside with this is that we have a couple of helper functions
such as bpf_skb_load_bytes() that fill out the passed buffer in the expected
success case anyway, so zero initializing them prior to the helper call is
unneeded/wasted instructions in the eBPF program that can be avoided.

Therefore, add a new helper function argument type called ARG_PTR_TO_RAW_STACK.
The idea is to skip the STACK_MISC check in check_stack_boundary() and color
the related stack slots as STACK_MISC after we checked all call arguments.

Helper functions using ARG_PTR_TO_RAW_STACK must make sure that every path of
the helper function will fill the provided buffer area, so that we cannot leak
any uninitialized stack memory. This f.e. means that error paths need to
memset() the buffers, but the expected fast-path doesn't have to do this
anymore.

Since there's no such helper needing more than at most one ARG_PTR_TO_RAW_STACK
argument, we can keep it simple and don't need to check for multiple areas.
Should in future such a use-case really appear, we have check_raw_mode() that
will make sure we implement support for it first.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  5 +++++
 kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b2365a6eba3d..5fb3c610fa96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ enum bpf_arg_type {
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
+	ARG_PTR_TO_RAW_STACK,	/* any pointer to eBPF program stack, area does not
+				 * need to be initialized, helper function must fill
+				 * all bytes or clear them in error case.
+				 */
+
 	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
 	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 202f8f738542..9c843a5417da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -207,6 +207,9 @@ struct verifier_env {
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
+	bool raw_mode;
+	int regno;
+	int access_size;
 };
 
 /* verbose verifier prints what it's seeing
@@ -789,7 +792,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
  * and all elements of stack are initialized
  */
 static int check_stack_boundary(struct verifier_env *env, int regno,
-				int access_size, bool zero_size_allowed)
+				int access_size, bool zero_size_allowed,
+				struct bpf_call_arg_meta *meta)
 {
 	struct verifier_state *state = &env->cur_state;
 	struct reg_state *regs = state->regs;
@@ -815,6 +819,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (meta && meta->raw_mode) {
+		meta->access_size = access_size;
+		meta->regno = regno;
+		return 0;
+	}
+
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -859,7 +869,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_MAP;
 	} else if (arg_type == ARG_PTR_TO_CTX) {
 		expected_type = PTR_TO_CTX;
-	} else if (arg_type == ARG_PTR_TO_STACK) {
+	} else if (arg_type == ARG_PTR_TO_STACK ||
+		   arg_type == ARG_PTR_TO_RAW_STACK) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -867,6 +878,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		 */
 		if (reg->type == CONST_IMM && reg->imm == 0)
 			expected_type = CONST_IMM;
+		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -896,7 +908,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
-					   false);
+					   false, NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -907,7 +919,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno,
-					   meta->map_ptr->value_size, false);
+					   meta->map_ptr->value_size,
+					   false, NULL);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -922,7 +935,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno - 1, reg->imm,
-					   zero_size_allowed);
+					   zero_size_allowed, meta);
 	}
 
 	return err;
@@ -953,6 +966,24 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	return 0;
 }
 
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+
+	return count > 1 ? -EINVAL : 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -984,6 +1015,15 @@ static int check_call(struct verifier_env *env, int func_id)
 
 	memset(&meta, 0, sizeof(meta));
 
+	/* We only support one arg being in raw mode at the moment, which
+	 * is sufficient for the helper functions we have right now.
+	 */
+	err = check_raw_mode(fn);
+	if (err) {
+		verbose("kernel subsystem misconfigured func %d\n", func_id);
+		return err;
+	}
+
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -1001,6 +1041,15 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	/* Mark slots with STACK_MISC in case of raw mode, stack offset
+	 * is inferred from register state.
+	 */
+	for (i = 0; i < meta.access_size; i++) {
+		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		if (err)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		reg = regs + caller_saved[i];
-- 
cgit v1.2.3


From 464f664501816ef5fbbc00b8de96f4ae5a1c9325 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@qlogic.com>
Date: Thu, 14 Apr 2016 01:38:29 -0400
Subject: qed: Add infrastructure support for tunneling

This patch adds various structure/APIs needed to configure/enable different
tunnel [VXLAN/GRE/GENEVE] parameters on the adapter.

Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: Ariel Elior <Ariel.Elior@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h              |  46 ++++
 drivers/net/ethernet/qlogic/qed/qed_dev.c          |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h      |   2 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h          |  51 ++++-
 .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c    | 127 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_l2.c           |  31 +++
 drivers/net/ethernet/qlogic/qed/qed_main.c         |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h     |  31 +++
 drivers/net/ethernet/qlogic/qed/qed_sp.h           |   7 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c  | 254 +++++++++++++++++++++
 include/linux/qed/qed_eth_if.h                     |  10 +
 11 files changed, 563 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 0f0d2d1d77e5..33e2ed60c18f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -74,6 +74,51 @@ struct qed_rt_data {
 	bool	*b_valid;
 };
 
+enum qed_tunn_mode {
+	QED_MODE_L2GENEVE_TUNN,
+	QED_MODE_IPGENEVE_TUNN,
+	QED_MODE_L2GRE_TUNN,
+	QED_MODE_IPGRE_TUNN,
+	QED_MODE_VXLAN_TUNN,
+};
+
+enum qed_tunn_clss {
+	QED_TUNN_CLSS_MAC_VLAN,
+	QED_TUNN_CLSS_MAC_VNI,
+	QED_TUNN_CLSS_INNER_MAC_VLAN,
+	QED_TUNN_CLSS_INNER_MAC_VNI,
+	MAX_QED_TUNN_CLSS,
+};
+
+struct qed_tunn_start_params {
+	unsigned long	tunn_mode;
+	u16		vxlan_udp_port;
+	u16		geneve_udp_port;
+	u8		update_vxlan_udp_port;
+	u8		update_geneve_udp_port;
+	u8		tunn_clss_vxlan;
+	u8		tunn_clss_l2geneve;
+	u8		tunn_clss_ipgeneve;
+	u8		tunn_clss_l2gre;
+	u8		tunn_clss_ipgre;
+};
+
+struct qed_tunn_update_params {
+	unsigned long	tunn_mode_update_mask;
+	unsigned long	tunn_mode;
+	u16		vxlan_udp_port;
+	u16		geneve_udp_port;
+	u8		update_rx_pf_clss;
+	u8		update_tx_pf_clss;
+	u8		update_vxlan_udp_port;
+	u8		update_geneve_udp_port;
+	u8		tunn_clss_vxlan;
+	u8		tunn_clss_l2geneve;
+	u8		tunn_clss_ipgeneve;
+	u8		tunn_clss_l2gre;
+	u8		tunn_clss_ipgre;
+};
+
 /* The PCI personality is not quite synonymous to protocol ID:
  * 1. All personalities need CORE connections
  * 2. The Ethernet personality may support also the RoCE protocol
@@ -430,6 +475,7 @@ struct qed_dev {
 	u8				num_hwfns;
 	struct qed_hwfn			hwfns[MAX_HWFNS_PER_DEVICE];
 
+	unsigned long			tunn_mode;
 	u32				drv_type;
 
 	struct qed_eth_stats		*reset_stats;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index b7d100f6bd6f..bdae5a55afa4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -558,6 +558,7 @@ static int qed_hw_init_port(struct qed_hwfn *p_hwfn,
 
 static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt,
+			  struct qed_tunn_start_params *p_tunn,
 			  int hw_mode,
 			  bool b_hw_start,
 			  enum qed_int_mode int_mode,
@@ -625,7 +626,7 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		qed_int_igu_enable(p_hwfn, p_ptt, int_mode);
 
 		/* send function start command */
-		rc = qed_sp_pf_start(p_hwfn, p_hwfn->cdev->mf_mode);
+		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode);
 		if (rc)
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
 	}
@@ -672,6 +673,7 @@ static void qed_reset_mb_shadow(struct qed_hwfn *p_hwfn,
 }
 
 int qed_hw_init(struct qed_dev *cdev,
+		struct qed_tunn_start_params *p_tunn,
 		bool b_hw_start,
 		enum qed_int_mode int_mode,
 		bool allow_npar_tx_switch,
@@ -724,7 +726,7 @@ int qed_hw_init(struct qed_dev *cdev,
 		/* Fall into */
 		case FW_MSG_CODE_DRV_LOAD_FUNCTION:
 			rc = qed_hw_init_pf(p_hwfn, p_hwfn->p_main_ptt,
-					    p_hwfn->hw_info.hw_mode,
+					    p_tunn, p_hwfn->hw_info.hw_mode,
 					    b_hw_start, int_mode,
 					    allow_npar_tx_switch);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index d6c7ddf4f4d4..6aac3f855aa1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -62,6 +62,7 @@ void qed_resc_setup(struct qed_dev *cdev);
  * @brief qed_hw_init -
  *
  * @param cdev
+ * @param p_tunn
  * @param b_hw_start
  * @param int_mode - interrupt mode [msix, inta, etc.] to use.
  * @param allow_npar_tx_switch - npar tx switching to be used
@@ -72,6 +73,7 @@ void qed_resc_setup(struct qed_dev *cdev);
  * @return int
  */
 int qed_hw_init(struct qed_dev *cdev,
+		struct qed_tunn_start_params *p_tunn,
 		bool b_hw_start,
 		enum qed_int_mode int_mode,
 		bool allow_npar_tx_switch,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index a368f5e71d95..15e02ab9be5a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -46,7 +46,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
 	COMMON_RAMROD_RESERVED,
 	COMMON_RAMROD_RESERVED2,
-	COMMON_RAMROD_RESERVED3,
+	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
 	MAX_COMMON_RAMROD_CMD_ID
 };
@@ -626,6 +626,42 @@ struct pf_start_ramrod_data {
 	u8				reserved0[4];
 };
 
+/* tunnel configuration */
+struct pf_update_tunnel_config {
+	u8	update_rx_pf_clss;
+	u8	update_tx_pf_clss;
+	u8	set_vxlan_udp_port_flg;
+	u8	set_geneve_udp_port_flg;
+	u8	tx_enable_vxlan;
+	u8	tx_enable_l2geneve;
+	u8	tx_enable_ipgeneve;
+	u8	tx_enable_l2gre;
+	u8	tx_enable_ipgre;
+	u8	tunnel_clss_vxlan;
+	u8	tunnel_clss_l2geneve;
+	u8	tunnel_clss_ipgeneve;
+	u8	tunnel_clss_l2gre;
+	u8	tunnel_clss_ipgre;
+	__le16	vxlan_udp_port;
+	__le16	geneve_udp_port;
+	__le16	reserved[3];
+};
+
+struct pf_update_ramrod_data {
+	u32				reserved[2];
+	u32				reserved_1[6];
+	struct pf_update_tunnel_config	tunnel_config;
+};
+
+/* Tunnel classification scheme */
+enum tunnel_clss {
+	TUNNEL_CLSS_MAC_VLAN = 0,
+	TUNNEL_CLSS_MAC_VNI,
+	TUNNEL_CLSS_INNER_MAC_VLAN,
+	TUNNEL_CLSS_INNER_MAC_VNI,
+	MAX_TUNNEL_CLSS
+};
+
 enum ports_mode {
 	ENGX2_PORTX1 /* 2 engines x 1 port */,
 	ENGX2_PORTX2 /* 2 engines x 2 ports */,
@@ -1603,6 +1639,19 @@ bool qed_send_qm_stop_cmd(struct qed_hwfn	*p_hwfn,
 			  u16			start_pq,
 			  u16			num_pqs);
 
+void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt  *p_ptt, u16 dest_port);
+void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, bool vxlan_enable);
+void qed_set_gre_enable(struct qed_hwfn *p_hwfn,
+			struct qed_ptt  *p_ptt, bool eth_gre_enable,
+			bool ip_gre_enable);
+void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt, u16 dest_port);
+void qed_set_geneve_enable(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, bool eth_geneve_enable,
+			   bool ip_geneve_enable);
+
 /* Ystorm flow control mode. Use enum fw_flow_ctrl_mode */
 #define YSTORM_FLOW_CONTROL_MODE_OFFSET  (IRO[0].base)
 #define YSTORM_FLOW_CONTROL_MODE_SIZE    (IRO[0].size)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
index f55ebdc3c832..1dd53248b984 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
@@ -788,3 +788,130 @@ bool qed_send_qm_stop_cmd(struct qed_hwfn *p_hwfn,
 
 	return true;
 }
+
+static void
+qed_set_tunnel_type_enable_bit(unsigned long *var, int bit, bool enable)
+{
+	if (enable)
+		set_bit(bit, var);
+	else
+		clear_bit(bit, var);
+}
+
+#define PRS_ETH_TUNN_FIC_FORMAT	-188897008
+
+void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt,
+			     u16 dest_port)
+{
+	qed_wr(p_hwfn, p_ptt, PRS_REG_VXLAN_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_VXLAN_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_VXLAN_PORT, dest_port);
+}
+
+void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt,
+			  bool vxlan_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_VXLAN_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, vxlan_enable);
+
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE);
+	shift = NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, vxlan_enable);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val);
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN,
+	       vxlan_enable ? 1 : 0);
+}
+
+void qed_set_gre_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			bool eth_gre_enable, bool ip_gre_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_gre_enable);
+
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_gre_enable);
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE);
+	shift = NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_gre_enable);
+
+	shift = NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_gre_enable);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val);
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN,
+	       eth_gre_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN,
+	       ip_gre_enable ? 1 : 0);
+}
+
+void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u16 dest_port)
+{
+	qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_PORT, dest_port);
+}
+
+void qed_set_geneve_enable(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt,
+			   bool eth_geneve_enable,
+			   bool ip_geneve_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GENEVE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_geneve_enable);
+
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GENEVE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_geneve_enable);
+
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_ETH_ENABLE,
+	       eth_geneve_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_IP_ENABLE, ip_geneve_enable ? 1 : 0);
+
+	/* comp ver */
+	reg_val = (ip_geneve_enable || eth_geneve_enable) ? 1 : 0;
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_COMP_VER, reg_val);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_COMP_VER, reg_val);
+	qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_COMP_VER, reg_val);
+
+	/* EDPM with geneve tunnel not supported in BB_B0 */
+	if (QED_IS_BB_B0(p_hwfn->cdev))
+		return;
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN,
+	       eth_geneve_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN,
+	       ip_geneve_enable ? 1 : 0);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 5005497ee23e..fb5f3b815340 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1884,6 +1884,36 @@ static int qed_stop_txq(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_tunn_configure(struct qed_dev *cdev,
+			      struct qed_tunn_params *tunn_params)
+{
+	struct qed_tunn_update_params tunn_info;
+	int i, rc;
+
+	memset(&tunn_info, 0, sizeof(tunn_info));
+	if (tunn_params->update_vxlan_port == 1) {
+		tunn_info.update_vxlan_udp_port = 1;
+		tunn_info.vxlan_udp_port = tunn_params->vxlan_port;
+	}
+
+	if (tunn_params->update_geneve_port == 1) {
+		tunn_info.update_geneve_udp_port = 1;
+		tunn_info.geneve_udp_port = tunn_params->geneve_port;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+
+		rc = qed_sp_pf_update_tunn_cfg(hwfn, &tunn_info,
+					       QED_SPQ_MODE_EBLOCK, NULL);
+
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
 static int qed_configure_filter_rx_mode(struct qed_dev *cdev,
 					enum qed_filter_rx_mode_type type)
 {
@@ -2026,6 +2056,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 	.fastpath_stop = &qed_fastpath_stop,
 	.eth_cqe_completion = &qed_fp_cqe_completion,
 	.get_vport_stats = &qed_get_vport_stats,
+	.tunn_config = &qed_tunn_configure,
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c31d485f72d6..1916992ae8b1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -776,7 +776,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	/* Start the slowpath */
 	data = cdev->firmware->data;
 
-	rc = qed_hw_init(cdev, true, cdev->int_params.out.int_mode,
+	rc = qed_hw_init(cdev, NULL, true, cdev->int_params.out.int_mode,
 			 true, data);
 	if (rc)
 		goto err2;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index c15b1622e636..55451a4dc587 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -427,4 +427,35 @@
 	0x2aae60UL
 #define PGLUE_B_REG_PF_BAR1_SIZE \
 	0x2aae64UL
+#define PRS_REG_ENCAPSULATION_TYPE_EN	0x1f0730UL
+#define PRS_REG_GRE_PROTOCOL		0x1f0734UL
+#define PRS_REG_VXLAN_PORT		0x1f0738UL
+#define PRS_REG_OUTPUT_FORMAT_4_0	0x1f099cUL
+#define NIG_REG_ENC_TYPE_ENABLE		0x501058UL
+
+#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE		(0x1 << 0)
+#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT	0
+#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE		(0x1 << 1)
+#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT	1
+#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE			(0x1 << 2)
+#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT		2
+
+#define NIG_REG_VXLAN_PORT		0x50105cUL
+#define PBF_REG_VXLAN_PORT		0xd80518UL
+#define PBF_REG_NGE_PORT		0xd8051cUL
+#define PRS_REG_NGE_PORT		0x1f086cUL
+#define NIG_REG_NGE_PORT		0x508b38UL
+
+#define DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN	0x10090cUL
+#define DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN	0x100910UL
+#define DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN	0x100914UL
+#define DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN	0x10092cUL
+#define DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN	0x100930UL
+
+#define NIG_REG_NGE_IP_ENABLE			0x508b28UL
+#define NIG_REG_NGE_ETH_ENABLE			0x508b2cUL
+#define NIG_REG_NGE_COMP_VER			0x508b30UL
+#define PBF_REG_NGE_COMP_VER			0xd80524UL
+#define PRS_REG_NGE_COMP_VER			0x1f0878UL
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index d39f914b66ee..4b91cb32f317 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -52,6 +52,7 @@ int qed_eth_cqe_completion(struct qed_hwfn *p_hwfn,
 
 union ramrod_data {
 	struct pf_start_ramrod_data pf_start;
+	struct pf_update_ramrod_data pf_update;
 	struct rx_queue_start_ramrod_data rx_queue_start;
 	struct rx_queue_update_ramrod_data rx_queue_update;
 	struct rx_queue_stop_ramrod_data rx_queue_stop;
@@ -338,12 +339,14 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  * to the internal RAM of the UStorm by the Function Start Ramrod.
  *
  * @param p_hwfn
+ * @param p_tunn
  * @param mode
  *
  * @return int
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
+		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode);
 
 /**
@@ -362,4 +365,8 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn);
 
+int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_tunn,
+			      enum spq_mode comp_mode,
+			      struct qed_spq_comp_cb *p_comp_data);
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 1c06c37d4c3d..306da7000ddc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -87,7 +87,217 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static enum tunnel_clss qed_tunn_get_clss_type(u8 type)
+{
+	switch (type) {
+	case QED_TUNN_CLSS_MAC_VLAN:
+		return TUNNEL_CLSS_MAC_VLAN;
+	case QED_TUNN_CLSS_MAC_VNI:
+		return TUNNEL_CLSS_MAC_VNI;
+	case QED_TUNN_CLSS_INNER_MAC_VLAN:
+		return TUNNEL_CLSS_INNER_MAC_VLAN;
+	case QED_TUNN_CLSS_INNER_MAC_VNI:
+		return TUNNEL_CLSS_INNER_MAC_VNI;
+	default:
+		return TUNNEL_CLSS_MAC_VLAN;
+	}
+}
+
+static void
+qed_tunn_set_pf_fix_tunn_mode(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_src,
+			      struct pf_update_tunnel_config *p_tunn_cfg)
+{
+	unsigned long cached_tunn_mode = p_hwfn->cdev->tunn_mode;
+	unsigned long update_mask = p_src->tunn_mode_update_mask;
+	unsigned long tunn_mode = p_src->tunn_mode;
+	unsigned long new_tunn_mode = 0;
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_L2GRE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_IPGRE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_VXLAN_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
+	}
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_L2GENEVE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_IPGENEVE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
+	}
+
+	p_src->tunn_mode = new_tunn_mode;
+}
+
+static void
+qed_tunn_set_pf_update_params(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_src,
+			      struct pf_update_tunnel_config *p_tunn_cfg)
+{
+	unsigned long tunn_mode = p_src->tunn_mode;
+	enum tunnel_clss type;
+
+	qed_tunn_set_pf_fix_tunn_mode(p_hwfn, p_src, p_tunn_cfg);
+	p_tunn_cfg->update_rx_pf_clss = p_src->update_rx_pf_clss;
+	p_tunn_cfg->update_tx_pf_clss = p_src->update_tx_pf_clss;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
+	p_tunn_cfg->tunnel_clss_vxlan  = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
+	p_tunn_cfg->tunnel_clss_l2gre = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
+	p_tunn_cfg->tunnel_clss_ipgre = type;
+
+	if (p_src->update_vxlan_udp_port) {
+		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
+		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2gre = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgre = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_vxlan = 1;
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2geneve = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgeneve = 1;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
+	p_tunn_cfg->tunnel_clss_l2geneve = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
+	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+}
+
+static void qed_set_hw_tunn_mode(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 unsigned long tunn_mode)
+{
+	u8 l2gre_enable = 0, ipgre_enable = 0, vxlan_enable = 0;
+	u8 l2geneve_enable = 0, ipgeneve_enable = 0;
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		l2gre_enable = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		ipgre_enable = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		vxlan_enable = 1;
+
+	qed_set_gre_enable(p_hwfn, p_ptt, l2gre_enable, ipgre_enable);
+	qed_set_vxlan_enable(p_hwfn, p_ptt, vxlan_enable);
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		l2geneve_enable = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		ipgeneve_enable = 1;
+
+	qed_set_geneve_enable(p_hwfn, p_ptt, l2geneve_enable,
+			      ipgeneve_enable);
+}
+
+static void
+qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
+			     struct qed_tunn_start_params *p_src,
+			     struct pf_start_tunnel_config *p_tunn_cfg)
+{
+	unsigned long tunn_mode;
+	enum tunnel_clss type;
+
+	if (!p_src)
+		return;
+
+	tunn_mode = p_src->tunn_mode;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
+	p_tunn_cfg->tunnel_clss_vxlan = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
+	p_tunn_cfg->tunnel_clss_l2gre = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
+	p_tunn_cfg->tunnel_clss_ipgre = type;
+
+	if (p_src->update_vxlan_udp_port) {
+		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
+		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2gre = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgre = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_vxlan = 1;
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2geneve = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgeneve = 1;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
+	p_tunn_cfg->tunnel_clss_l2geneve = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
+	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+}
+
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
+		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
@@ -143,6 +353,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
 		       p_hwfn->p_consq->chain.pbl.p_phys_table);
 
+	qed_tunn_set_pf_start_params(p_hwfn, NULL, NULL);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
 	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
@@ -153,6 +364,49 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+/* Set pf update ramrod command params */
+int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_tunn,
+			      enum spq_mode comp_mode,
+			      struct qed_spq_comp_cb *p_comp_data)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_data;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	qed_tunn_set_pf_update_params(p_hwfn, p_tunn,
+				      &p_ent->ramrod.pf_update.tunnel_config);
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	if (rc)
+		return rc;
+
+	if (p_tunn->update_vxlan_udp_port)
+		qed_set_vxlan_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					p_tunn->vxlan_udp_port);
+	if (p_tunn->update_geneve_udp_port)
+		qed_set_geneve_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					 p_tunn->geneve_udp_port);
+
+	qed_set_hw_tunn_mode(p_hwfn, p_hwfn->p_main_ptt, p_tunn->tunn_mode);
+	p_hwfn->cdev->tunn_mode = p_tunn->tunn_mode;
+
+	return rc;
+}
+
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn)
 {
 	struct qed_spq_entry *p_ent = NULL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 795c9902e02f..3a4c806be156 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -112,6 +112,13 @@ struct qed_queue_start_common_params {
 	u16 sb_idx;
 };
 
+struct qed_tunn_params {
+	u16 vxlan_port;
+	u8 update_vxlan_port;
+	u16 geneve_port;
+	u8 update_geneve_port;
+};
+
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
 };
@@ -166,6 +173,9 @@ struct qed_eth_ops {
 
 	void (*get_vport_stats)(struct qed_dev *cdev,
 				struct qed_eth_stats *stats);
+
+	int (*tunn_config)(struct qed_dev *cdev,
+			   struct qed_tunn_params *params);
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void);
-- 
cgit v1.2.3


From e1c9c62b9a3a761b56359a7437215ae2e9253821 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 11 Apr 2016 23:10:21 +0300
Subject: net/mlx5: Fix mlx5 ifc cmd_hca_cap bad offsets

All reserved fields after early_vf_enable are off by 1, since
early_vf_enable was not explicitly declared as array of size 1.

Reserved field before cqe_zip had a wrong size, it should
be 0x80 + 0x3f.

Fixes: b0844444590e ("net/mlx5_core: Introduce access function to read internal timer ")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 107 ++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c15b8a864937..c300e7491d80 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -750,21 +750,21 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         ets[0x1];
 	u8         nic_flow_table[0x1];
 	u8         eswitch_flow_table[0x1];
-	u8	   early_vf_enable;
-	u8         reserved_at_1a8[0x2];
+	u8	   early_vf_enable[0x1];
+	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         reserved_at_1af[0x6];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
-	u8         reserved_at_1bf[0x3];
+	u8         reserved_at_1c0[0x3];
 	u8         log_max_msg[0x5];
-	u8         reserved_at_1c7[0x4];
+	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1cf[0x6];
+	u8         reserved_at_1d0[0x6];
 	u8         rol_s[0x1];
 	u8         rol_g[0x1];
-	u8         reserved_at_1d7[0x1];
+	u8         reserved_at_1d8[0x1];
 	u8         wol_s[0x1];
 	u8         wol_g[0x1];
 	u8         wol_a[0x1];
@@ -774,47 +774,47 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         wol_p[0x1];
 
 	u8         stat_rate_support[0x10];
-	u8         reserved_at_1ef[0xc];
+	u8         reserved_at_1f0[0xc];
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
 	u8         reserved_at_200[0x3];
 	u8         ipoib_basic_offloads[0x1];
-	u8         reserved_at_204[0xa];
+	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
 	u8         cmdif_checksum[0x2];
 	u8         sigerr_cqe[0x1];
-	u8         reserved_at_212[0x1];
+	u8         reserved_at_213[0x1];
 	u8         wq_signature[0x1];
 	u8         sctr_data_cqe[0x1];
-	u8         reserved_at_215[0x1];
+	u8         reserved_at_216[0x1];
 	u8         sho[0x1];
 	u8         tph[0x1];
 	u8         rf[0x1];
 	u8         dct[0x1];
-	u8         reserved_at_21a[0x1];
+	u8         reserved_at_21b[0x1];
 	u8         eth_net_offloads[0x1];
 	u8         roce[0x1];
 	u8         atomic[0x1];
-	u8         reserved_at_21e[0x1];
+	u8         reserved_at_21f[0x1];
 
 	u8         cq_oi[0x1];
 	u8         cq_resize[0x1];
 	u8         cq_moderation[0x1];
-	u8         reserved_at_222[0x3];
+	u8         reserved_at_223[0x3];
 	u8         cq_eq_remap[0x1];
 	u8         pg[0x1];
 	u8         block_lb_mc[0x1];
-	u8         reserved_at_228[0x1];
+	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
 	u8         reserved_at_22a[0x1];
 	u8         cd[0x1];
-	u8         reserved_at_22c[0x1];
+	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
 	u8         reserved_at_22f[0x1];
 	u8	   imaicl[0x1];
-	u8         reserved_at_231[0x4];
+	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
 	u8         pkv[0x1];
 	u8         set_deth_sqpn[0x1];
@@ -824,98 +824,101 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uc[0x1];
 	u8         rc[0x1];
 
-	u8         reserved_at_23f[0xa];
+	u8         reserved_at_240[0xa];
 	u8         uar_sz[0x6];
-	u8         reserved_at_24f[0x8];
+	u8         reserved_at_250[0x8];
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_at_260[0x1];
+	u8         reserved_at_261[0x1];
 	u8         pad_tx_eth_packet[0x1];
-	u8         reserved_at_262[0x8];
+	u8         reserved_at_263[0x8];
 	u8         log_bf_reg_size[0x5];
-	u8         reserved_at_26f[0x10];
+	u8         reserved_at_270[0x10];
 
-	u8         reserved_at_27f[0x10];
+	u8         reserved_at_280[0x10];
 	u8         max_wqe_sz_sq[0x10];
 
-	u8         reserved_at_29f[0x10];
+	u8         reserved_at_2a0[0x10];
 	u8         max_wqe_sz_rq[0x10];
 
-	u8         reserved_at_2bf[0x10];
+	u8         reserved_at_2c0[0x10];
 	u8         max_wqe_sz_sq_dc[0x10];
 
-	u8         reserved_at_2df[0x7];
+	u8         reserved_at_2e0[0x7];
 	u8         max_qp_mcg[0x19];
 
-	u8         reserved_at_2ff[0x18];
+	u8         reserved_at_300[0x18];
 	u8         log_max_mcg[0x8];
 
-	u8         reserved_at_31f[0x3];
+	u8         reserved_at_320[0x3];
 	u8         log_max_transport_domain[0x5];
-	u8         reserved_at_327[0x3];
+	u8         reserved_at_328[0x3];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_32f[0xb];
+	u8         reserved_at_330[0xb];
 	u8         log_max_xrcd[0x5];
 
-	u8         reserved_at_33f[0x20];
+	u8         reserved_at_340[0x20];
 
-	u8         reserved_at_35f[0x3];
+	u8         reserved_at_360[0x3];
 	u8         log_max_rq[0x5];
-	u8         reserved_at_367[0x3];
+	u8         reserved_at_368[0x3];
 	u8         log_max_sq[0x5];
-	u8         reserved_at_36f[0x3];
+	u8         reserved_at_370[0x3];
 	u8         log_max_tir[0x5];
-	u8         reserved_at_377[0x3];
+	u8         reserved_at_378[0x3];
 	u8         log_max_tis[0x5];
 
 	u8         basic_cyclic_rcv_wqe[0x1];
-	u8         reserved_at_380[0x2];
+	u8         reserved_at_381[0x2];
 	u8         log_max_rmp[0x5];
-	u8         reserved_at_387[0x3];
+	u8         reserved_at_388[0x3];
 	u8         log_max_rqt[0x5];
-	u8         reserved_at_38f[0x3];
+	u8         reserved_at_390[0x3];
 	u8         log_max_rqt_size[0x5];
-	u8         reserved_at_397[0x3];
+	u8         reserved_at_398[0x3];
 	u8         log_max_tis_per_sq[0x5];
 
-	u8         reserved_at_39f[0x3];
+	u8         reserved_at_3a0[0x3];
 	u8         log_max_stride_sz_rq[0x5];
-	u8         reserved_at_3a7[0x3];
+	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-	u8         reserved_at_3af[0x3];
+	u8         reserved_at_3b0[0x3];
 	u8         log_max_stride_sz_sq[0x5];
-	u8         reserved_at_3b7[0x3];
+	u8         reserved_at_3b8[0x3];
 	u8         log_min_stride_sz_sq[0x5];
 
-	u8         reserved_at_3bf[0x1b];
+	u8         reserved_at_3c0[0x1b];
 	u8         log_max_wq_sz[0x5];
 
 	u8         nic_vport_change_event[0x1];
-	u8         reserved_at_3e0[0xa];
+	u8         reserved_at_3e1[0xa];
 	u8         log_max_vlan_list[0x5];
-	u8         reserved_at_3ef[0x3];
+	u8         reserved_at_3f0[0x3];
 	u8         log_max_current_mc_list[0x5];
-	u8         reserved_at_3f7[0x3];
+	u8         reserved_at_3f8[0x3];
 	u8         log_max_current_uc_list[0x5];
 
-	u8         reserved_at_3ff[0x80];
+	u8         reserved_at_400[0x80];
 
-	u8         reserved_at_47f[0x3];
+	u8         reserved_at_480[0x3];
 	u8         log_max_l2_table[0x5];
-	u8         reserved_at_487[0x8];
+	u8         reserved_at_488[0x8];
 	u8         log_uar_page_sz[0x10];
 
-	u8         reserved_at_49f[0x20];
+	u8         reserved_at_4a0[0x20];
 	u8         device_frequency_mhz[0x20];
 	u8         device_frequency_khz[0x20];
-	u8         reserved_at_4ff[0x5f];
+
+	u8         reserved_at_500[0x80];
+
+	u8         reserved_at_580[0x3f];
 	u8         cqe_zip[0x1];
 
 	u8         cqe_zip_timeout[0x10];
 	u8         cqe_zip_max_num[0x10];
 
-	u8         reserved_at_57f[0x220];
+	u8         reserved_at_5e0[0x220];
 };
 
 enum mlx5_flow_destination_type {
-- 
cgit v1.2.3


From 7d5e14237a551a5de3d287f2e8db2d044ee81a1a Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 11 Apr 2016 23:10:22 +0300
Subject: net/mlx5: Update mlx5_ifc hardware features

Adding the needed mlx5_ifc hardware bits and structs
for the following feature:

* Add vport to steering commands for SRIOV ACL support
* Add mlcr, pcmr and mcia registers for dump module EEPROM
* Add support for FCS, baeacon led and disable_link bits to
  hca caps
* Add CQE period mode bit in  CQ context for CQE based CQ
  moderation support
* Add umr SQ bit for fragmented memory registration
* Add needed bits and caps for Striding RQ support

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 146 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c300e7491d80..4ce4ea422a10 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -513,7 +513,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         max_lso_cap[0x5];
 	u8         reserved_at_10[0x4];
 	u8         rss_ind_tbl_cap[0x4];
-	u8         reserved_at_18[0x3];
+	u8         reg_umr_sq[0x1];
+	u8         scatter_fcs[0x1];
+	u8         reserved_at_1a[0x1];
 	u8         tunnel_lso_const_out_ip_id[0x1];
 	u8         reserved_at_1c[0x2];
 	u8         tunnel_statless_gre[0x1];
@@ -648,7 +650,7 @@ struct mlx5_ifc_vector_calc_cap_bits {
 enum {
 	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
 	MLX5_WQ_TYPE_CYCLIC       = 0x1,
-	MLX5_WQ_TYPE_STRQ         = 0x2,
+	MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2,
 };
 
 enum {
@@ -753,7 +755,11 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   early_vf_enable[0x1];
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_at_1af[0x6];
+	u8         reserved_at_1af[0x2];
+	u8         ports_check[0x1];
+	u8         reserved_at_1b2[0x1];
+	u8         disable_link_up[0x1];
+	u8         beacon_led[0x1];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
@@ -778,7 +784,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
-	u8         reserved_at_200[0x3];
+	u8         striding_rq[0x1];
+	u8         reserved_at_201[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -807,12 +814,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         block_lb_mc[0x1];
 	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
-	u8         reserved_at_22a[0x1];
+	u8         cq_period_start_from_cqe[0x1];
 	u8         cd[0x1];
 	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
-	u8         reserved_at_22f[0x1];
+	u8         umr_ptr_rlky[0x1];
 	u8	   imaicl[0x1];
 	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
@@ -913,10 +920,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_500[0x80];
 
 	u8         reserved_at_580[0x3f];
-	u8         cqe_zip[0x1];
+	u8         cqe_compression[0x1];
 
-	u8         cqe_zip_timeout[0x10];
-	u8         cqe_zip_max_num[0x10];
+	u8         cqe_compression_timeout[0x10];
+	u8         cqe_compression_max_num[0x10];
 
 	u8         reserved_at_5e0[0x220];
 };
@@ -1000,7 +1007,13 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_118[0x3];
 	u8         log_wq_sz[0x5];
 
-	u8         reserved_at_120[0x4e0];
+	u8         reserved_at_120[0x15];
+	u8         log_wqe_num_of_strides[0x3];
+	u8         two_byte_shift_en[0x1];
+	u8         reserved_at_139[0x4];
+	u8         log_wqe_stride_size[0x3];
+
+	u8         reserved_at_140[0x4c0];
 
 	struct mlx5_ifc_cmd_pas_bits pas[0];
 };
@@ -2199,7 +2212,8 @@ struct mlx5_ifc_sqc_bits {
 	u8         flush_in_error_en[0x1];
 	u8         reserved_at_4[0x4];
 	u8         state[0x4];
-	u8         reserved_at_c[0x14];
+	u8         reg_umr[0x1];
+	u8         reserved_at_d[0x13];
 
 	u8         reserved_at_20[0x8];
 	u8         user_index[0x18];
@@ -2247,7 +2261,8 @@ enum {
 
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
-	u8         reserved_at_1[0x2];
+	u8         reserved_at_1[0x1];
+	u8         scatter_fcs[0x1];
 	u8         vsd[0x1];
 	u8         mem_rq_type[0x4];
 	u8         state[0x4];
@@ -2604,6 +2619,11 @@ enum {
 	MLX5_CQC_ST_FIRED                                 = 0xa,
 };
 
+enum {
+	MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+};
+
 struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
 	u8         reserved_at_4[0x4];
@@ -2612,8 +2632,8 @@ struct mlx5_ifc_cqc_bits {
 	u8         reserved_at_c[0x1];
 	u8         scqe_break_moderation_en[0x1];
 	u8         oi[0x1];
-	u8         reserved_at_f[0x2];
-	u8         cqe_zip_en[0x1];
+	u8         cq_period_mode[0x2];
+	u8         cqe_comp_en[0x1];
 	u8         mini_cqe_res_format[0x2];
 	u8         st[0x4];
 	u8         reserved_at_18[0x8];
@@ -2987,7 +3007,11 @@ struct mlx5_ifc_set_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5181,7 +5205,11 @@ struct mlx5_ifc_destroy_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5208,7 +5236,11 @@ struct mlx5_ifc_destroy_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5349,7 +5381,11 @@ struct mlx5_ifc_delete_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5795,7 +5831,11 @@ struct mlx5_ifc_create_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5839,7 +5879,11 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -6372,6 +6416,17 @@ struct mlx5_ifc_ptys_reg_bits {
 	u8         reserved_at_1a0[0x60];
 };
 
+struct mlx5_ifc_mlcr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x20];
+
+	u8         beacon_duration[0x10];
+	u8         reserved_at_40[0x10];
+
+	u8         beacon_remain[0x10];
+};
+
 struct mlx5_ifc_ptas_reg_bits {
 	u8         reserved_at_0[0x20];
 
@@ -6781,6 +6836,16 @@ struct mlx5_ifc_pamp_reg_bits {
 	u8         index_data[18][0x10];
 };
 
+struct mlx5_ifc_pcmr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x2e];
+	u8         fcs_cap[0x1];
+	u8         reserved_at_3f[0x1f];
+	u8         fcs_chk[0x1];
+	u8         reserved_at_5f[0x1];
+};
+
 struct mlx5_ifc_lane_2_module_mapping_bits {
 	u8         reserved_at_0[0x6];
 	u8         rx_lane[0x2];
@@ -7117,6 +7182,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pspa_reg_bits pspa_reg;
 	struct mlx5_ifc_ptas_reg_bits ptas_reg;
 	struct mlx5_ifc_ptys_reg_bits ptys_reg;
+	struct mlx5_ifc_mlcr_reg_bits mlcr_reg;
 	struct mlx5_ifc_pude_reg_bits pude_reg;
 	struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
 	struct mlx5_ifc_slrg_reg_bits slrg_reg;
@@ -7150,7 +7216,11 @@ struct mlx5_ifc_set_flow_table_root_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -7181,7 +7251,9 @@ struct mlx5_ifc_modify_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x20];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x10];
 	u8         modify_field_select[0x10];
@@ -7247,4 +7319,34 @@ struct mlx5_ifc_qtct_reg_bits {
 	u8         tclass[0x3];
 };
 
+struct mlx5_ifc_mcia_reg_bits {
+	u8         l[0x1];
+	u8         reserved_at_1[0x7];
+	u8         module[0x8];
+	u8         reserved_at_10[0x8];
+	u8         status[0x8];
+
+	u8         i2c_device_address[0x8];
+	u8         page_number[0x8];
+	u8         device_address[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         size[0x10];
+
+	u8         reserved_at_60[0x20];
+
+	u8         dword_0[0x20];
+	u8         dword_1[0x20];
+	u8         dword_2[0x20];
+	u8         dword_3[0x20];
+	u8         dword_4[0x20];
+	u8         dword_5[0x20];
+	u8         dword_6[0x20];
+	u8         dword_7[0x20];
+	u8         dword_8[0x20];
+	u8         dword_9[0x20];
+	u8         dword_10[0x20];
+	u8         dword_11[0x20];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 52c52a61a39fb319c14a582f8631619e5d5f55bf Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 14 Apr 2016 15:35:30 +0800
Subject: sctp: add sctp_info dump api for sctp_diag

sctp_diag will dump some important details of sctp's assoc or ep, we use
sctp_info to describe them,  sctp_get_sctp_info to get them, and export
it to sctp_diag.ko.

v2->v3:
- we will not use list_for_each_safe in sctp_get_sctp_info, cause
  all the callers of it will use lock_sock.

- fix the holes in struct sctp_info with __reserved* field.
  because sctp_diag is a new feature, and sctp_info is just for now,
  it may be changed in the future.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h    | 67 ++++++++++++++++++++++++++++++++++++++
 include/net/sctp/sctp.h |  3 ++
 net/sctp/socket.c       | 86 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 156 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index a9414fd49dc6..dacb5e711994 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -705,4 +705,71 @@ typedef struct sctp_auth_chunk {
 	sctp_authhdr_t auth_hdr;
 } __packed sctp_auth_chunk_t;
 
+struct sctp_info {
+	__u32	sctpi_tag;
+	__u32	sctpi_state;
+	__u32	sctpi_rwnd;
+	__u16	sctpi_unackdata;
+	__u16	sctpi_penddata;
+	__u16	sctpi_instrms;
+	__u16	sctpi_outstrms;
+	__u32	sctpi_fragmentation_point;
+	__u32	sctpi_inqueue;
+	__u32	sctpi_outqueue;
+	__u32	sctpi_overall_error;
+	__u32	sctpi_max_burst;
+	__u32	sctpi_maxseg;
+	__u32	sctpi_peer_rwnd;
+	__u32	sctpi_peer_tag;
+	__u8	sctpi_peer_capable;
+	__u8	sctpi_peer_sack;
+	__u16	__reserved1;
+
+	/* assoc status info */
+	__u64	sctpi_isacks;
+	__u64	sctpi_osacks;
+	__u64	sctpi_opackets;
+	__u64	sctpi_ipackets;
+	__u64	sctpi_rtxchunks;
+	__u64	sctpi_outofseqtsns;
+	__u64	sctpi_idupchunks;
+	__u64	sctpi_gapcnt;
+	__u64	sctpi_ouodchunks;
+	__u64	sctpi_iuodchunks;
+	__u64	sctpi_oodchunks;
+	__u64	sctpi_iodchunks;
+	__u64	sctpi_octrlchunks;
+	__u64	sctpi_ictrlchunks;
+
+	/* primary transport info */
+	struct sockaddr_storage	sctpi_p_address;
+	__s32	sctpi_p_state;
+	__u32	sctpi_p_cwnd;
+	__u32	sctpi_p_srtt;
+	__u32	sctpi_p_rto;
+	__u32	sctpi_p_hbinterval;
+	__u32	sctpi_p_pathmaxrxt;
+	__u32	sctpi_p_sackdelay;
+	__u32	sctpi_p_sackfreq;
+	__u32	sctpi_p_ssthresh;
+	__u32	sctpi_p_partial_bytes_acked;
+	__u32	sctpi_p_flight_size;
+	__u16	sctpi_p_error;
+	__u16	__reserved2;
+
+	/* sctp sock info */
+	__u32	sctpi_s_autoclose;
+	__u32	sctpi_s_adaptation_ind;
+	__u32	sctpi_s_pd_point;
+	__u8	sctpi_s_nodelay;
+	__u8	sctpi_s_disable_fragments;
+	__u8	sctpi_s_v4mapped;
+	__u8	sctpi_s_frag_interleave;
+};
+
+struct sctp_infox {
+	struct sctp_info *sctpinfo;
+	struct sctp_association *asoc;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 978d5f67d5a7..268b10058ef5 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,9 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info);
+
 /*
  * sctp/primitive.c
  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf265a4bba6e..cd0fb3bb493c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4202,6 +4202,92 @@ static void sctp_shutdown(struct sock *sk, int how)
 	}
 }
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info)
+{
+	struct sctp_transport *prim;
+	struct list_head *pos;
+	int mask;
+
+	memset(info, 0, sizeof(*info));
+	if (!asoc) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		info->sctpi_s_autoclose = sp->autoclose;
+		info->sctpi_s_adaptation_ind = sp->adaptation_ind;
+		info->sctpi_s_pd_point = sp->pd_point;
+		info->sctpi_s_nodelay = sp->nodelay;
+		info->sctpi_s_disable_fragments = sp->disable_fragments;
+		info->sctpi_s_v4mapped = sp->v4mapped;
+		info->sctpi_s_frag_interleave = sp->frag_interleave;
+
+		return 0;
+	}
+
+	info->sctpi_tag = asoc->c.my_vtag;
+	info->sctpi_state = asoc->state;
+	info->sctpi_rwnd = asoc->a_rwnd;
+	info->sctpi_unackdata = asoc->unack_data;
+	info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+	info->sctpi_instrms = asoc->c.sinit_max_instreams;
+	info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+	list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
+		info->sctpi_inqueue++;
+	list_for_each(pos, &asoc->outqueue.out_chunk_list)
+		info->sctpi_outqueue++;
+	info->sctpi_overall_error = asoc->overall_error_count;
+	info->sctpi_max_burst = asoc->max_burst;
+	info->sctpi_maxseg = asoc->frag_point;
+	info->sctpi_peer_rwnd = asoc->peer.rwnd;
+	info->sctpi_peer_tag = asoc->c.peer_vtag;
+
+	mask = asoc->peer.ecn_capable << 1;
+	mask = (mask | asoc->peer.ipv4_address) << 1;
+	mask = (mask | asoc->peer.ipv6_address) << 1;
+	mask = (mask | asoc->peer.hostname_address) << 1;
+	mask = (mask | asoc->peer.asconf_capable) << 1;
+	mask = (mask | asoc->peer.prsctp_capable) << 1;
+	mask = (mask | asoc->peer.auth_capable);
+	info->sctpi_peer_capable = mask;
+	mask = asoc->peer.sack_needed << 1;
+	mask = (mask | asoc->peer.sack_generation) << 1;
+	mask = (mask | asoc->peer.zero_window_announced);
+	info->sctpi_peer_sack = mask;
+
+	info->sctpi_isacks = asoc->stats.isacks;
+	info->sctpi_osacks = asoc->stats.osacks;
+	info->sctpi_opackets = asoc->stats.opackets;
+	info->sctpi_ipackets = asoc->stats.ipackets;
+	info->sctpi_rtxchunks = asoc->stats.rtxchunks;
+	info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
+	info->sctpi_idupchunks = asoc->stats.idupchunks;
+	info->sctpi_gapcnt = asoc->stats.gapcnt;
+	info->sctpi_ouodchunks = asoc->stats.ouodchunks;
+	info->sctpi_iuodchunks = asoc->stats.iuodchunks;
+	info->sctpi_oodchunks = asoc->stats.oodchunks;
+	info->sctpi_iodchunks = asoc->stats.iodchunks;
+	info->sctpi_octrlchunks = asoc->stats.octrlchunks;
+	info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
+
+	prim = asoc->peer.primary_path;
+	memcpy(&info->sctpi_p_address, &prim->ipaddr,
+	       sizeof(struct sockaddr_storage));
+	info->sctpi_p_state = prim->state;
+	info->sctpi_p_cwnd = prim->cwnd;
+	info->sctpi_p_srtt = prim->srtt;
+	info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
+	info->sctpi_p_hbinterval = prim->hbinterval;
+	info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
+	info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
+	info->sctpi_p_ssthresh = prim->ssthresh;
+	info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
+	info->sctpi_p_flight_size = prim->flight_size;
+	info->sctpi_p_error = prim->error_count;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
+
 /* 7.2.1 Association Status (SCTP_STATUS)
 
  * Applications can retrieve current status information about an
-- 
cgit v1.2.3


From 756ca874417695f77941948a77e9b8562635cc0a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 14 Apr 2016 17:04:34 -0400
Subject: netdev_features: Add NETIF_F_TSO_MANGLEID to NETIF_F_ALL_TSO

I realized that when I added NETIF_F_TSO_MANGLEID as a TSO type I forgot to
add it to NETIF_F_ALL_TSO.  This patch corrects that so the flag will be
included correctly.

The result should be minor as it was only used by a few drivers and in a
few specific cases such as when NETIF_F_SG was not supported on a device so
the TSO flags were cleared.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9fc79df0e561..15eb0b12fff9 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -164,7 +164,8 @@ enum {
 #define NETIF_F_CSUM_MASK	(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
 				 NETIF_F_HW_CSUM)
 
-#define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | \
+				 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)
 
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
-- 
cgit v1.2.3


From 6d62b4d5fac620ee0ca65dc6d99b0306d96bc541 Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Fri, 15 Apr 2016 00:34:59 +0200
Subject: net: ethtool: export conversion function between u32 and link mode

The function convert_legacy_u32_to_link_mode and
convert_link_mode_to_legacy_u32 may be used outside
of ethtool.c. We rename them to ethtool_convert_...
and export them, so we could use them in others
drivers and modules.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  7 +++++++
 net/core/ethtool.c      | 21 ++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index e2b7bf27c03e..9ded8c6d8176 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -150,6 +150,13 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+					     u32 legacy_u32);
+
+/* return false if src had higher bits set. lower bits always updated. */
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+				     const unsigned long *src);
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e0cf20a3b3dd..bdb4013581b1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -391,15 +391,17 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
-static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+					     u32 legacy_u32)
 {
 	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
 	dst[0] = legacy_u32;
 }
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
 
 /* return false if src had higher bits set. lower bits always updated. */
-static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
-					    const unsigned long *src)
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					     const unsigned long *src)
 {
 	bool retval = true;
 
@@ -419,6 +421,7 @@ static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 	*legacy_u32 = src[0];
 	return retval;
 }
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
 
 /* return false if legacy contained non-0 deprecated fields
  * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
@@ -441,13 +444,13 @@ convert_legacy_settings_to_link_ksettings(
 	    legacy_settings->maxrxpkt)
 		retval = false;
 
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.supported,
 		legacy_settings->supported);
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.advertising,
 		legacy_settings->advertising);
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.lp_advertising,
 		legacy_settings->lp_advertising);
 	link_ksettings->base.speed
@@ -486,13 +489,13 @@ convert_link_ksettings_to_legacy_settings(
 	 * __u32	maxrxpkt;
 	 */
 
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->supported,
 		link_ksettings->link_modes.supported);
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->advertising,
 		link_ksettings->link_modes.advertising);
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->lp_advertising,
 		link_ksettings->link_modes.lp_advertising);
 	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
-- 
cgit v1.2.3


From 2d55173e71b06c5a369489852d972304e14189fd Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Fri, 15 Apr 2016 00:35:00 +0200
Subject: phy: add generic function to support ksetting support

The old ethtool api (get_setting and set_setting) has
generic phy functions phy_ethtool_sset and phy_ethtool_gset.
To supprt the new ethtool api (get_link_ksettings and
set_link_ksettings), we add generic phy function
phy_ethtool_ksettings_get and phy_ethtool_ksettings_set.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h   |  4 +++
 2 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 5590b9c182c9..6f221c8c2a7f 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -362,6 +362,60 @@ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(phy_ethtool_sset);
 
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+			      const struct ethtool_link_ksettings *cmd)
+{
+	u8 autoneg = cmd->base.autoneg;
+	u8 duplex = cmd->base.duplex;
+	u32 speed = cmd->base.speed;
+	u32 advertising;
+
+	if (cmd->base.phy_address != phydev->mdio.addr)
+		return -EINVAL;
+
+	ethtool_convert_link_mode_to_legacy_u32(&advertising,
+						cmd->link_modes.advertising);
+
+	/* We make sure that we don't pass unsupported values in to the PHY */
+	advertising &= phydev->supported;
+
+	/* Verify the settings we care about. */
+	if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
+		return -EINVAL;
+
+	if (autoneg == AUTONEG_ENABLE && advertising == 0)
+		return -EINVAL;
+
+	if (autoneg == AUTONEG_DISABLE &&
+	    ((speed != SPEED_1000 &&
+	      speed != SPEED_100 &&
+	      speed != SPEED_10) ||
+	     (duplex != DUPLEX_HALF &&
+	      duplex != DUPLEX_FULL)))
+		return -EINVAL;
+
+	phydev->autoneg = autoneg;
+
+	phydev->speed = speed;
+
+	phydev->advertising = advertising;
+
+	if (autoneg == AUTONEG_ENABLE)
+		phydev->advertising |= ADVERTISED_Autoneg;
+	else
+		phydev->advertising &= ~ADVERTISED_Autoneg;
+
+	phydev->duplex = duplex;
+
+	phydev->mdix = cmd->base.eth_tp_mdix_ctrl;
+
+	/* Restart the PHY */
+	phy_start_aneg(phydev);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_set);
+
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 {
 	cmd->supported = phydev->supported;
@@ -385,6 +439,33 @@ int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(phy_ethtool_gset);
 
+int phy_ethtool_ksettings_get(struct phy_device *phydev,
+			      struct ethtool_link_ksettings *cmd)
+{
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+						phydev->supported);
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+						phydev->advertising);
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising,
+						phydev->lp_advertising);
+
+	cmd->base.speed = phydev->speed;
+	cmd->base.duplex = phydev->duplex;
+	if (phydev->interface == PHY_INTERFACE_MODE_MOCA)
+		cmd->base.port = PORT_BNC;
+	else
+		cmd->base.port = PORT_MII;
+
+	cmd->base.phy_address = phydev->mdio.addr;
+	cmd->base.autoneg = phydev->autoneg;
+	cmd->base.eth_tp_mdix_ctrl = phydev->mdix;
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_get);
+
 /**
  * phy_mii_ioctl - generic PHY MII ioctl interface
  * @phydev: the phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 2abd7918f64f..be3f83bbdc0b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -805,6 +805,10 @@ void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
+int phy_ethtool_ksettings_get(struct phy_device *phydev,
+			      struct ethtool_link_ksettings *cmd);
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+			      const struct ethtool_link_ksettings *cmd);
 int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
-- 
cgit v1.2.3


From 607ea7cda6315be0ad8be2f98bc9de6f2d656ae6 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Mon, 18 Apr 2016 14:41:10 +0300
Subject: net/ipv6/addrconf: simplify sysctl registration

Struct ctl_table_header holds pointer to sysctl table which could be used
for freeing it after unregistration. IPv4 sysctls already use that.
Remove redundant NULL assignment: ndev allocated using kzalloc.

This also saves some bytes: sysctl table could be shorter than
DEVCONF_MAX+1 if some options are disable in config.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h |  3 ++-
 net/ipv6/addrconf.c  | 43 +++++++++++++++++--------------------------
 2 files changed, 19 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7edc14fb66b6..58d6e158755f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -63,7 +63,8 @@ struct ipv6_devconf {
 	} stable_secret;
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
-	void		*sysctl;
+
+	struct ctl_table_header *sysctl_header;
 };
 
 struct ipv6_params {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a6c99275bd8c..5a42c0fe0449 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -359,7 +359,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
 
 	ndev->cnf.mtu6 = dev->mtu;
-	ndev->cnf.sysctl = NULL;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
 	if (!ndev->nd_parms) {
 		kfree(ndev);
@@ -5620,13 +5619,7 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
 	return ret;
 }
 
-static struct addrconf_sysctl_table
-{
-	struct ctl_table_header *sysctl_header;
-	struct ctl_table addrconf_vars[DEVCONF_MAX+1];
-} addrconf_sysctl __read_mostly = {
-	.sysctl_header = NULL,
-	.addrconf_vars = {
+static const struct ctl_table addrconf_sysctl[] = {
 		{
 			.procname	= "forwarding",
 			.data		= &ipv6_devconf.forwarding,
@@ -5944,52 +5937,50 @@ static struct addrconf_sysctl_table
 		{
 			/* sentinel */
 		}
-	},
 };
 
 static int __addrconf_sysctl_register(struct net *net, char *dev_name,
 		struct inet6_dev *idev, struct ipv6_devconf *p)
 {
 	int i;
-	struct addrconf_sysctl_table *t;
+	struct ctl_table *table;
 	char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
 
-	t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
-	if (!t)
+	table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL);
+	if (!table)
 		goto out;
 
-	for (i = 0; t->addrconf_vars[i].data; i++) {
-		t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
-		t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
-		t->addrconf_vars[i].extra2 = net;
+	for (i = 0; table[i].data; i++) {
+		table[i].data += (char *)p - (char *)&ipv6_devconf;
+		table[i].extra1 = idev; /* embedded; no ref */
+		table[i].extra2 = net;
 	}
 
 	snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
 
-	t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars);
-	if (!t->sysctl_header)
+	p->sysctl_header = register_net_sysctl(net, path, table);
+	if (!p->sysctl_header)
 		goto free;
 
-	p->sysctl = t;
 	return 0;
 
 free:
-	kfree(t);
+	kfree(table);
 out:
 	return -ENOBUFS;
 }
 
 static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
 {
-	struct addrconf_sysctl_table *t;
+	struct ctl_table *table;
 
-	if (!p->sysctl)
+	if (!p->sysctl_header)
 		return;
 
-	t = p->sysctl;
-	p->sysctl = NULL;
-	unregister_net_sysctl_table(t->sysctl_header);
-	kfree(t);
+	table = p->sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(p->sysctl_header);
+	p->sysctl_header = NULL;
+	kfree(table);
 }
 
 static int addrconf_sysctl_register(struct inet6_dev *idev)
-- 
cgit v1.2.3


From bd570ff970a54df653b48ed0cfb373f2ebed083d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 18 Apr 2016 21:01:24 +0200
Subject: bpf: add event output helper for notifications/sampling/logging

This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").

The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.

User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.

While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.

Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/core.c        |  7 +++++++
 kernel/trace/bpf_trace.c | 27 +++++++++++++++++++++++++++
 net/core/filter.c        |  2 ++
 4 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fb3c610fa96..f63afdc43bec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,7 +169,9 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
+const struct bpf_func_proto *bpf_get_event_output_proto(void);
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index be0abf669ced..e4248fe79513 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -764,14 +764,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
 	return NULL;
 }
 
+const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+{
+	return NULL;
+}
+
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
 	.func		= NULL,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b3cc24cb4321..780bcbe1d4de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -277,6 +277,33 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
+static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+
+static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+
+	perf_fetch_caller_regs(regs);
+
+	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+}
+
+static const struct bpf_func_proto bpf_event_output_proto = {
+	.func		= bpf_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+const struct bpf_func_proto *bpf_get_event_output_proto(void)
+{
+	return &bpf_event_output_proto;
+}
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 5d2ac2b9d1c4..218e5de8c402 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2039,6 +2039,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
 		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_perf_event_output:
+		return bpf_get_event_output_proto();
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From b853cb9628bfbcc4017da46d5f5b46e3eba9d8c6 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Mon, 28 Mar 2016 21:35:22 -0700
Subject: soc: qcom: smd: Make callback pass channel reference

By passing the smd channel reference to the callback, rather than the
smd device, we can open additional smd channels from sub-devices of smd
devices.

Also updates the two smd clients today found in mainline.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd-rpm.c    |  9 ++++++---
 drivers/soc/qcom/smd.c        | 22 ++++++++++++++++++----
 drivers/soc/qcom/wcnss_ctrl.c |  8 ++++----
 include/linux/soc/qcom/smd.h  |  7 +++++--
 4 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd-rpm.c b/drivers/soc/qcom/smd-rpm.c
index 731fa066f712..6609d7e0edb0 100644
--- a/drivers/soc/qcom/smd-rpm.c
+++ b/drivers/soc/qcom/smd-rpm.c
@@ -33,6 +33,7 @@
  */
 struct qcom_smd_rpm {
 	struct qcom_smd_channel *rpm_channel;
+	struct device *dev;
 
 	struct completion ack;
 	struct mutex lock;
@@ -149,14 +150,14 @@ out:
 }
 EXPORT_SYMBOL(qcom_rpm_smd_write);
 
-static int qcom_smd_rpm_callback(struct qcom_smd_device *qsdev,
+static int qcom_smd_rpm_callback(struct qcom_smd_channel *channel,
 				 const void *data,
 				 size_t count)
 {
 	const struct qcom_rpm_header *hdr = data;
 	size_t hdr_length = le32_to_cpu(hdr->length);
 	const struct qcom_rpm_message *msg;
-	struct qcom_smd_rpm *rpm = dev_get_drvdata(&qsdev->dev);
+	struct qcom_smd_rpm *rpm = qcom_smd_get_drvdata(channel);
 	const u8 *buf = data + sizeof(struct qcom_rpm_header);
 	const u8 *end = buf + hdr_length;
 	char msgbuf[32];
@@ -165,7 +166,7 @@ static int qcom_smd_rpm_callback(struct qcom_smd_device *qsdev,
 
 	if (le32_to_cpu(hdr->service_type) != RPM_SERVICE_TYPE_REQUEST ||
 	    hdr_length < sizeof(struct qcom_rpm_message)) {
-		dev_err(&qsdev->dev, "invalid request\n");
+		dev_err(rpm->dev, "invalid request\n");
 		return 0;
 	}
 
@@ -206,7 +207,9 @@ static int qcom_smd_rpm_probe(struct qcom_smd_device *sdev)
 	mutex_init(&rpm->lock);
 	init_completion(&rpm->ack);
 
+	rpm->dev = &sdev->dev;
 	rpm->rpm_channel = sdev->channel;
+	qcom_smd_set_drvdata(sdev->channel, rpm);
 
 	dev_set_drvdata(&sdev->dev, rpm);
 
diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index b6434c4be86a..ac1957dfdf24 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -194,6 +194,8 @@ struct qcom_smd_channel {
 
 	int pkt_size;
 
+	void *drvdata;
+
 	struct list_head list;
 	struct list_head dev_list;
 };
@@ -513,7 +515,6 @@ static void qcom_smd_channel_advance(struct qcom_smd_channel *channel,
  */
 static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel)
 {
-	struct qcom_smd_device *qsdev = channel->qsdev;
 	unsigned tail;
 	size_t len;
 	void *ptr;
@@ -533,7 +534,7 @@ static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel)
 		len = channel->pkt_size;
 	}
 
-	ret = channel->cb(qsdev, ptr, len);
+	ret = channel->cb(channel, ptr, len);
 	if (ret < 0)
 		return ret;
 
@@ -1034,6 +1035,18 @@ int qcom_smd_driver_register(struct qcom_smd_driver *qsdrv)
 }
 EXPORT_SYMBOL(qcom_smd_driver_register);
 
+void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel)
+{
+	return channel->drvdata;
+}
+EXPORT_SYMBOL(qcom_smd_get_drvdata);
+
+void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data)
+{
+	channel->drvdata = data;
+}
+EXPORT_SYMBOL(qcom_smd_set_drvdata);
+
 /**
  * qcom_smd_driver_unregister - unregister a smd driver
  * @qsdrv:	qcom_smd_driver struct
@@ -1079,12 +1092,13 @@ qcom_smd_find_channel(struct qcom_smd_edge *edge, const char *name)
  * Returns a channel handle on success, or -EPROBE_DEFER if the channel isn't
  * ready.
  */
-struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *parent,
 					       const char *name,
 					       qcom_smd_cb_t cb)
 {
 	struct qcom_smd_channel *channel;
-	struct qcom_smd_edge *edge = sdev->channel->edge;
+	struct qcom_smd_device *sdev = parent->qsdev;
+	struct qcom_smd_edge *edge = parent->edge;
 	int ret;
 
 	/* Wait up to HZ for the channel to appear */
diff --git a/drivers/soc/qcom/wcnss_ctrl.c b/drivers/soc/qcom/wcnss_ctrl.c
index 7a986f881d5c..c544f3d2c6ee 100644
--- a/drivers/soc/qcom/wcnss_ctrl.c
+++ b/drivers/soc/qcom/wcnss_ctrl.c
@@ -100,17 +100,17 @@ struct wcnss_download_nv_resp {
 
 /**
  * wcnss_ctrl_smd_callback() - handler from SMD responses
- * @qsdev:	smd device handle
+ * @channel:	smd channel handle
  * @data:	pointer to the incoming data packet
  * @count:	size of the incoming data packet
  *
  * Handles any incoming packets from the remote WCNSS_CTRL service.
  */
-static int wcnss_ctrl_smd_callback(struct qcom_smd_device *qsdev,
+static int wcnss_ctrl_smd_callback(struct qcom_smd_channel *channel,
 				   const void *data,
 				   size_t count)
 {
-	struct wcnss_ctrl *wcnss = dev_get_drvdata(&qsdev->dev);
+	struct wcnss_ctrl *wcnss = qcom_smd_get_drvdata(channel);
 	const struct wcnss_download_nv_resp *nvresp;
 	const struct wcnss_version_resp *version;
 	const struct wcnss_msg_hdr *hdr = data;
@@ -246,7 +246,7 @@ static int wcnss_ctrl_probe(struct qcom_smd_device *sdev)
 	init_completion(&wcnss->ack);
 	INIT_WORK(&wcnss->download_nv_work, wcnss_download_nv);
 
-	dev_set_drvdata(&sdev->dev, wcnss);
+	qcom_smd_set_drvdata(sdev->channel, wcnss);
 
 	return wcnss_request_version(wcnss);
 }
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index bd51c8a9d807..cb2f81559bc0 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -26,7 +26,7 @@ struct qcom_smd_device {
 	struct qcom_smd_channel *channel;
 };
 
-typedef int (*qcom_smd_cb_t)(struct qcom_smd_device *, const void *, size_t);
+typedef int (*qcom_smd_cb_t)(struct qcom_smd_channel *, const void *, size_t);
 
 /**
  * struct qcom_smd_driver - smd driver struct
@@ -50,13 +50,16 @@ struct qcom_smd_driver {
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
 void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
+void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel);
+void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data);
+
 #define module_qcom_smd_driver(__smd_driver) \
 	module_driver(__smd_driver, qcom_smd_driver_register, \
 		      qcom_smd_driver_unregister)
 
 int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
-struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *channel,
 					       const char *name,
 					       qcom_smd_cb_t cb);
 
-- 
cgit v1.2.3


From 85b67bcb7e4a23ced05e7020bf5843b9857f6881 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 18 Apr 2016 20:11:50 -0700
Subject: perf, bpf: minimize the size of perf_trace_() tracepoint handler

move trace_call_bpf() into helper function to minimize the size
of perf_trace_*() tracepoint handlers.
    text	   data	    bss	    dec	 	   hex	filename
10541679	5526646	2945024	19013349	1221ee5	vmlinux_before
10509422	5526646	2945024	18981092	121a0e4	vmlinux_after

It may seem that perf_fetch_caller_regs() can also be moved,
but that is incorrect, since ip/sp will be wrong.

bpf+tracepoint performance is not affected, since
perf_swevent_put_recursion_context() is now inlined.
export_symbol_gpl can also be dropped.

No measurable change in normal perf tracepoints.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/trace_events.h |  5 +++++
 include/trace/perf.h         | 13 +++----------
 kernel/events/core.c         | 20 +++++++++++++++++++-
 3 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fe6441203b59..222f6aa0418f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -609,6 +609,11 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task);
+
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
diff --git a/include/trace/perf.h b/include/trace/perf.h
index a182306eefd7..88de5c205e86 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -64,16 +64,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	if (prog) {							\
-		*(struct pt_regs **)entry = __regs;			\
-		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
-			perf_swevent_put_recursion_context(rctx);	\
-			return;						\
-		}							\
-	}								\
-	perf_trace_buf_submit(entry, __entry_size, rctx,		\
-			      event_call->event.type, __count, __regs,	\
-			      head, __task);				\
+	perf_trace_run_bpf_submit(entry, __entry_size, rctx,		\
+				  event_call, __count, __regs,		\
+				  head, __task);			\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5056abffef27..9eb23dc27462 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6741,7 +6741,6 @@ void perf_swevent_put_recursion_context(int rctx)
 
 	put_recursion_context(swhash->recursion, rctx);
 }
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -6998,6 +6997,25 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task)
+{
+	struct bpf_prog *prog = call->prog;
+
+	if (prog) {
+		*(struct pt_regs **)raw_data = regs;
+		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+			perf_swevent_put_recursion_context(rctx);
+			return;
+		}
+	}
+	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
+		      rctx, task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
-- 
cgit v1.2.3


From b1c20f0b97b4e565fa50cde1e6b44c2fd327a1e0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Tue, 19 Apr 2016 14:02:19 -0400
Subject: netdev_features: Fold NETIF_F_ALL_TSO into NETIF_F_GSO_SOFTWARE

This patch folds NETIF_F_ALL_TSO into the bitmask for NETIF_F_GSO_SOFTWARE.
The idea is to avoid duplication of defines since the only difference
between the two was the GSO_UDP bit.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 15eb0b12fff9..bc8736266749 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -152,11 +152,6 @@ enum {
 #define NETIF_F_GSO_MASK	(__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \
 		__NETIF_F_BIT(NETIF_F_GSO_SHIFT))
 
-/* List of features with software fallbacks. */
-#define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
-				 NETIF_F_TSO_MANGLEID | \
-				 NETIF_F_TSO6 | NETIF_F_UFO)
-
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
  * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set--
  * this would be contradictory
@@ -170,6 +165,9 @@ enum {
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
 
+/* List of features with software fallbacks. */
+#define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_UFO)
+
 /*
  * If one device supports one of these features, then enable them
  * for all in netdev_increment_features.
-- 
cgit v1.2.3


From 237cd218099ce96edf2890a49aa191b38b84c2fc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 20 Apr 2016 22:02:09 +0300
Subject: net/mlx5: Introduce device queue counters

A queue counter can collect several statistics for one or more
hardware queues (QPs, RQs, etc ..) that the counter is attached to.

For Ethernet it will provide an "out of buffer" counter which
collects the number of all packets that are dropped due to lack
of software buffers.

Here we add device commands to alloc/query/dealloc queue counters.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 68 ++++++++++++++++++++++++++++
 include/linux/mlx5/qp.h                      |  6 +++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index def289375ecb..b720a274220d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -538,3 +538,71 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 	mlx5_core_destroy_sq(dev, sq->qpn);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
+
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*counter_id = MLX5_GET(alloc_q_counter_out, out,
+				       counter_set_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter);
+
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter);
+
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, clear, reset);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_size);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter);
+
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+	void *out;
+	int err;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_q_counter(dev, counter_id, 0, out, outlen);
+	if (!err)
+		*out_of_buffer = MLX5_GET(query_q_counter_out, out,
+					  out_of_buffer);
+
+	kfree(out);
+	return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index cf031a3f16c5..64221027bf1f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -668,6 +668,12 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *sq);
 void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 				  struct mlx5_core_qp *sq);
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id);
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id);
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size);
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer);
 
 static inline const char *mlx5_qp_type_str(int type)
 {
-- 
cgit v1.2.3


From 461017cb006aa1b39b0f647ae0ee2d9d84eef05b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 20 Apr 2016 22:02:13 +0300
Subject: net/mlx5e: Support RX multi-packet WQE (Striding RQ)

Introduce the feature of multi-packet WQE (RX Work Queue Element)
referred to as (MPWQE or Striding RQ), in which WQEs are larger
and serve multiple packets each.

Every WQE consists of many strides of the same size, every received
packet is aligned to a beginning of a stride and is written to
consecutive strides within a WQE.

In the regular approach, each regular WQE is big enough to be capable
of serving one received packet of any size up to MTU or 64K in case of
device LRO is enabled, making it very wasteful when dealing with
small packets or device LRO is enabled.

For its flexibility, MPWQE allows a better memory utilization
(implying improvements in CPU utilization and packet rate) as packets
consume strides according to their size, preserving the rest of
the WQE to be available for other packets.

MPWQE default configuration:
	Num of WQEs	= 16
	Strides Per WQE = 2048
	Stride Size	= 64 byte

The default WQEs memory footprint went from 1024*mtu (~1.5MB) to
16 * 2048 * 64 = 2MB per ring.
However, HW LRO can now be supported at no additional cost in memory
footprint, and hence we turn it on by default and get an even better
performance.

Performance tested on ConnectX4-Lx 50G.
To isolate the feature under test, the numbers below were measured with
HW LRO turned off. We verified that the performance just improves when
LRO is turned back on.

* Netperf single TCP stream:
- BW raised by 10-15% for representative packet sizes:
  default, 64B, 1024B, 1478B, 65536B.

* Netperf multi TCP stream:
- No degradation, line rate reached.

* Pktgen: packet rate raised by 2-10% for traffic of different message
sizes: 64B, 128B, 256B, 1024B, and 1500B.

* Pktgen: packet loss in bursts of small messages (64byte),
single stream:
- | num packets | packets loss before | packets loss after
  |     2K      |       ~ 1K          |       0
  |     8K      |       ~ 6K          |       0
  |     16K     |       ~13K          |       0
  |     32K     |       ~28K          |       0
  |     64K     |       ~57K          |     ~24K

As expected as the driver can receive as many small packets (<=64B) as
the number of total strides in the ring (default = 2048 * 16) vs. 1024
(default ring size regardless of packets size) before this feature.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  77 ++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  15 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 109 +++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 153 +++++++++++++++++++--
 include/linux/mlx5/device.h                        |  39 +++++-
 5 files changed, 349 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 61e249d8d7f8..f519148d7dcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -57,12 +57,30 @@
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xd
 
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x1
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW            0x4
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW            0x6
+
+#define MLX5_MPWRQ_LOG_NUM_STRIDES		11 /* >= 9, HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SIZE		6  /* >= 6, HW restriction */
+#define MLX5_MPWRQ_NUM_STRIDES			BIT(MLX5_MPWRQ_LOG_NUM_STRIDES)
+#define MLX5_MPWRQ_STRIDE_SIZE			BIT(MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_LOG_WQE_SZ			(MLX5_MPWRQ_LOG_NUM_STRIDES +\
+						 MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
+				    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
+#define MLX5_MPWRQ_PAGES_PER_WQE		BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
+#define MLX5_MPWRQ_STRIDES_PER_PAGE		(MLX5_MPWRQ_NUM_STRIDES >> \
+						 MLX5_MPWRQ_WQE_PAGE_ORDER)
+#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(128)
+
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW            0x2
 
 #define MLX5E_LOG_INDIR_RQT_SIZE       0x7
 #define MLX5E_INDIR_RQT_SIZE           BIT(MLX5E_LOG_INDIR_RQT_SIZE)
@@ -74,6 +92,38 @@
 #define MLX5E_NUM_MAIN_GROUPS 9
 #define MLX5E_NET_IP_ALIGN 2
 
+static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW,
+			     wq_size / 2);
+	default:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES,
+			     wq_size / 2);
+	}
+}
+
+static inline int mlx5_min_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
+	}
+}
+
+static inline int mlx5_max_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	}
+}
+
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg  eth;
@@ -128,6 +178,7 @@ static const char vport_strings[][ETH_GSTRING_LEN] = {
 	"tx_queue_wake",
 	"tx_queue_dropped",
 	"rx_wqe_err",
+	"rx_mpwqe_filler",
 };
 
 struct mlx5e_vport_stats {
@@ -169,8 +220,9 @@ struct mlx5e_vport_stats {
 	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler;
 
-#define NUM_VPORT_COUNTERS     35
+#define NUM_VPORT_COUNTERS     36
 };
 
 static const char pport_strings[][ETH_GSTRING_LEN] = {
@@ -263,7 +315,8 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"csum_sw",
 	"lro_packets",
 	"lro_bytes",
-	"wqe_err"
+	"wqe_err",
+	"mpwqe_filler",
 };
 
 struct mlx5e_rq_stats {
@@ -274,7 +327,8 @@ struct mlx5e_rq_stats {
 	u64 lro_packets;
 	u64 lro_bytes;
 	u64 wqe_err;
-#define NUM_RQ_STATS 7
+	u64 mpwqe_filler;
+#define NUM_RQ_STATS 8
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
@@ -318,6 +372,7 @@ struct mlx5e_stats {
 
 struct mlx5e_params {
 	u8  log_sq_size;
+	u8  rq_wq_type;
 	u8  log_rq_size;
 	u16 num_channels;
 	u8  num_tc;
@@ -374,11 +429,23 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq *rq,
 typedef int (*mlx5e_fp_alloc_wqe)(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,
 				  u16 ix);
 
+struct mlx5e_dma_info {
+	struct page	*page;
+	dma_addr_t	addr;
+};
+
+struct mlx5e_mpw_info {
+	struct mlx5e_dma_info dma_info;
+	u16 consumed_strides;
+	u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
+};
+
 struct mlx5e_rq {
 	/* data path */
 	struct mlx5_wq_ll      wq;
 	u32                    wqe_sz;
 	struct sk_buff       **skb;
+	struct mlx5e_mpw_info *wqe_info;
 
 	struct device         *pdev;
 	struct net_device     *netdev;
@@ -393,6 +460,7 @@ struct mlx5e_rq {
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
+	u8                     wq_type;
 	u32                    rqn;
 	struct mlx5e_channel  *channel;
 	struct mlx5e_priv     *priv;
@@ -649,9 +717,12 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 6f40ba448f07..4077856aab76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -273,8 +273,9 @@ static void mlx5e_get_ringparam(struct net_device *dev,
 				struct ethtool_ringparam *param)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	int rq_wq_type = priv->params.rq_wq_type;
 
-	param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	param->rx_max_pending = 1 << mlx5_max_log_rq_size(rq_wq_type);
 	param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
 	param->rx_pending     = 1 << priv->params.log_rq_size;
 	param->tx_pending     = 1 << priv->params.log_sq_size;
@@ -285,6 +286,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	bool was_opened;
+	int rq_wq_type = priv->params.rq_wq_type;
 	u16 min_rx_wqes;
 	u8 log_rq_size;
 	u8 log_sq_size;
@@ -300,16 +302,16 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 			    __func__);
 		return -EINVAL;
 	}
-	if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending < (1 << mlx5_min_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) < min (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_min_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
-	if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending > (1 << mlx5_max_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) > max (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_max_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
 	if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
@@ -327,8 +329,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 
 	log_rq_size = order_base_2(param->rx_pending);
 	log_sq_size = order_base_2(param->tx_pending);
-	min_rx_wqes = min_t(u16, param->rx_pending - 1,
-			    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES);
+	min_rx_wqes = mlx5_min_rx_wqes(rq_wq_type, param->rx_pending);
 
 	if (log_rq_size == priv->params.log_rq_size &&
 	    log_sq_size == priv->params.log_sq_size &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 23ba12c3a738..871f3af204dd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -175,6 +175,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->rx_csum_none		= 0;
 	s->rx_csum_sw		= 0;
 	s->rx_wqe_err		= 0;
+	s->rx_mpwqe_filler	= 0;
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -185,6 +186,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		s->rx_csum_none	+= rq_stats->csum_none;
 		s->rx_csum_sw	+= rq_stats->csum_sw;
 		s->rx_wqe_err   += rq_stats->wqe_err;
+		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -323,6 +325,7 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	u32 byte_count;
 	int wq_sz;
 	int err;
 	int i;
@@ -337,28 +340,47 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-	rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
-			       cpu_to_node(c->cpu));
-	if (!rq->skb) {
-		err = -ENOMEM;
-		goto err_rq_wq_destroy;
-	}
 
-	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
-					     MLX5E_SW2HW_MTU(priv->netdev->mtu);
-	rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		rq->wqe_info = kzalloc_node(wq_sz * sizeof(*rq->wqe_info),
+					    GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe_info) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe_mpwrq;
+		rq->alloc_wqe = mlx5e_alloc_rx_mpwqe;
+
+		rq->wqe_sz = MLX5_MPWRQ_NUM_STRIDES * MLX5_MPWRQ_STRIDE_SIZE;
+		byte_count = rq->wqe_sz;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
+				       cpu_to_node(c->cpu));
+		if (!rq->skb) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
+		rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+
+		rq->wqe_sz = (priv->params.lro_en) ?
+				priv->params.lro_wqe_sz :
+				MLX5E_SW2HW_MTU(priv->netdev->mtu);
+		rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+		byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+		byte_count |= MLX5_HW_START_PADDING;
+	}
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-		u32 byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
 
 		wqe->data.lkey       = c->mkey_be;
-		wqe->data.byte_count =
-			cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+		wqe->data.byte_count = cpu_to_be32(byte_count);
 	}
 
-	rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
-	rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+	rq->wq_type = priv->params.rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
 	rq->tstamp  = &priv->tstamp;
@@ -376,7 +398,14 @@ err_rq_wq_destroy:
 
 static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
-	kfree(rq->skb);
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->wqe_info);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		kfree(rq->skb);
+	}
+
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
@@ -1065,7 +1094,18 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type,          MLX5_WQ_TYPE_LINKED_LIST);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		MLX5_SET(wq, wq, log_wqe_num_of_strides,
+			 MLX5_MPWRQ_LOG_NUM_STRIDES - 9);
+		MLX5_SET(wq, wq, log_wqe_stride_size,
+			 MLX5_MPWRQ_LOG_STRIDE_SIZE - 6);
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	}
+
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
@@ -1111,8 +1151,18 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 				    struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
+	u8 log_cq_size;
 
-	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_rq_size);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		log_cq_size = priv->params.log_rq_size +
+			MLX5_MPWRQ_LOG_NUM_STRIDES;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		log_cq_size = priv->params.log_rq_size;
+	}
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
@@ -1983,7 +2033,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 	if (changes & NETIF_F_LRO) {
 		bool was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			mlx5e_close_locked(priv->netdev);
 
 		priv->params.lro_en = !!(features & NETIF_F_LRO);
@@ -1992,7 +2043,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 			mlx5_core_warn(priv->mdev, "lro modify failed, %d\n",
 				       err);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			err = mlx5e_open_locked(priv->netdev);
 	}
 
@@ -2327,8 +2379,21 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 
 	priv->params.log_sq_size           =
 		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
-	priv->params.log_rq_size           =
-		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	priv->params.rq_wq_type = MLX5_CAP_GEN(mdev, striding_rq) ?
+		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
+		MLX5_WQ_TYPE_LINKED_LIST;
+
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
+		priv->params.lro_en = true;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	}
+
+	priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type,
+					    BIT(priv->params.log_rq_size));
 	priv->params.rx_cq_moderation_usec =
 		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_pkts =
@@ -2338,8 +2403,6 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 	priv->params.tx_cq_moderation_pkts =
 		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.tx_max_inline         = mlx5e_get_max_inline_cap(mdev);
-	priv->params.min_rx_wqes           =
-		MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	priv->params.num_tc                = 1;
 	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d7cccedddf34..71f3a5d244ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -76,6 +76,41 @@ err_free_skb:
 	return -ENOMEM;
 }
 
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
+	gfp_t gfp_mask;
+	int i;
+
+	gfp_mask = GFP_ATOMIC | __GFP_COLD | __GFP_MEMALLOC;
+	wi->dma_info.page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+					     MLX5_MPWRQ_WQE_PAGE_ORDER);
+	if (unlikely(!wi->dma_info.page))
+		return -ENOMEM;
+
+	wi->dma_info.addr = dma_map_page(rq->pdev, wi->dma_info.page, 0,
+					 rq->wqe_sz, PCI_DMA_FROMDEVICE);
+	if (unlikely(dma_mapping_error(rq->pdev, wi->dma_info.addr))) {
+		put_page(wi->dma_info.page);
+		return -ENOMEM;
+	}
+
+	/* We split the high-order page into order-0 ones and manage their
+	 * reference counter to minimize the memory held by small skb fragments
+	 */
+	split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+		atomic_add(MLX5_MPWRQ_STRIDES_PER_PAGE,
+			   &wi->dma_info.page[i]._count);
+		wi->skbs_frags[i] = 0;
+	}
+
+	wi->consumed_strides = 0;
+	wqe->data.addr       = cpu_to_be64(wi->dma_info.addr);
+
+	return 0;
+}
+
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->wq;
@@ -100,7 +135,8 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	return !mlx5_wq_ll_is_full(wq);
 }
 
-static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
+static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
+				 u32 cqe_bcnt)
 {
 	struct ethhdr	*eth	= (struct ethhdr *)(skb->data);
 	struct iphdr	*ipv4	= (struct iphdr *)(skb->data + ETH_HLEN);
@@ -111,7 +147,7 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
 	int tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA  == l4_hdr_type) ||
 		       (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type));
 
-	u16 tot_len = be32_to_cpu(cqe->byte_cnt) - ETH_HLEN;
+	u16 tot_len = cqe_bcnt - ETH_HLEN;
 
 	if (eth->h_proto == htons(ETH_P_IP)) {
 		tcp = (struct tcphdr *)(skb->data + ETH_HLEN +
@@ -191,19 +227,17 @@ csum_none:
 }
 
 static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
+				      u32 cqe_bcnt,
 				      struct mlx5e_rq *rq,
 				      struct sk_buff *skb)
 {
 	struct net_device *netdev = rq->netdev;
-	u32 cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	struct mlx5e_tstamp *tstamp = rq->tstamp;
 	int lro_num_seg;
 
-	skb_put(skb, cqe_bcnt);
-
 	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
-		mlx5e_lro_update_hdr(skb, cqe);
+		mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
 		skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
 		rq->stats.lro_packets++;
 		rq->stats.lro_bytes += cqe_bcnt;
@@ -228,12 +262,24 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
 }
 
+static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
+					 struct mlx5_cqe64 *cqe,
+					 u32 cqe_bcnt,
+					 struct sk_buff *skb)
+{
+	rq->stats.packets++;
+	rq->stats.bytes += cqe_bcnt;
+	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
 	__be16 wqe_counter_be;
 	u16 wqe_counter;
+	u32 cqe_bcnt;
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
@@ -253,16 +299,103 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto wq_ll_pop;
 	}
 
-	mlx5e_build_rx_skb(cqe, rq, skb);
-	rq->stats.packets++;
-	rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
-	napi_gro_receive(rq->cq.napi, skb);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+	skb_put(skb, cqe_bcnt);
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[wqe_id];
+	struct mlx5e_rx_wqe  *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	struct sk_buff *skb;
+	u32 consumed_bytes;
+	u32 head_offset;
+	u32 frag_offset;
+	u32 wqe_offset;
+	u32 page_idx;
+	u16 byte_cnt;
+	u16 cqe_bcnt;
+	u16 headlen;
+	int i;
+
+	wi->consumed_strides += cstrides;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats.wqe_err++;
+		goto mpwrq_cqe_out;
+	}
+
+	if (unlikely(mpwrq_is_filler_cqe(cqe))) {
+		rq->stats.mpwqe_filler++;
+		goto mpwrq_cqe_out;
+	}
+
+	skb = netdev_alloc_skb(rq->netdev,
+			       ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
+				     sizeof(long)));
+	if (unlikely(!skb))
+		goto mpwrq_cqe_out;
+
+	prefetch(skb->data);
+	wqe_offset = stride_ix * MLX5_MPWRQ_STRIDE_SIZE;
+	consumed_bytes = cstrides * MLX5_MPWRQ_STRIDE_SIZE;
+	dma_sync_single_for_cpu(rq->pdev, wi->dma_info.addr + wqe_offset,
+				consumed_bytes, DMA_FROM_DEVICE);
+
+	head_offset    = wqe_offset & (PAGE_SIZE - 1);
+	page_idx       = wqe_offset >> PAGE_SHIFT;
+	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
+	headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+	frag_offset = head_offset + headlen;
+
+	byte_cnt = cqe_bcnt - headlen;
+	while (byte_cnt) {
+		u32 pg_consumed_bytes =
+			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize =
+			ALIGN(pg_consumed_bytes, MLX5_MPWRQ_STRIDE_SIZE);
+
+		wi->skbs_frags[page_idx]++;
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+				&wi->dma_info.page[page_idx], frag_offset,
+				pg_consumed_bytes, truesize);
+		byte_cnt -= pg_consumed_bytes;
+		frag_offset = 0;
+		page_idx++;
+	}
+
+	skb_copy_to_linear_data(skb,
+				page_address(wi->dma_info.page) + wqe_offset,
+				ALIGN(headlen, sizeof(long)));
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+
+mpwrq_cqe_out:
+	if (likely(wi->consumed_strides < MLX5_MPWRQ_NUM_STRIDES))
+		return;
+
+	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
+		       PCI_DMA_FROMDEVICE);
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+		atomic_sub(MLX5_MPWRQ_STRIDES_PER_PAGE - wi->skbs_frags[i],
+			   &wi->dma_info.page[i]._count);
+		put_page(&wi->dma_info.page[i]);
+	}
+	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+}
+
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8156e3c9239c..03f8d719b680 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -644,7 +644,8 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8		rsvd0[4];
+	u8              rsvd0[2];
+	__be16          wqe_id;
 	u8		lro_tcppsh_abort_dupack;
 	u8		lro_min_ttl;
 	__be16		lro_tcp_win;
@@ -696,6 +697,42 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
 	return (u64)lo | ((u64)hi << 32);
 }
 
+struct mpwrq_cqe_bc {
+	__be16	filler_consumed_strides;
+	__be16	byte_cnt;
+};
+
+static inline u16 mpwrq_get_cqe_byte_cnt(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return be16_to_cpu(bc->byte_cnt);
+}
+
+static inline u16 mpwrq_get_cqe_bc_consumed_strides(struct mpwrq_cqe_bc *bc)
+{
+	return 0x7fff & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_consumed_strides(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return mpwrq_get_cqe_bc_consumed_strides(bc);
+}
+
+static inline bool mpwrq_is_filler_cqe(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return 0x8000 & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_stride_index(struct mlx5_cqe64 *cqe)
+{
+	return be16_to_cpu(cqe->wqe_counter);
+}
+
 enum {
 	CQE_L4_HDR_TYPE_NONE			= 0x0,
 	CQE_L4_HDR_TYPE_TCP_NO_ACK		= 0x1,
-- 
cgit v1.2.3


From b7aade15485a660cbf5161962c284131324a9534 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Apr 2016 21:19:47 +0200
Subject: vxlan: break dependency with netdev drivers

Currently all drivers depend and autoload the vxlan module because how
vxlan_get_rx_port is linked into them. Remove this dependency:

By using a new event type in the netdevice notifier call chain we proxy
the request from the drivers to flush and resetup the vxlan ports not
directly via function call but by the already existing netdevice
notifier call chain.

I added a separate new event type, NETDEV_OFFLOAD_PUSH_VXLAN, to do so.
We don't need to save those ids, as the event type field is an unsigned
long and using specialized event types for this purpose seemed to be a
more elegant way. This also comes in beneficial if in future we want to
add offloading knobs for vxlan.

Cc: Jesse Gross <jesse@kernel.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 14 +++++++++-----
 include/linux/netdevice.h |  1 +
 include/net/vxlan.h       |  6 ++----
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c2e22c2532a1..6fb93b57a724 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2527,7 +2527,7 @@ static struct device_type vxlan_type = {
  * supply the listening VXLAN udp ports. Callers are expected
  * to implement the ndo_add_vxlan_port.
  */
-void vxlan_get_rx_port(struct net_device *dev)
+static void vxlan_push_rx_ports(struct net_device *dev)
 {
 	struct vxlan_sock *vs;
 	struct net *net = dev_net(dev);
@@ -2536,6 +2536,9 @@ void vxlan_get_rx_port(struct net_device *dev)
 	__be16 port;
 	unsigned int i;
 
+	if (!dev->netdev_ops->ndo_add_vxlan_port)
+		return;
+
 	spin_lock(&vn->sock_lock);
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
 		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
@@ -2547,7 +2550,6 @@ void vxlan_get_rx_port(struct net_device *dev)
 	}
 	spin_unlock(&vn->sock_lock);
 }
-EXPORT_SYMBOL_GPL(vxlan_get_rx_port);
 
 /* Initialize the device structure. */
 static void vxlan_setup(struct net_device *dev)
@@ -3283,20 +3285,22 @@ static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
 	unregister_netdevice_many(&list_kill);
 }
 
-static int vxlan_lowerdev_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
+static int vxlan_netdevice_event(struct notifier_block *unused,
+				 unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 
 	if (event == NETDEV_UNREGISTER)
 		vxlan_handle_lowerdev_unregister(vn, dev);
+	else if (event == NETDEV_OFFLOAD_PUSH_VXLAN)
+		vxlan_push_rx_ports(dev);
 
 	return NOTIFY_DONE;
 }
 
 static struct notifier_block vxlan_notifier_block __read_mostly = {
-	.notifier_call = vxlan_lowerdev_event,
+	.notifier_call = vxlan_netdevice_event,
 };
 
 static __net_init int vxlan_init_net(struct net *net)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3bb534576a3..d4c8cd424f8d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2244,6 +2244,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_BONDING_INFO	0x0019
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
+#define NETDEV_OFFLOAD_PUSH_VXLAN	0x001C
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index d442eb3129cd..673e9f9e6da7 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -390,13 +390,11 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset)
 	return vni_field;
 }
 
-#if IS_ENABLED(CONFIG_VXLAN)
-void vxlan_get_rx_port(struct net_device *netdev);
-#else
 static inline void vxlan_get_rx_port(struct net_device *netdev)
 {
+	ASSERT_RTNL();
+	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev);
 }
-#endif
 
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 {
-- 
cgit v1.2.3


From 681e683ff30ada19f73c17c38a528528dd8824f1 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Apr 2016 21:19:48 +0200
Subject: geneve: break dependency with netdev drivers

Equivalent to "vxlan: break dependency with netdev drivers", don't
autoload geneve module in case the driver is loaded. Instead make the
coupling weaker by using netdevice notifiers as proxy.

Cc: Jesse Gross <jesse@kernel.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      | 31 ++++++++++++++++++++++++++++---
 include/linux/netdevice.h |  1 +
 include/net/geneve.h      |  6 ++----
 3 files changed, 31 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 512dbe013713..9c40b88fabd5 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1172,7 +1172,7 @@ static struct device_type geneve_type = {
  * supply the listening GENEVE udp ports. Callers are expected
  * to implement the ndo_add_geneve_port.
  */
-void geneve_get_rx_port(struct net_device *dev)
+static void geneve_push_rx_ports(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
@@ -1181,6 +1181,9 @@ void geneve_get_rx_port(struct net_device *dev)
 	struct sock *sk;
 	__be16 port;
 
+	if (!dev->netdev_ops->ndo_add_geneve_port)
+		return;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(gs, &gn->sock_list, list) {
 		sk = gs->sock->sk;
@@ -1190,7 +1193,6 @@ void geneve_get_rx_port(struct net_device *dev)
 	}
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL_GPL(geneve_get_rx_port);
 
 /* Initialize the device structure. */
 static void geneve_setup(struct net_device *dev)
@@ -1538,6 +1540,21 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 }
 EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
 
+static int geneve_netdevice_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_OFFLOAD_PUSH_GENEVE)
+		geneve_push_rx_ports(dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block geneve_notifier_block __read_mostly = {
+	.notifier_call = geneve_netdevice_event,
+};
+
 static __net_init int geneve_init_net(struct net *net)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
@@ -1590,11 +1607,18 @@ static int __init geneve_init_module(void)
 	if (rc)
 		goto out1;
 
-	rc = rtnl_link_register(&geneve_link_ops);
+	rc = register_netdevice_notifier(&geneve_notifier_block);
 	if (rc)
 		goto out2;
 
+	rc = rtnl_link_register(&geneve_link_ops);
+	if (rc)
+		goto out3;
+
 	return 0;
+
+out3:
+	unregister_netdevice_notifier(&geneve_notifier_block);
 out2:
 	unregister_pernet_subsys(&geneve_net_ops);
 out1:
@@ -1605,6 +1629,7 @@ late_initcall(geneve_init_module);
 static void __exit geneve_cleanup_module(void)
 {
 	rtnl_link_unregister(&geneve_link_ops);
+	unregister_netdevice_notifier(&geneve_notifier_block);
 	unregister_pernet_subsys(&geneve_net_ops);
 }
 module_exit(geneve_cleanup_module);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4c8cd424f8d..1f6d5db471a2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2245,6 +2245,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
 #define NETDEV_OFFLOAD_PUSH_VXLAN	0x001C
+#define NETDEV_OFFLOAD_PUSH_GENEVE	0x001D
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/include/net/geneve.h b/include/net/geneve.h
index e6c23dc765f7..cb544a530146 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -62,13 +62,11 @@ struct genevehdr {
 	struct geneve_opt options[];
 };
 
-#if IS_ENABLED(CONFIG_GENEVE)
-void geneve_get_rx_port(struct net_device *netdev);
-#else
 static inline void geneve_get_rx_port(struct net_device *netdev)
 {
+	ASSERT_RTNL();
+	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_GENEVE, netdev);
 }
-#endif
 
 #ifdef CONFIG_INET
 struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
-- 
cgit v1.2.3


From e9bbe898cbe89b17ad3993c136aa13d0431cd537 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 22 Apr 2016 17:31:19 +0200
Subject: libnl: nla_put_net64(): align on a 64-bit area

nla_data() is now aligned on a 64-bit area.

The temporary function nla_put_be64_32bit() is removed in this patch.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/ipset/ip_set.h      |  9 ++++++---
 include/net/netlink.h                       | 14 ++++++--------
 include/uapi/linux/netfilter/ipset/ip_set.h |  1 +
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f48b8a664b0f..83b9a2e0d8d4 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -351,7 +351,8 @@ ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
 		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
 			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask))) ||
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
 	       (skbinfo->skbprio &&
 		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
 			      cpu_to_be32(skbinfo->skbprio))) ||
@@ -374,9 +375,11 @@ static inline bool
 ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter)
 {
 	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter))) ||
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
 	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)));
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
 }
 
 static inline void
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 47d7d1356fa3..066a921e7cbe 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -868,20 +868,18 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
 	return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr);
 }
 
-static inline int nla_put_be64_32bit(struct sk_buff *skb, int attrtype,
-				     __be64 value)
-{
-	return nla_put(skb, attrtype, sizeof(__be64), &value);
-}
 /**
- * nla_put_net64 - Add 64-bit network byte order netlink attribute to a socket buffer
+ * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
  * @skb: socket buffer to add attribute to
  * @attrtype: attribute type
  * @value: numeric value
+ * @padattr: attribute type for the padding
  */
-static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value)
+static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
+				int padattr)
 {
-	return nla_put_be64_32bit(skb, attrtype | NLA_F_NET_BYTEORDER, value);
+	return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value,
+			    padattr);
 }
 
 /**
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 63b2e34f1b60..ebb5154976de 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -118,6 +118,7 @@ enum {
 	IPSET_ATTR_SKBMARK,
 	IPSET_ATTR_SKBPRIO,
 	IPSET_ATTR_SKBQUEUE,
+	IPSET_ATTR_PAD,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
-- 
cgit v1.2.3


From a558da0916b90c330940a106105d0a6a67cb77f7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 25 Apr 2016 10:25:20 +0200
Subject: ieee802154: use nla_put_u64_64bit()

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/nl802154.h  |  2 ++
 net/ieee802154/nl-mac.c   | 17 +++++++++++------
 net/ieee802154/nl802154.c |  3 ++-
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index 167342c2ce6b..0f6f6607f592 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -92,6 +92,8 @@ enum {
 	IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
 	IEEE802154_ATTR_LLSEC_DEV_KEY_MODE,
 
+	IEEE802154_ATTR_PAD,
+
 	__IEEE802154_ATTR_MAX,
 };
 
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 3503c38954f9..d3cbb3258718 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -34,9 +34,11 @@
 
 #include "ieee802154.h"
 
-static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr)
+static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr,
+			  int padattr)
 {
-	return nla_put_u64(msg, type, swab64((__force u64)hwaddr));
+	return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr),
+				 padattr);
 }
 
 static __le64 nla_get_hwaddr(const struct nlattr *nla)
@@ -623,7 +625,8 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
 
 		if (desc->device_addr.mode == IEEE802154_ADDR_LONG &&
 		    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR,
-				   desc->device_addr.extended_addr))
+				   desc->device_addr.extended_addr,
+				   IEEE802154_ATTR_PAD))
 			return -EMSGSIZE;
 	}
 
@@ -638,7 +641,7 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
 
 	if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
 	    nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED,
-			   desc->extended_source))
+			   desc->extended_source, IEEE802154_ATTR_PAD))
 		return -EMSGSIZE;
 
 	return 0;
@@ -1063,7 +1066,8 @@ ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq,
 	    nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) ||
 	    nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
 			      desc->short_addr) ||
-	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) ||
+	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr,
+			   IEEE802154_ATTR_PAD) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
 			desc->frame_counter) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
@@ -1167,7 +1171,8 @@ ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq,
 
 	if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
-	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) ||
+	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr,
+			   IEEE802154_ATTR_PAD) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
 			devkey->frame_counter) ||
 	    ieee802154_llsec_fill_key_id(msg, &devkey->key_id))
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 614072064d03..8035c93dd527 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -813,7 +813,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 
 	if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
 	    nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) ||
-	    nla_put_u64(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev)) ||
+	    nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+			      wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) ||
 	    nla_put_u32(msg, NL802154_ATTR_GENERATION,
 			rdev->devlist_generation ^
 			(cfg802154_rdev_list_generation << 2)))
-- 
cgit v1.2.3


From d4967cf38fbd62467b8fb5cab63d7da1f5907ed7 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Fri, 22 Apr 2016 08:41:01 +0300
Subject: qed*: Align statistics names

There's a difference in statsitics' names starting at qed and
propagating to qede, where egress counters indicate ranges while ingress
counters indiciate high-end.
Align all statistcs to follow the same conventions - name indicates range.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede.h         | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 29 ++++++++++++++++---------
 include/linux/qed/qed_if.h                      | 20 ++++++++---------
 5 files changed, 59 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index fb5f3b815340..31e1d510a991 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1415,16 +1415,16 @@ static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn,
 			sizeof(port_stats));
 
 	p_stats->rx_64_byte_packets		+= port_stats.pmm.r64;
-	p_stats->rx_127_byte_packets		+= port_stats.pmm.r127;
-	p_stats->rx_255_byte_packets		+= port_stats.pmm.r255;
-	p_stats->rx_511_byte_packets		+= port_stats.pmm.r511;
-	p_stats->rx_1023_byte_packets		+= port_stats.pmm.r1023;
-	p_stats->rx_1518_byte_packets		+= port_stats.pmm.r1518;
-	p_stats->rx_1522_byte_packets		+= port_stats.pmm.r1522;
-	p_stats->rx_2047_byte_packets		+= port_stats.pmm.r2047;
-	p_stats->rx_4095_byte_packets		+= port_stats.pmm.r4095;
-	p_stats->rx_9216_byte_packets		+= port_stats.pmm.r9216;
-	p_stats->rx_16383_byte_packets		+= port_stats.pmm.r16383;
+	p_stats->rx_65_to_127_byte_packets	+= port_stats.pmm.r127;
+	p_stats->rx_128_to_255_byte_packets	+= port_stats.pmm.r255;
+	p_stats->rx_256_to_511_byte_packets	+= port_stats.pmm.r511;
+	p_stats->rx_512_to_1023_byte_packets	+= port_stats.pmm.r1023;
+	p_stats->rx_1024_to_1518_byte_packets	+= port_stats.pmm.r1518;
+	p_stats->rx_1519_to_1522_byte_packets	+= port_stats.pmm.r1522;
+	p_stats->rx_1519_to_2047_byte_packets	+= port_stats.pmm.r2047;
+	p_stats->rx_2048_to_4095_byte_packets	+= port_stats.pmm.r4095;
+	p_stats->rx_4096_to_9216_byte_packets	+= port_stats.pmm.r9216;
+	p_stats->rx_9217_to_16383_byte_packets	+= port_stats.pmm.r16383;
 	p_stats->rx_crc_errors			+= port_stats.pmm.rfcs;
 	p_stats->rx_mac_crtl_frames		+= port_stats.pmm.rxcf;
 	p_stats->rx_pause_frames		+= port_stats.pmm.rxpf;
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 16df1591388f..a687e7a1dc8d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -59,16 +59,16 @@ struct qede_stats {
 
 	/* port */
 	u64 rx_64_byte_packets;
-	u64 rx_127_byte_packets;
-	u64 rx_255_byte_packets;
-	u64 rx_511_byte_packets;
-	u64 rx_1023_byte_packets;
-	u64 rx_1518_byte_packets;
-	u64 rx_1522_byte_packets;
-	u64 rx_2047_byte_packets;
-	u64 rx_4095_byte_packets;
-	u64 rx_9216_byte_packets;
-	u64 rx_16383_byte_packets;
+	u64 rx_65_to_127_byte_packets;
+	u64 rx_128_to_255_byte_packets;
+	u64 rx_256_to_511_byte_packets;
+	u64 rx_512_to_1023_byte_packets;
+	u64 rx_1024_to_1518_byte_packets;
+	u64 rx_1519_to_1522_byte_packets;
+	u64 rx_1519_to_2047_byte_packets;
+	u64 rx_2048_to_4095_byte_packets;
+	u64 rx_4096_to_9216_byte_packets;
+	u64 rx_9217_to_16383_byte_packets;
 	u64 rx_crc_errors;
 	u64 rx_mac_crtl_frames;
 	u64 rx_pause_frames;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index f0982f163670..f87e83b41d5d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -59,16 +59,16 @@ static const struct {
 	QEDE_STAT(tx_bcast_pkts),
 
 	QEDE_PF_STAT(rx_64_byte_packets),
-	QEDE_PF_STAT(rx_127_byte_packets),
-	QEDE_PF_STAT(rx_255_byte_packets),
-	QEDE_PF_STAT(rx_511_byte_packets),
-	QEDE_PF_STAT(rx_1023_byte_packets),
-	QEDE_PF_STAT(rx_1518_byte_packets),
-	QEDE_PF_STAT(rx_1522_byte_packets),
-	QEDE_PF_STAT(rx_2047_byte_packets),
-	QEDE_PF_STAT(rx_4095_byte_packets),
-	QEDE_PF_STAT(rx_9216_byte_packets),
-	QEDE_PF_STAT(rx_16383_byte_packets),
+	QEDE_PF_STAT(rx_65_to_127_byte_packets),
+	QEDE_PF_STAT(rx_128_to_255_byte_packets),
+	QEDE_PF_STAT(rx_256_to_511_byte_packets),
+	QEDE_PF_STAT(rx_512_to_1023_byte_packets),
+	QEDE_PF_STAT(rx_1024_to_1518_byte_packets),
+	QEDE_PF_STAT(rx_1519_to_1522_byte_packets),
+	QEDE_PF_STAT(rx_1519_to_2047_byte_packets),
+	QEDE_PF_STAT(rx_2048_to_4095_byte_packets),
+	QEDE_PF_STAT(rx_4096_to_9216_byte_packets),
+	QEDE_PF_STAT(rx_9217_to_16383_byte_packets),
 	QEDE_PF_STAT(tx_64_byte_packets),
 	QEDE_PF_STAT(tx_65_to_127_byte_packets),
 	QEDE_PF_STAT(tx_128_to_255_byte_packets),
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 197ef85684da..1e3ee49bae24 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1638,16 +1638,25 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	edev->stats.coalesced_bytes = stats.tpa_coalesced_bytes;
 
 	edev->stats.rx_64_byte_packets = stats.rx_64_byte_packets;
-	edev->stats.rx_127_byte_packets = stats.rx_127_byte_packets;
-	edev->stats.rx_255_byte_packets = stats.rx_255_byte_packets;
-	edev->stats.rx_511_byte_packets = stats.rx_511_byte_packets;
-	edev->stats.rx_1023_byte_packets = stats.rx_1023_byte_packets;
-	edev->stats.rx_1518_byte_packets = stats.rx_1518_byte_packets;
-	edev->stats.rx_1522_byte_packets = stats.rx_1522_byte_packets;
-	edev->stats.rx_2047_byte_packets = stats.rx_2047_byte_packets;
-	edev->stats.rx_4095_byte_packets = stats.rx_4095_byte_packets;
-	edev->stats.rx_9216_byte_packets = stats.rx_9216_byte_packets;
-	edev->stats.rx_16383_byte_packets = stats.rx_16383_byte_packets;
+	edev->stats.rx_65_to_127_byte_packets = stats.rx_65_to_127_byte_packets;
+	edev->stats.rx_128_to_255_byte_packets =
+				stats.rx_128_to_255_byte_packets;
+	edev->stats.rx_256_to_511_byte_packets =
+				stats.rx_256_to_511_byte_packets;
+	edev->stats.rx_512_to_1023_byte_packets =
+				stats.rx_512_to_1023_byte_packets;
+	edev->stats.rx_1024_to_1518_byte_packets =
+				stats.rx_1024_to_1518_byte_packets;
+	edev->stats.rx_1519_to_1522_byte_packets =
+				stats.rx_1519_to_1522_byte_packets;
+	edev->stats.rx_1519_to_2047_byte_packets =
+				stats.rx_1519_to_2047_byte_packets;
+	edev->stats.rx_2048_to_4095_byte_packets =
+				stats.rx_2048_to_4095_byte_packets;
+	edev->stats.rx_4096_to_9216_byte_packets =
+				stats.rx_4096_to_9216_byte_packets;
+	edev->stats.rx_9217_to_16383_byte_packets =
+				stats.rx_9217_to_16383_byte_packets;
 	edev->stats.rx_crc_errors = stats.rx_crc_errors;
 	edev->stats.rx_mac_crtl_frames = stats.rx_mac_crtl_frames;
 	edev->stats.rx_pause_frames = stats.rx_pause_frames;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 67e8c206b2c1..82a7fe011068 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -384,16 +384,16 @@ struct qed_eth_stats {
 
 	/* port */
 	u64	rx_64_byte_packets;
-	u64	rx_127_byte_packets;
-	u64	rx_255_byte_packets;
-	u64	rx_511_byte_packets;
-	u64	rx_1023_byte_packets;
-	u64	rx_1518_byte_packets;
-	u64	rx_1522_byte_packets;
-	u64	rx_2047_byte_packets;
-	u64	rx_4095_byte_packets;
-	u64	rx_9216_byte_packets;
-	u64	rx_16383_byte_packets;
+	u64	rx_65_to_127_byte_packets;
+	u64	rx_128_to_255_byte_packets;
+	u64	rx_256_to_511_byte_packets;
+	u64	rx_512_to_1023_byte_packets;
+	u64	rx_1024_to_1518_byte_packets;
+	u64	rx_1519_to_1522_byte_packets;
+	u64	rx_1519_to_2047_byte_packets;
+	u64	rx_2048_to_4095_byte_packets;
+	u64	rx_4096_to_9216_byte_packets;
+	u64	rx_9217_to_16383_byte_packets;
 	u64	rx_crc_errors;
 	u64	rx_mac_crtl_frames;
 	u64	rx_pause_frames;
-- 
cgit v1.2.3


From fe7cd2bfdac4d8739bc8665eef040e668e6b428f Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Fri, 22 Apr 2016 08:41:03 +0300
Subject: qed*: Conditions for changing link

There's some inconsistency in current logic determining whether the
link settings of a given interface can be changed; I.e., in all modes
other than the so-called `deault' mode the interfaces are forbidden from
changing the configuration - but even this rule is not applied to all
user APIs that may change the configuration.

Instead, let the core-module [qed] decide whether an interface can change
the configuration by supporting a new API function. We also revise the
current rule, allowing all interfaces to change their configurations while
laying the infrastructure for future modes where an interface would be
blocked from making such a configuration.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c      |  6 ++++++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 14 ++++++++++----
 include/linux/qed/qed_if.h                      | 10 ++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1e9f321f1ac4..d189871e8e23 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -915,6 +915,11 @@ static u32 qed_sb_release(struct qed_dev *cdev,
 	return rc;
 }
 
+static bool qed_can_link_change(struct qed_dev *cdev)
+{
+	return true;
+}
+
 static int qed_set_link(struct qed_dev *cdev,
 			struct qed_link_params *params)
 {
@@ -1177,6 +1182,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.sb_release = &qed_sb_release,
 	.simd_handler_config = &qed_simd_handler_config,
 	.simd_handler_clean = &qed_simd_handler_clean,
+	.can_link_change = &qed_can_link_change,
 	.set_link = &qed_set_link,
 	.get_link = &qed_get_current_link,
 	.drain = &qed_drain,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 2ac98d44c1e1..f1dd25ac5552 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -239,9 +239,9 @@ static int qede_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	struct qed_link_params params;
 	u32 speed;
 
-	if (!edev->dev_info.common.is_mf_default) {
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
 		DP_INFO(edev,
-			"Link parameters can not be changed in non-default mode\n");
+			"Link settings are not allowed to be changed\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -350,6 +350,12 @@ static int qede_nway_reset(struct net_device *dev)
 	struct qed_link_output current_link;
 	struct qed_link_params link_params;
 
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
+		DP_INFO(edev,
+			"Link settings are not allowed to be changed\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (!netif_running(dev))
 		return 0;
 
@@ -450,9 +456,9 @@ static int qede_set_pauseparam(struct net_device *dev,
 	struct qed_link_params params;
 	struct qed_link_output current_link;
 
-	if (!edev->dev_info.common.is_mf_default) {
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
 		DP_INFO(edev,
-			"Pause parameters can not be updated in non-default mode\n");
+			"Pause settings are not allowed to be changed\n");
 		return -EOPNOTSUPP;
 	}
 
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 82a7fe011068..e5de42b62976 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -211,6 +211,16 @@ struct qed_common_ops {
 
 	void		(*simd_handler_clean)(struct qed_dev *cdev,
 					      int index);
+
+/**
+ * @brief can_link_change - can the instance change the link or not
+ *
+ * @param cdev
+ *
+ * @return true if link-change is allowed, false otherwise.
+ */
+	bool (*can_link_change)(struct qed_dev *cdev);
+
 /**
  * @brief set_link - set links according to params
  *
-- 
cgit v1.2.3


From 6fa01ccd883021105e9f8af7d04b9f156fa3494a Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 22 Apr 2016 18:36:35 -0700
Subject: skbuff: Add pskb_extract() helper function

A pattern of skb usage seen in modules such as RDS-TCP is to
extract `to_copy' bytes from the received TCP segment, starting
at some offset `off' into a new skb `clone'. This is done in
the ->data_ready callback, where the clone skb is queued up for rx on
the PF_RDS socket, while the parent TCP segment is returned unchanged
back to the TCP engine.

The existing code uses the sequence
	clone = skb_clone(..);
	pskb_pull(clone, off, ..);
	pskb_trim(clone, to_copy, ..);
with the intention of discarding the first `off' bytes. However,
skb_clone() + pskb_pull() implies pksb_expand_head(), which ends
up doing a redundant memcpy of bytes that will then get discarded
in __pskb_pull_tail().

To avoid this inefficiency, this commit adds pskb_extract() that
creates the clone, and memcpy's only the relevant header/frag/frag_list
to the start of `clone'. pskb_trim() is then invoked to trim clone
down to the requested to_copy bytes.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |   2 +
 net/core/skbuff.c      | 242 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index da0ace389fec..a1ce63979ad8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
+			     gfp_t gfp);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ff7788b0151..7a1d48983f81 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4622,3 +4622,245 @@ failure:
 	return NULL;
 }
 EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+				    const int headlen, gfp_t gfp_mask)
+{
+	int i;
+	int size = skb_end_offset(skb);
+	int new_hlen = headlen - off;
+	u8 *data;
+	int doff = 0;
+
+	size = SKB_DATA_ALIGN(size);
+
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size +
+			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
+	if (!data)
+		return -ENOMEM;
+
+	size = SKB_WITH_OVERHEAD(ksize(data));
+
+	/* Copy real data, and all frags */
+	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+	skb->len -= off;
+
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb),
+	       offsetof(struct skb_shared_info,
+			frags[skb_shinfo(skb)->nr_frags]));
+	if (skb_cloned(skb)) {
+		/* drop the old head gracefully */
+		if (skb_orphan_frags(skb, gfp_mask)) {
+			kfree(data);
+			return -ENOMEM;
+		}
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			skb_frag_ref(skb, i);
+		if (skb_has_frag_list(skb))
+			skb_clone_fraglist(skb);
+		skb_release_data(skb);
+	} else {
+		/* we can reuse existing recount- all we did was
+		 * relocate values
+		 */
+		skb_free_head(skb);
+	}
+
+	doff = (data - skb->head);
+	skb->head = data;
+	skb->data = data;
+	skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->end = size;
+	doff = 0;
+#else
+	skb->end = skb->head + size;
+#endif
+	skb_set_tail_pointer(skb, skb_headlen(skb));
+	skb_headers_offset_update(skb, 0);
+	skb->cloned = 0;
+	skb->hdr_len = 0;
+	skb->nohdr = 0;
+	atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+	return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+				struct skb_shared_info *shinfo, int eat,
+				gfp_t gfp_mask)
+{
+	struct sk_buff *list = shinfo->frag_list;
+	struct sk_buff *clone = NULL;
+	struct sk_buff *insp = NULL;
+
+	do {
+		if (!list) {
+			pr_err("Not enough bytes to eat. Want %d\n", eat);
+			return -EFAULT;
+		}
+		if (list->len <= eat) {
+			/* Eaten as whole. */
+			eat -= list->len;
+			list = list->next;
+			insp = list;
+		} else {
+			/* Eaten partially. */
+			if (skb_shared(list)) {
+				clone = skb_clone(list, gfp_mask);
+				if (!clone)
+					return -ENOMEM;
+				insp = list->next;
+				list = clone;
+			} else {
+				/* This may be pulled without problems. */
+				insp = list;
+			}
+			if (pskb_carve(list, eat, gfp_mask) < 0) {
+				kfree_skb(clone);
+				return -ENOMEM;
+			}
+			break;
+		}
+	} while (eat);
+
+	/* Free pulled out fragments. */
+	while ((list = shinfo->frag_list) != insp) {
+		shinfo->frag_list = list->next;
+		kfree_skb(list);
+	}
+	/* And insert new clone at head. */
+	if (clone) {
+		clone->next = list;
+		shinfo->frag_list = clone;
+	}
+	return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+				       int pos, gfp_t gfp_mask)
+{
+	int i, k = 0;
+	int size = skb_end_offset(skb);
+	u8 *data;
+	const int nfrags = skb_shinfo(skb)->nr_frags;
+	struct skb_shared_info *shinfo;
+	int doff = 0;
+
+	size = SKB_DATA_ALIGN(size);
+
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size +
+			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
+	if (!data)
+		return -ENOMEM;
+
+	size = SKB_WITH_OVERHEAD(ksize(data));
+
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb), offsetof(struct skb_shared_info,
+					 frags[skb_shinfo(skb)->nr_frags]));
+	if (skb_orphan_frags(skb, gfp_mask)) {
+		kfree(data);
+		return -ENOMEM;
+	}
+	shinfo = (struct skb_shared_info *)(data + size);
+	for (i = 0; i < nfrags; i++) {
+		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+		if (pos + fsize > off) {
+			shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+			if (pos < off) {
+				/* Split frag.
+				 * We have two variants in this case:
+				 * 1. Move all the frag to the second
+				 *    part, if it is possible. F.e.
+				 *    this approach is mandatory for TUX,
+				 *    where splitting is expensive.
+				 * 2. Split is accurately. We make this.
+				 */
+				shinfo->frags[0].page_offset += off - pos;
+				skb_frag_size_sub(&shinfo->frags[0], off - pos);
+			}
+			skb_frag_ref(skb, i);
+			k++;
+		}
+		pos += fsize;
+	}
+	shinfo->nr_frags = k;
+	if (skb_has_frag_list(skb))
+		skb_clone_fraglist(skb);
+
+	if (k == 0) {
+		/* split line is in frag list */
+		pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+	}
+	skb_release_data(skb);
+
+	doff = (data - skb->head);
+	skb->head = data;
+	skb->head_frag = 0;
+	skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->end = size;
+	doff = 0;
+#else
+	skb->end = skb->head + size;
+#endif
+	skb_reset_tail_pointer(skb);
+	skb_headers_offset_update(skb, 0);
+	skb->cloned   = 0;
+	skb->hdr_len  = 0;
+	skb->nohdr    = 0;
+	skb->len -= off;
+	skb->data_len = skb->len;
+	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+	int headlen = skb_headlen(skb);
+
+	if (len < headlen)
+		return pskb_carve_inside_header(skb, len, headlen, gfp);
+	else
+		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+			     int to_copy, gfp_t gfp)
+{
+	struct sk_buff  *clone = skb_clone(skb, gfp);
+
+	if (!clone)
+		return NULL;
+
+	if (pskb_carve(clone, off, gfp) < 0 ||
+	    pskb_trim(clone, to_copy)) {
+		kfree_skb(clone);
+		return NULL;
+	}
+	return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
-- 
cgit v1.2.3


From f0cdf76c103ffa34ca5ac87dcdef7edffc722cbf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 24 Apr 2016 21:38:14 +0200
Subject: net: remove NETDEV_TX_LOCKED support

No more users in the tree, remove NETDEV_TX_LOCKED support.
Adds another hole in softnet_stats struct, but better than keeping
the unused collision counter around.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-features.txt | 10 ++++-----
 Documentation/networking/netdevices.txt      |  9 +++-----
 include/linux/netdevice.h                    |  3 ---
 net/core/net-procfs.c                        |  3 ++-
 net/core/pktgen.c                            |  1 -
 net/sched/sch_generic.c                      | 32 ----------------------------
 6 files changed, 9 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt
index f310edec8a77..7413eb05223b 100644
--- a/Documentation/networking/netdev-features.txt
+++ b/Documentation/networking/netdev-features.txt
@@ -131,13 +131,11 @@ stack. Driver should not change behaviour based on them.
 
  * LLTX driver (deprecated for hardware drivers)
 
-NETIF_F_LLTX should be set in drivers that implement their own locking in
-transmit path or don't need locking at all (e.g. software tunnels).
-In ndo_start_xmit, it is recommended to use a try_lock and return
-NETDEV_TX_LOCKED when the spin lock fails.  The locking should also properly
-protect against other callbacks (the rules you need to find out).
+NETIF_F_LLTX is meant to be used by drivers that don't need locking at all,
+e.g. software tunnels.
 
-Don't use it for new drivers.
+This is also used in a few legacy drivers that implement their
+own locking, don't use it for new (hardware) drivers.
 
  * netns-local device
 
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index 0b1cf6b2a592..7fec2061a334 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -69,10 +69,9 @@ ndo_start_xmit:
 
 	When the driver sets NETIF_F_LLTX in dev->features this will be
 	called without holding netif_tx_lock. In this case the driver
-	has to lock by itself when needed. It is recommended to use a try lock
-	for this and return NETDEV_TX_LOCKED when the spin lock fails.
-	The locking there should also properly protect against 
-	set_rx_mode. Note that the use of NETIF_F_LLTX is deprecated.
+	has to lock by itself when needed.
+	The locking there should also properly protect against
+	set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated.
 	Don't use it for new drivers.
 
 	Context: Process with BHs disabled or BH (timer),
@@ -83,8 +82,6 @@ ndo_start_xmit:
 	o NETDEV_TX_BUSY Cannot transmit packet, try later 
 	  Usually a bug, means queue start/stop flow control is broken in
 	  the driver. Note: the driver must NOT put the skb in its DMA ring.
-	o NETDEV_TX_LOCKED Locking failed, please retry quickly.
-	  Only valid when NETIF_F_LLTX is set.
 
 ndo_tx_timeout:
 	Synchronization: netif_tx_lock spinlock; all TX queues frozen.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1f6d5db471a2..18d8394f2e5d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -106,7 +106,6 @@ enum netdev_tx {
 	__NETDEV_TX_MIN	 = INT_MIN,	/* make sure enum is signed */
 	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
 	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
-	NETDEV_TX_LOCKED = 0x20,	/* driver tx lock was already taken */
 };
 typedef enum netdev_tx netdev_tx_t;
 
@@ -831,7 +830,6 @@ struct tc_to_netdev {
  *	the queue before that can happen; it's for obsolete devices and weird
  *	corner cases, but the stack really does a non-trivial amount
  *	of useless work if you return NETDEV_TX_BUSY.
- *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
  *	Required; cannot be NULL.
  *
  * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
@@ -2737,7 +2735,6 @@ struct softnet_data {
 	/* stats */
 	unsigned int		processed;
 	unsigned int		time_squeeze;
-	unsigned int		cpu_collision;
 	unsigned int		received_rps;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 2bf83299600a..14d09345f00d 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps, flow_limit_count);
+		   0,	/* was cpu_collision */
+		   sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 20999aa596dd..8604ae245960 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3472,7 +3472,6 @@ xmit_more:
 				     pkt_dev->odevname, ret);
 		pkt_dev->errors++;
 		/* fallthru */
-	case NETDEV_TX_LOCKED:
 	case NETDEV_TX_BUSY:
 		/* Retry it next time */
 		atomic_dec(&(pkt_dev->skb->users));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80742edea96f..9c7756237904 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -108,35 +108,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	return skb;
 }
 
-static inline int handle_dev_cpu_collision(struct sk_buff *skb,
-					   struct netdev_queue *dev_queue,
-					   struct Qdisc *q)
-{
-	int ret;
-
-	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
-		/*
-		 * Same CPU holding the lock. It may be a transient
-		 * configuration error, when hard_start_xmit() recurses. We
-		 * detect it by checking xmit owner and drop the packet when
-		 * deadloop is detected. Return OK to try the next skb.
-		 */
-		kfree_skb_list(skb);
-		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
-				     dev_queue->dev->name);
-		ret = qdisc_qlen(q);
-	} else {
-		/*
-		 * Another cpu is holding lock, requeue & delay xmits for
-		 * some time.
-		 */
-		__this_cpu_inc(softnet_data.cpu_collision);
-		ret = dev_requeue_skb(skb, q);
-	}
-
-	return ret;
-}
-
 /*
  * Transmit possibly several skbs, and handle the return status as
  * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
@@ -174,9 +145,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
 		ret = qdisc_qlen(q);
-	} else if (ret == NETDEV_TX_LOCKED) {
-		/* Driver try lock failed */
-		ret = handle_dev_cpu_collision(skb, txq, q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
-- 
cgit v1.2.3


From 9218b44dcc059e08e249f6f7614b8e391eb041d8 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:47 +0300
Subject: net/mlx5e: Statistics handling refactoring

Redesign ethtool statistics handling and reporting in the driver:
1. Move counters to a separate file (en_stats.h).
2. Remove unnecessary dependencies between stats and strings.
3. Use counter descriptors which hold a name and offset for each counter,
   and will be used to decide which counters will be exposed.

For example when adding a new software counter to ethtool, instead of:
1. Add to stats struct.
2. Add to strings struct in the same order.
3. Change macro defining number of software counters.
The only thing needed is to link the new counter to a counter descriptor.

VPort counters are a set of hardware traffic counters created automatically
for each virtual port opened.
PPort counters are a set of counters describing per physical port
performance statistics.
These counters are gathered from hardware register and divided to groups
according to different protocols.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 240 +---------------
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 132 ++++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 232 +++++----------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 318 +++++++++++++++++++++
 include/linux/mlx5/device.h                        |   1 +
 5 files changed, 483 insertions(+), 440 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6e24e821a1d8..e903eff2574f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -46,6 +46,7 @@
 #include <linux/rhashtable.h>
 #include "wq.h"
 #include "mlx5_core.h"
+#include "en_stats.h"
 
 #define MLX5E_MAX_NUM_TC	8
 
@@ -148,245 +149,6 @@ struct mlx5e_umr_wqe {
 #define MLX5E_MIN_BW_ALLOC 1   /* Min percentage of BW allocation */
 #endif
 
-static const char vport_strings[][ETH_GSTRING_LEN] = {
-	/* vport statistics */
-	"rx_packets",
-	"rx_bytes",
-	"tx_packets",
-	"tx_bytes",
-	"rx_error_packets",
-	"rx_error_bytes",
-	"tx_error_packets",
-	"tx_error_bytes",
-	"rx_unicast_packets",
-	"rx_unicast_bytes",
-	"tx_unicast_packets",
-	"tx_unicast_bytes",
-	"rx_multicast_packets",
-	"rx_multicast_bytes",
-	"tx_multicast_packets",
-	"tx_multicast_bytes",
-	"rx_broadcast_packets",
-	"rx_broadcast_bytes",
-	"tx_broadcast_packets",
-	"tx_broadcast_bytes",
-
-	/* SW counters */
-	"tso_packets",
-	"tso_bytes",
-	"tso_inner_packets",
-	"tso_inner_bytes",
-	"lro_packets",
-	"lro_bytes",
-	"rx_csum_good",
-	"rx_csum_none",
-	"rx_csum_sw",
-	"tx_csum_offload",
-	"tx_csum_inner",
-	"tx_queue_stopped",
-	"tx_queue_wake",
-	"tx_queue_dropped",
-	"rx_wqe_err",
-	"rx_mpwqe_filler",
-	"rx_mpwqe_frag",
-	"rx_buff_alloc_err",
-};
-
-struct mlx5e_vport_stats {
-	/* HW counters */
-	u64 rx_packets;
-	u64 rx_bytes;
-	u64 tx_packets;
-	u64 tx_bytes;
-	u64 rx_error_packets;
-	u64 rx_error_bytes;
-	u64 tx_error_packets;
-	u64 tx_error_bytes;
-	u64 rx_unicast_packets;
-	u64 rx_unicast_bytes;
-	u64 tx_unicast_packets;
-	u64 tx_unicast_bytes;
-	u64 rx_multicast_packets;
-	u64 rx_multicast_bytes;
-	u64 tx_multicast_packets;
-	u64 tx_multicast_bytes;
-	u64 rx_broadcast_packets;
-	u64 rx_broadcast_bytes;
-	u64 tx_broadcast_packets;
-	u64 tx_broadcast_bytes;
-
-	/* SW counters */
-	u64 tso_packets;
-	u64 tso_bytes;
-	u64 tso_inner_packets;
-	u64 tso_inner_bytes;
-	u64 lro_packets;
-	u64 lro_bytes;
-	u64 rx_csum_good;
-	u64 rx_csum_none;
-	u64 rx_csum_sw;
-	u64 tx_csum_offload;
-	u64 tx_csum_inner;
-	u64 tx_queue_stopped;
-	u64 tx_queue_wake;
-	u64 tx_queue_dropped;
-	u64 rx_wqe_err;
-	u64 rx_mpwqe_filler;
-	u64 rx_mpwqe_frag;
-	u64 rx_buff_alloc_err;
-
-#define NUM_VPORT_COUNTERS     38
-};
-
-static const char pport_strings[][ETH_GSTRING_LEN] = {
-	/* IEEE802.3 counters */
-	"frames_tx",
-	"frames_rx",
-	"check_seq_err",
-	"alignment_err",
-	"octets_tx",
-	"octets_received",
-	"multicast_xmitted",
-	"broadcast_xmitted",
-	"multicast_rx",
-	"broadcast_rx",
-	"in_range_len_errors",
-	"out_of_range_len",
-	"too_long_errors",
-	"symbol_err",
-	"mac_control_tx",
-	"mac_control_rx",
-	"unsupported_op_rx",
-	"pause_ctrl_rx",
-	"pause_ctrl_tx",
-
-	/* RFC2863 counters */
-	"in_octets",
-	"in_ucast_pkts",
-	"in_discards",
-	"in_errors",
-	"in_unknown_protos",
-	"out_octets",
-	"out_ucast_pkts",
-	"out_discards",
-	"out_errors",
-	"in_multicast_pkts",
-	"in_broadcast_pkts",
-	"out_multicast_pkts",
-	"out_broadcast_pkts",
-
-	/* RFC2819 counters */
-	"drop_events",
-	"octets",
-	"pkts",
-	"broadcast_pkts",
-	"multicast_pkts",
-	"crc_align_errors",
-	"undersize_pkts",
-	"oversize_pkts",
-	"fragments",
-	"jabbers",
-	"collisions",
-	"p64octets",
-	"p65to127octets",
-	"p128to255octets",
-	"p256to511octets",
-	"p512to1023octets",
-	"p1024to1518octets",
-	"p1519to2047octets",
-	"p2048to4095octets",
-	"p4096to8191octets",
-	"p8192to10239octets",
-};
-
-#define NUM_IEEE_802_3_COUNTERS		19
-#define NUM_RFC_2863_COUNTERS		13
-#define NUM_RFC_2819_COUNTERS		21
-#define NUM_PPORT_COUNTERS		(NUM_IEEE_802_3_COUNTERS + \
-					 NUM_RFC_2863_COUNTERS + \
-					 NUM_RFC_2819_COUNTERS)
-
-struct mlx5e_pport_stats {
-	__be64 IEEE_802_3_counters[NUM_IEEE_802_3_COUNTERS];
-	__be64 RFC_2863_counters[NUM_RFC_2863_COUNTERS];
-	__be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS];
-};
-
-static const char qcounter_stats_strings[][ETH_GSTRING_LEN] = {
-	"rx_out_of_buffer",
-};
-
-struct mlx5e_qcounter_stats {
-	u32 rx_out_of_buffer;
-#define NUM_Q_COUNTERS 1
-};
-
-static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",
-	"bytes",
-	"csum_none",
-	"csum_sw",
-	"lro_packets",
-	"lro_bytes",
-	"wqe_err",
-	"mpwqe_filler",
-	"mpwqe_frag",
-	"buff_alloc_err",
-};
-
-struct mlx5e_rq_stats {
-	u64 packets;
-	u64 bytes;
-	u64 csum_none;
-	u64 csum_sw;
-	u64 lro_packets;
-	u64 lro_bytes;
-	u64 wqe_err;
-	u64 mpwqe_filler;
-	u64 mpwqe_frag;
-	u64 buff_alloc_err;
-#define NUM_RQ_STATS 10
-};
-
-static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",
-	"bytes",
-	"tso_packets",
-	"tso_bytes",
-	"tso_inner_packets",
-	"tso_inner_bytes",
-	"csum_offload_inner",
-	"nop",
-	"csum_offload_none",
-	"stopped",
-	"wake",
-	"dropped",
-};
-
-struct mlx5e_sq_stats {
-	/* commonly accessed in data path */
-	u64 packets;
-	u64 bytes;
-	u64 tso_packets;
-	u64 tso_bytes;
-	u64 tso_inner_packets;
-	u64 tso_inner_bytes;
-	u64 csum_offload_inner;
-	u64 nop;
-	/* less likely accessed in data path */
-	u64 csum_offload_none;
-	u64 stopped;
-	u64 wake;
-	u64 dropped;
-#define NUM_SQ_STATS 12
-};
-
-struct mlx5e_stats {
-	struct mlx5e_vport_stats   vport;
-	struct mlx5e_pport_stats   pport;
-	struct mlx5e_qcounter_stats qcnt;
-};
-
 struct mlx5e_params {
 	u8  log_sq_size;
 	u8  rq_wq_type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 4077856aab76..f1649d543475 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -166,6 +166,12 @@ static const struct {
 };
 
 #define MLX5E_NUM_Q_CNTRS(priv) (NUM_Q_COUNTERS * (!!priv->q_counter))
+#define MLX5E_NUM_RQ_STATS(priv) \
+	(NUM_RQ_STATS * priv->params.num_channels * \
+	 test_bit(MLX5E_STATE_OPENED, &priv->state))
+#define MLX5E_NUM_SQ_STATS(priv) \
+	(NUM_SQ_STATS * priv->params.num_channels * priv->params.num_tc * \
+	 test_bit(MLX5E_STATE_OPENED, &priv->state))
 
 static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 {
@@ -173,21 +179,68 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 
 	switch (sset) {
 	case ETH_SS_STATS:
-		return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
+		return NUM_SW_COUNTERS +
 		       MLX5E_NUM_Q_CNTRS(priv) +
-		       priv->params.num_channels * NUM_RQ_STATS +
-		       priv->params.num_channels * priv->params.num_tc *
-						   NUM_SQ_STATS;
+		       NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
+		       MLX5E_NUM_RQ_STATS(priv) +
+		       MLX5E_NUM_SQ_STATS(priv);
 	/* fallthrough */
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
+static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, uint8_t *data)
+{
+	int i, j, tc, idx = 0;
+
+	/* SW counters */
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, sw_stats_desc[i].name);
+
+	/* Q counters */
+	for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, q_stats_desc[i].name);
+
+	/* VPORT counters */
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       vport_stats_desc[i].name);
+
+	/* PPORT counters */
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_802_3_stats_desc[i].name);
+
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_2863_stats_desc[i].name);
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_2819_stats_desc[i].name);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return;
+
+	/* per channel counters */
+	for (i = 0; i < priv->params.num_channels; i++)
+		for (j = 0; j < NUM_RQ_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN, "rx%d_%s", i,
+				rq_stats_desc[j].name);
+
+	for (tc = 0; tc < priv->params.num_tc; tc++)
+		for (i = 0; i < priv->params.num_channels; i++)
+			for (j = 0; j < NUM_SQ_STATS; j++)
+				sprintf(data + (idx++) * ETH_GSTRING_LEN,
+					"tx%d_%s",
+					priv->channeltc_to_txq_map[i][tc],
+					sq_stats_desc[j].name);
+}
+
 static void mlx5e_get_strings(struct net_device *dev,
 			      uint32_t stringset, uint8_t *data)
 {
-	int i, j, tc, idx = 0;
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
 	switch (stringset) {
@@ -198,35 +251,7 @@ static void mlx5e_get_strings(struct net_device *dev,
 		break;
 
 	case ETH_SS_STATS:
-		/* VPORT counters */
-		for (i = 0; i < NUM_VPORT_COUNTERS; i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       vport_strings[i]);
-
-		/* Q counters */
-		for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       qcounter_stats_strings[i]);
-
-		/* PPORT counters */
-		for (i = 0; i < NUM_PPORT_COUNTERS; i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       pport_strings[i]);
-
-		/* per channel counters */
-		for (i = 0; i < priv->params.num_channels; i++)
-			for (j = 0; j < NUM_RQ_STATS; j++)
-				sprintf(data + (idx++) * ETH_GSTRING_LEN,
-					"rx%d_%s", i, rq_stats_strings[j]);
-
-		for (tc = 0; tc < priv->params.num_tc; tc++)
-			for (i = 0; i < priv->params.num_channels; i++)
-				for (j = 0; j < NUM_SQ_STATS; j++)
-					sprintf(data +
-					      (idx++) * ETH_GSTRING_LEN,
-					      "tx%d_%s",
-					      priv->channeltc_to_txq_map[i][tc],
-					      sq_stats_strings[j]);
+		mlx5e_fill_stats_strings(priv, data);
 		break;
 	}
 }
@@ -245,28 +270,45 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
 		mlx5e_update_stats(priv);
 	mutex_unlock(&priv->state_lock);
 
-	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
-		data[idx++] = ((u64 *)&priv->stats.vport)[i];
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw,
+						   sw_stats_desc, i);
 
 	for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
-		data[idx++] = ((u32 *)&priv->stats.qcnt)[i];
+		data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt,
+						   q_stats_desc, i);
+
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vport.query_vport_out,
+						  vport_stats_desc, i);
+
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.IEEE_802_3_counters,
+						  pport_802_3_stats_desc, i);
 
-	for (i = 0; i < NUM_PPORT_COUNTERS; i++)
-		data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2863_counters,
+						  pport_2863_stats_desc, i);
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2819_counters,
+						  pport_2819_stats_desc, i);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return;
 
 	/* per channel counters */
 	for (i = 0; i < priv->params.num_channels; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
-			data[idx++] = !test_bit(MLX5E_STATE_OPENED,
-						&priv->state) ? 0 :
-				       ((u64 *)&priv->channel[i]->rq.stats)[j];
+			data[idx++] =
+			       MLX5E_READ_CTR64_CPU(&priv->channel[i]->rq.stats,
+						    rq_stats_desc, j);
 
 	for (tc = 0; tc < priv->params.num_tc; tc++)
 		for (i = 0; i < priv->params.num_channels; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
-				data[idx++] = !test_bit(MLX5E_STATE_OPENED,
-							&priv->state) ? 0 :
-				((u64 *)&priv->channel[i]->sq[tc].stats)[j];
+				data[idx++] = MLX5E_READ_CTR64_CPU(&priv->channel[i]->sq[tc].stats,
+								   sq_stats_desc, j);
 }
 
 static void mlx5e_get_ringparam(struct net_device *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6270f8d539db..0c532367ff13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -91,96 +91,15 @@ static void mlx5e_update_carrier_work(struct work_struct *work)
 	mutex_unlock(&priv->state_lock);
 }
 
-static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_pport_stats *s = &priv->stats.pport;
-	u32 *in;
-	u32 *out;
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-
-	in  = mlx5_vzalloc(sz);
-	out = mlx5_vzalloc(sz);
-	if (!in || !out)
-		goto free_out;
-
-	MLX5_SET(ppcnt_reg, in, local_port, 1);
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->IEEE_802_3_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->IEEE_802_3_counters));
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->RFC_2863_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->RFC_2863_counters));
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->RFC_2819_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->RFC_2819_counters));
-
-free_out:
-	kvfree(in);
-	kvfree(out);
-}
-
-static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
-{
-	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
-
-	if (!priv->q_counter)
-		return;
-
-	mlx5_core_query_out_of_buffer(priv->mdev, priv->q_counter,
-				      &qcnt->rx_out_of_buffer);
-}
-
-void mlx5e_update_stats(struct mlx5e_priv *priv)
+static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 {
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_vport_stats *s = &priv->stats.vport;
+	struct mlx5e_sw_stats *s = &priv->stats.sw;
 	struct mlx5e_rq_stats *rq_stats;
 	struct mlx5e_sq_stats *sq_stats;
-	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
-	u32 *out;
-	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
-	u64 tx_offload_none;
+	u64 tx_offload_none = 0;
 	int i, j;
 
-	out = mlx5_vzalloc(outlen);
-	if (!out)
-		return;
-
-	/* Collect firts the SW counters and then HW for consistency */
-	s->rx_packets		= 0;
-	s->rx_bytes		= 0;
-	s->tx_packets		= 0;
-	s->tx_bytes		= 0;
-	s->tso_packets		= 0;
-	s->tso_bytes		= 0;
-	s->tso_inner_packets	= 0;
-	s->tso_inner_bytes	= 0;
-	s->tx_queue_stopped	= 0;
-	s->tx_queue_wake	= 0;
-	s->tx_queue_dropped	= 0;
-	s->tx_csum_inner	= 0;
-	tx_offload_none		= 0;
-	s->lro_packets		= 0;
-	s->lro_bytes		= 0;
-	s->rx_csum_none		= 0;
-	s->rx_csum_sw		= 0;
-	s->rx_wqe_err		= 0;
-	s->rx_mpwqe_filler	= 0;
-	s->rx_mpwqe_frag	= 0;
-	s->rx_buff_alloc_err	= 0;
+	memset(s, 0, sizeof(*s));
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -212,7 +131,19 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		}
 	}
 
-	/* HW counters */
+	/* Update calculated offload counters */
+	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
+	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
+			     s->rx_csum_sw;
+}
+
+static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+	u32 *out = (u32 *)priv->stats.vport.query_vport_out;
+	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
+	struct mlx5_core_dev *mdev = priv->mdev;
+
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
@@ -222,58 +153,56 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 
 	memset(out, 0, outlen);
 
-	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen))
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+	u32 *in;
+
+	in = mlx5_vzalloc(sz);
+	if (!in)
 		goto free_out;
 
-#define MLX5_GET_CTR(p, x) \
-	MLX5_GET64(query_vport_counter_out, p, x)
-
-	s->rx_error_packets     =
-		MLX5_GET_CTR(out, received_errors.packets);
-	s->rx_error_bytes       =
-		MLX5_GET_CTR(out, received_errors.octets);
-	s->tx_error_packets     =
-		MLX5_GET_CTR(out, transmit_errors.packets);
-	s->tx_error_bytes       =
-		MLX5_GET_CTR(out, transmit_errors.octets);
-
-	s->rx_unicast_packets   =
-		MLX5_GET_CTR(out, received_eth_unicast.packets);
-	s->rx_unicast_bytes     =
-		MLX5_GET_CTR(out, received_eth_unicast.octets);
-	s->tx_unicast_packets   =
-		MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
-	s->tx_unicast_bytes     =
-		MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
-
-	s->rx_multicast_packets =
-		MLX5_GET_CTR(out, received_eth_multicast.packets);
-	s->rx_multicast_bytes   =
-		MLX5_GET_CTR(out, received_eth_multicast.octets);
-	s->tx_multicast_packets =
-		MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
-	s->tx_multicast_bytes   =
-		MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
-
-	s->rx_broadcast_packets =
-		MLX5_GET_CTR(out, received_eth_broadcast.packets);
-	s->rx_broadcast_bytes   =
-		MLX5_GET_CTR(out, received_eth_broadcast.octets);
-	s->tx_broadcast_packets =
-		MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
-	s->tx_broadcast_bytes   =
-		MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
-	/* Update calculated offload counters */
-	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
-	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
-			       s->rx_csum_sw;
+	out = pstats->IEEE_802_3_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
-	mlx5e_update_pport_counters(priv);
-	mlx5e_update_q_counter(priv);
+	out = pstats->RFC_2863_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+
+	out = pstats->RFC_2819_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 free_out:
-	kvfree(out);
+	kvfree(in);
+}
+
+static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
+
+	if (!priv->q_counter)
+		return;
+
+	mlx5_core_query_out_of_buffer(priv->mdev, priv->q_counter,
+				      &qcnt->rx_out_of_buffer);
+}
+
+void mlx5e_update_stats(struct mlx5e_priv *priv)
+{
+	mlx5e_update_sw_counters(priv);
+	mlx5e_update_q_counter(priv);
+	mlx5e_update_vport_counters(priv);
+	mlx5e_update_pport_counters(priv);
 }
 
 static void mlx5e_update_stats_work(struct work_struct *work)
@@ -2073,37 +2002,28 @@ static struct rtnl_link_stats64 *
 mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_sw_stats *sstats = &priv->stats.sw;
 	struct mlx5e_vport_stats *vstats = &priv->stats.vport;
 	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
 
-	stats->rx_packets = vstats->rx_packets;
-	stats->rx_bytes   = vstats->rx_bytes;
-	stats->tx_packets = vstats->tx_packets;
-	stats->tx_bytes   = vstats->tx_bytes;
-
-#define PPCNT_GET_802_3_CTR(fld)                            \
-	(MLX5_GET64(eth_802_3_cntrs_grp_data_layout,        \
-			pstats->IEEE_802_3_counters, fld##_high))
-
-#define PPCNT_GET_2863_CTR(fld)                             \
-	(MLX5_GET64(eth_2863_cntrs_grp_data_layout,         \
-			pstats->RFC_2863_counters, fld##_high))
+	stats->rx_packets = sstats->rx_packets;
+	stats->rx_bytes   = sstats->rx_bytes;
+	stats->tx_packets = sstats->tx_packets;
+	stats->tx_bytes   = sstats->tx_bytes;
 
 	stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer;
-	stats->tx_dropped = vstats->tx_queue_dropped;
+	stats->tx_dropped = sstats->tx_queue_dropped;
 
 	stats->rx_length_errors =
-		PPCNT_GET_802_3_CTR(a_in_range_length_errors) +
-		PPCNT_GET_802_3_CTR(a_out_of_range_length_field) +
-		PPCNT_GET_802_3_CTR(a_frame_too_long_errors);
+		PPORT_802_3_GET(pstats, a_in_range_length_errors) +
+		PPORT_802_3_GET(pstats, a_out_of_range_length_field) +
+		PPORT_802_3_GET(pstats, a_frame_too_long_errors);
 	stats->rx_crc_errors =
-		PPCNT_GET_802_3_CTR(a_frame_check_sequence_errors);
-	stats->rx_frame_errors =
-		PPCNT_GET_802_3_CTR(a_alignment_errors);
-	stats->tx_aborted_errors =
-		PPCNT_GET_2863_CTR(if_out_discards);
+		PPORT_802_3_GET(pstats, a_frame_check_sequence_errors);
+	stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors);
+	stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards);
 	stats->tx_carrier_errors =
-		PPCNT_GET_802_3_CTR(a_symbol_error_during_carrier);
+		PPORT_802_3_GET(pstats, a_symbol_error_during_carrier);
 	stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors +
 			   stats->rx_frame_errors;
 	stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
@@ -2111,8 +2031,8 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	/* vport multicast also counts packets that are dropped due to steering
 	 * or rx out of buffer
 	 */
-	stats->multicast = vstats->rx_multicast_packets;
-
+	stats->multicast =
+		VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
 
 	return stats;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
new file mode 100644
index 000000000000..116320d8fc42
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_STATS_H__
+#define __MLX5_EN_STATS_H__
+
+#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \
+	(*(u64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \
+	be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \
+	(*(u32 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \
+	be64_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset))
+
+#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld)
+
+struct counter_desc {
+	char		name[ETH_GSTRING_LEN];
+	int		offset; /* Byte offset */
+};
+
+struct mlx5e_sw_stats {
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 rx_csum_good;
+	u64 rx_csum_none;
+	u64 rx_csum_sw;
+	u64 tx_csum_offload;
+	u64 tx_csum_inner;
+	u64 tx_queue_stopped;
+	u64 tx_queue_wake;
+	u64 tx_queue_dropped;
+	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler;
+	u64 rx_mpwqe_frag;
+	u64 rx_buff_alloc_err;
+};
+
+static const struct counter_desc sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, lro_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, lro_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_good) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_offload) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+};
+
+struct mlx5e_qcounter_stats {
+	u32 rx_out_of_buffer;
+};
+
+static const struct counter_desc q_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_out_of_buffer) },
+};
+
+#define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c)
+#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \
+						vstats->query_vport_out, c)
+
+struct mlx5e_vport_stats {
+	__be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)];
+};
+
+static const struct counter_desc vport_stats_desc[] = {
+	{ "rx_error_packets", VPORT_COUNTER_OFF(received_errors.packets) },
+	{ "rx_error_bytes", VPORT_COUNTER_OFF(received_errors.octets) },
+	{ "tx_error_packets", VPORT_COUNTER_OFF(transmit_errors.packets) },
+	{ "tx_error_bytes", VPORT_COUNTER_OFF(transmit_errors.octets) },
+	{ "rx_unicast_packets",
+		VPORT_COUNTER_OFF(received_eth_unicast.packets) },
+	{ "rx_unicast_bytes", VPORT_COUNTER_OFF(received_eth_unicast.octets) },
+	{ "tx_unicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.packets) },
+	{ "tx_unicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.octets) },
+	{ "rx_multicast_packets",
+		VPORT_COUNTER_OFF(received_eth_multicast.packets) },
+	{ "rx_multicast_bytes",
+		VPORT_COUNTER_OFF(received_eth_multicast.octets) },
+	{ "tx_multicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.packets) },
+	{ "tx_multicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.octets) },
+	{ "rx_broadcast_packets",
+		VPORT_COUNTER_OFF(received_eth_broadcast.packets) },
+	{ "rx_broadcast_bytes",
+		VPORT_COUNTER_OFF(received_eth_broadcast.octets) },
+	{ "tx_broadcast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.packets) },
+	{ "tx_broadcast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.octets) },
+};
+
+#define PPORT_802_3_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+#define PPORT_802_3_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \
+		   counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+#define PPORT_2863_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+#define PPORT_2863_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \
+		   counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+#define PPORT_2819_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+#define PPORT_2819_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \
+		   counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+
+struct mlx5e_pport_stats {
+	__be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+};
+
+static const struct counter_desc pport_802_3_stats_desc[] = {
+	{ "frames_tx", PPORT_802_3_OFF(a_frames_transmitted_ok) },
+	{ "frames_rx", PPORT_802_3_OFF(a_frames_received_ok) },
+	{ "check_seq_err", PPORT_802_3_OFF(a_frame_check_sequence_errors) },
+	{ "alignment_err", PPORT_802_3_OFF(a_alignment_errors) },
+	{ "octets_tx", PPORT_802_3_OFF(a_octets_transmitted_ok) },
+	{ "octets_received", PPORT_802_3_OFF(a_octets_received_ok) },
+	{ "multicast_xmitted", PPORT_802_3_OFF(a_multicast_frames_xmitted_ok) },
+	{ "broadcast_xmitted", PPORT_802_3_OFF(a_broadcast_frames_xmitted_ok) },
+	{ "multicast_rx", PPORT_802_3_OFF(a_multicast_frames_received_ok) },
+	{ "broadcast_rx", PPORT_802_3_OFF(a_broadcast_frames_received_ok) },
+	{ "in_range_len_errors", PPORT_802_3_OFF(a_in_range_length_errors) },
+	{ "out_of_range_len", PPORT_802_3_OFF(a_out_of_range_length_field) },
+	{ "too_long_errors", PPORT_802_3_OFF(a_frame_too_long_errors) },
+	{ "symbol_err", PPORT_802_3_OFF(a_symbol_error_during_carrier) },
+	{ "mac_control_tx", PPORT_802_3_OFF(a_mac_control_frames_transmitted) },
+	{ "mac_control_rx", PPORT_802_3_OFF(a_mac_control_frames_received) },
+	{ "unsupported_op_rx",
+		PPORT_802_3_OFF(a_unsupported_opcodes_received) },
+	{ "pause_ctrl_rx", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_received) },
+	{ "pause_ctrl_tx",
+		PPORT_802_3_OFF(a_pause_mac_ctrl_frames_transmitted) },
+};
+
+static const struct counter_desc pport_2863_stats_desc[] = {
+	{ "in_octets", PPORT_2863_OFF(if_in_octets) },
+	{ "in_ucast_pkts", PPORT_2863_OFF(if_in_ucast_pkts) },
+	{ "in_discards", PPORT_2863_OFF(if_in_discards) },
+	{ "in_errors", PPORT_2863_OFF(if_in_errors) },
+	{ "in_unknown_protos", PPORT_2863_OFF(if_in_unknown_protos) },
+	{ "out_octets", PPORT_2863_OFF(if_out_octets) },
+	{ "out_ucast_pkts", PPORT_2863_OFF(if_out_ucast_pkts) },
+	{ "out_discards", PPORT_2863_OFF(if_out_discards) },
+	{ "out_errors", PPORT_2863_OFF(if_out_errors) },
+	{ "in_multicast_pkts", PPORT_2863_OFF(if_in_multicast_pkts) },
+	{ "in_broadcast_pkts", PPORT_2863_OFF(if_in_broadcast_pkts) },
+	{ "out_multicast_pkts", PPORT_2863_OFF(if_out_multicast_pkts) },
+	{ "out_broadcast_pkts", PPORT_2863_OFF(if_out_broadcast_pkts) },
+};
+
+static const struct counter_desc pport_2819_stats_desc[] = {
+	{ "drop_events", PPORT_2819_OFF(ether_stats_drop_events) },
+	{ "octets", PPORT_2819_OFF(ether_stats_octets) },
+	{ "pkts", PPORT_2819_OFF(ether_stats_pkts) },
+	{ "broadcast_pkts", PPORT_2819_OFF(ether_stats_broadcast_pkts) },
+	{ "multicast_pkts", PPORT_2819_OFF(ether_stats_multicast_pkts) },
+	{ "crc_align_errors", PPORT_2819_OFF(ether_stats_crc_align_errors) },
+	{ "undersize_pkts", PPORT_2819_OFF(ether_stats_undersize_pkts) },
+	{ "oversize_pkts", PPORT_2819_OFF(ether_stats_oversize_pkts) },
+	{ "fragments", PPORT_2819_OFF(ether_stats_fragments) },
+	{ "jabbers", PPORT_2819_OFF(ether_stats_jabbers) },
+	{ "collisions", PPORT_2819_OFF(ether_stats_collisions) },
+	{ "p64octets", PPORT_2819_OFF(ether_stats_pkts64octets) },
+	{ "p65to127octets", PPORT_2819_OFF(ether_stats_pkts65to127octets) },
+	{ "p128to255octets", PPORT_2819_OFF(ether_stats_pkts128to255octets) },
+	{ "p256to511octets", PPORT_2819_OFF(ether_stats_pkts256to511octets) },
+	{ "p512to1023octets", PPORT_2819_OFF(ether_stats_pkts512to1023octets) },
+	{ "p1024to1518octets",
+		PPORT_2819_OFF(ether_stats_pkts1024to1518octets) },
+	{ "p1519to2047octets",
+		PPORT_2819_OFF(ether_stats_pkts1519to2047octets) },
+	{ "p2048to4095octets",
+		PPORT_2819_OFF(ether_stats_pkts2048to4095octets) },
+	{ "p4096to8191octets",
+		PPORT_2819_OFF(ether_stats_pkts4096to8191octets) },
+	{ "p8192to10239octets",
+		PPORT_2819_OFF(ether_stats_pkts8192to10239octets) },
+};
+
+struct mlx5e_rq_stats {
+	u64 packets;
+	u64 bytes;
+	u64 csum_none;
+	u64 csum_sw;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 wqe_err;
+	u64 mpwqe_filler;
+	u64 mpwqe_frag;
+	u64 buff_alloc_err;
+};
+
+static const struct counter_desc rq_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, wqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_filler) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_frag) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+};
+
+struct mlx5e_sq_stats {
+	/* commonly accessed in data path */
+	u64 packets;
+	u64 bytes;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 csum_offload_inner;
+	u64 nop;
+	/* less likely accessed in data path */
+	u64 csum_offload_none;
+	u64 stopped;
+	u64 wake;
+	u64 dropped;
+};
+
+static const struct counter_desc sq_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, csum_offload_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, nop) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, csum_offload_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, stopped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, dropped) },
+};
+
+#define NUM_SW_COUNTERS			ARRAY_SIZE(sw_stats_desc)
+#define NUM_Q_COUNTERS			ARRAY_SIZE(q_stats_desc)
+#define NUM_VPORT_COUNTERS		ARRAY_SIZE(vport_stats_desc)
+#define NUM_PPORT_802_3_COUNTERS	ARRAY_SIZE(pport_802_3_stats_desc)
+#define NUM_PPORT_2863_COUNTERS		ARRAY_SIZE(pport_2863_stats_desc)
+#define NUM_PPORT_2819_COUNTERS		ARRAY_SIZE(pport_2819_stats_desc)
+#define NUM_PPORT_COUNTERS		(NUM_PPORT_802_3_COUNTERS + \
+					 NUM_PPORT_2863_COUNTERS  + \
+					 NUM_PPORT_2819_COUNTERS)
+#define NUM_RQ_STATS			ARRAY_SIZE(rq_stats_desc)
+#define NUM_SQ_STATS			ARRAY_SIZE(sq_stats_desc)
+
+struct mlx5e_stats {
+	struct mlx5e_sw_stats sw;
+	struct mlx5e_qcounter_stats qcnt;
+	struct mlx5e_vport_stats vport;
+	struct mlx5e_pport_stats pport;
+};
+
+#endif /* __MLX5_EN_STATS_H__ */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 03f8d719b680..8be44ca777ed 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -59,6 +59,7 @@
 #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
 #define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64)
 #define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
 #define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
-- 
cgit v1.2.3


From 121fcdc84d8240d4dfe1f737befd5814b12623ee Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:50 +0300
Subject: net/mlx5e: Add link down events counter

Expose link_down_events counter through ethtool -S.
This counter is read from PPort statistics, then proccessed and stored as
a special handling software counter.
This counter is stored along software counters since it is the only PPort
counter that it's size is not 64 bits.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 10 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  5 +++++
 include/linux/mlx5/device.h                        |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ef66ba65f5cd..61e261c3d247 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -135,6 +135,10 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
 	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
 			     s->rx_csum_sw;
+
+	s->link_down_events = MLX5_GET(ppcnt_reg,
+				priv->stats.pport.phy_counters,
+				counter_set.phys_layer_cntrs.link_down_events);
 }
 
 static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
@@ -183,6 +187,10 @@ static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
+	out = pstats->phy_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
 		out = pstats->per_prio_counters[prio];
@@ -208,10 +216,10 @@ static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
 
 void mlx5e_update_stats(struct mlx5e_priv *priv)
 {
-	mlx5e_update_sw_counters(priv);
 	mlx5e_update_q_counter(priv);
 	mlx5e_update_vport_counters(priv);
 	mlx5e_update_pport_counters(priv);
+	mlx5e_update_sw_counters(priv);
 }
 
 static void mlx5e_update_stats_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index de27eeaa3bd2..7cd8cb44b2ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -71,6 +71,9 @@ struct mlx5e_sw_stats {
 	u64 rx_mpwqe_filler;
 	u64 rx_mpwqe_frag;
 	u64 rx_buff_alloc_err;
+
+	/* Special handling counters */
+	u64 link_down_events;
 };
 
 static const struct counter_desc sw_stats_desc[] = {
@@ -96,6 +99,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events) },
 };
 
 struct mlx5e_qcounter_stats {
@@ -178,6 +182,7 @@ struct mlx5e_pport_stats {
 	__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 };
 
 static const struct counter_desc pport_802_3_stats_desc[] = {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8be44ca777ed..942bccacd7b7 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1369,6 +1369,7 @@ enum {
 	MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5,
 	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
 	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11,
+	MLX5_PHYSICAL_LAYER_COUNTERS_GROUP    = 0x12,
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
-- 
cgit v1.2.3


From 94cb1ebbafd509210887eea6ced55c40da7b4baa Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:52 +0300
Subject: net/mlx5e: Add support for RXALL netdev feature

Introduce new access register named Ports Check Mask Register (PCMR) to
control all HW checks on port. With this register, the driver can
enable/disable Hardware FCS validation.

When RXALL is enabled/disabled using ndo_set_features, enable/disable
fcs check at HW.
User can change HW configuration using rx-all flag at ethtool.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 20 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c    | 49 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                       |  1 +
 include/linux/mlx5/port.h                         |  4 ++
 4 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d82bc6b697f9..ad0cb4aa593b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2139,6 +2139,14 @@ static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
 	return 0;
 }
 
+static int set_feature_rx_all(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_set_port_fcs(mdev, !enable);
+}
+
 static int mlx5e_handle_feature(struct net_device *netdev,
 				netdev_features_t wanted_features,
 				netdev_features_t feature,
@@ -2174,6 +2182,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 				    set_feature_vlan_filter);
 	err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_TC,
 				    set_feature_tc_num_filters);
+	err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL,
+				    set_feature_rx_all);
 
 	return err ? -EINVAL : 0;
 }
@@ -2564,6 +2574,8 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
+	bool fcs_supported;
+	bool fcs_enabled;
 
 	SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
 
@@ -2607,10 +2619,18 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 		netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
 	}
 
+	mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled);
+
+	if (fcs_supported)
+		netdev->hw_features |= NETIF_F_RXALL;
+
 	netdev->features          = netdev->hw_features;
 	if (!priv->params.lro_en)
 		netdev->features  &= ~NETIF_F_LRO;
 
+	if (fcs_enabled)
+		netdev->features  &= ~NETIF_F_RXALL;
+
 #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f)
 	if (FT_CAP(flow_modify_en) &&
 	    FT_CAP(modify_root) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index ae378c575deb..c37740f30fbe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -607,3 +607,52 @@ int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode)
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_wol);
+
+static int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out,
+				  int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    outlen, MLX5_REG_PCMR, 0, 0);
+}
+
+static int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	return mlx5_core_access_reg(mdev, in, inlen, out,
+				    sizeof(out), MLX5_REG_PCMR, 0, 1);
+}
+
+int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+	MLX5_SET(pcmr_reg, in, fcs_chk, enable);
+
+	return mlx5_set_ports_check(mdev, in, sizeof(in));
+}
+
+void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
+			 bool *enabled)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+	/* Default values for FW which do not support MLX5_REG_PCMR */
+	*supported = false;
+	*enabled = true;
+
+	if (!MLX5_CAP_GEN(mdev, ports_check))
+		return;
+
+	if (mlx5_query_ports_check(mdev, out, sizeof(out)))
+		return;
+
+	*supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap));
+	*enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk));
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index dcd5ac8d3b14..497a4dbd91b0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -112,6 +112,7 @@ enum {
 	MLX5_REG_PMPE		 = 0x5010,
 	MLX5_REG_PELC		 = 0x500e,
 	MLX5_REG_PVLC		 = 0x500f,
+	MLX5_REG_PCMR		 = 0x5041,
 	MLX5_REG_PMLP		 = 0, /* TBD */
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index a1d145abd4eb..577e953d0aa7 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -84,4 +84,8 @@ int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode);
 int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode);
 
+int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable);
+void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
+			 bool *enabled);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From da54d24ec3ef736de04c61a01653776a9750334f Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:53 +0300
Subject: net/mlx5e: Add ethtool support for interface identify (LED blinking)

Add the needed hardware command and mlx5_ifc structs for managing LED
control.
Add set_phys_id ethtool callback to support ethtool -p flag.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 25 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 13 +++++++++++
 include/linux/mlx5/driver.h                        |  1 +
 include/linux/mlx5/port.h                          |  6 ++++++
 4 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 522d584bc05f..a2c444ec191b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1135,6 +1135,30 @@ static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 	return mlx5_set_port_wol(mdev, mlx5_wol_mode);
 }
 
+static int mlx5e_set_phys_id(struct net_device *dev,
+			     enum ethtool_phys_id_state state)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 beacon_duration;
+
+	if (!MLX5_CAP_GEN(mdev, beacon_led))
+		return -EOPNOTSUPP;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_INF;
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_OFF;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return mlx5_set_port_beacon(mdev, beacon_duration);
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -1159,6 +1183,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_pauseparam    = mlx5e_get_pauseparam,
 	.set_pauseparam    = mlx5e_set_pauseparam,
 	.get_ts_info       = mlx5e_get_ts_info,
+	.set_phys_id       = mlx5e_set_phys_id,
 	.get_wol	   = mlx5e_get_wol,
 	.set_wol	   = mlx5e_set_wol,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index c37740f30fbe..446549f63b5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -115,6 +115,19 @@ int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_ptys);
 
+int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration)
+{
+	u32 out[MLX5_ST_SZ_DW(mlcr_reg)];
+	u32 in[MLX5_ST_SZ_DW(mlcr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(mlcr_reg, in, local_port, 1);
+	MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MLCR, 0, 1);
+}
+
 int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 			      u32 *proto_cap, int proto_mask)
 {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 497a4dbd91b0..2e8758d1b19e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -116,6 +116,7 @@ enum {
 	MLX5_REG_PMLP		 = 0, /* TBD */
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MLCR		 = 0x902b,
 };
 
 enum {
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 577e953d0aa7..a364ab1737a0 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -35,6 +35,11 @@
 
 #include <linux/mlx5/driver.h>
 
+enum mlx5_beacon_duration {
+	MLX5_BEACON_DURATION_OFF = 0x0,
+	MLX5_BEACON_DURATION_INF = 0xffff,
+};
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
@@ -53,6 +58,7 @@ int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
 			       enum mlx5_port_status status);
 int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
 				 enum mlx5_port_status *status);
+int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration);
 
 int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
 void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-- 
cgit v1.2.3


From bb64143eee8c036a89b31daa4e9bf8360a8bded1 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:54 +0300
Subject: net/mlx5e: Add ethtool support for dump module EEPROM

Add query MCIA, PMLP registers infrastructure and commands.
Add ethtool support for get_module_info() and get_module_eeprom()
callbacks.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 80 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 76 ++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  3 +-
 include/linux/mlx5/port.h                          | 15 ++++
 4 files changed, 173 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index a2c444ec191b..0518c8658507 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1159,6 +1159,84 @@ static int mlx5e_set_phys_id(struct net_device *dev,
 	return mlx5_set_port_beacon(mdev, beacon_duration);
 }
 
+static int mlx5e_get_module_info(struct net_device *netdev,
+				 struct ethtool_modinfo *modinfo)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *dev = priv->mdev;
+	int size_read = 0;
+	u8 data[4];
+
+	size_read = mlx5_query_module_eeprom(dev, 0, 2, data);
+	if (size_read < 2)
+		return -EIO;
+
+	/* data[0] = identifier byte */
+	switch (data[0]) {
+	case MLX5_MODULE_ID_QSFP:
+		modinfo->type       = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLX5_MODULE_ID_QSFP_PLUS:
+	case MLX5_MODULE_ID_QSFP28:
+		/* data[1] = revision id */
+		if (data[0] == MLX5_MODULE_ID_QSFP28 || data[1] >= 0x3) {
+			modinfo->type       = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type       = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLX5_MODULE_ID_SFP:
+		modinfo->type       = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		netdev_err(priv->netdev, "%s: cable type not recognized:0x%x\n",
+			   __func__, data[0]);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5e_get_module_eeprom(struct net_device *netdev,
+				   struct ethtool_eeprom *ee,
+				   u8 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int offset = ee->offset;
+	int size_read;
+	int i = 0;
+
+	if (!ee->len)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i,
+						     data + i);
+
+		if (!size_read)
+			/* Done reading */
+			return 0;
+
+		if (size_read < 0) {
+			netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n",
+				   __func__, size_read);
+			return 0;
+		}
+
+		i += size_read;
+		offset += size_read;
+	}
+
+	return 0;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -1186,4 +1264,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.set_phys_id       = mlx5e_set_phys_id,
 	.get_wol	   = mlx5e_get_wol,
 	.set_wol	   = mlx5e_set_wol,
+	.get_module_info   = mlx5e_get_module_info,
+	.get_module_eeprom = mlx5e_get_module_eeprom,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 446549f63b5c..4cb2a44510fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -310,6 +310,82 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
 
+static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num)
+{
+	u32 out[MLX5_ST_SZ_DW(pmlp_reg)];
+	u32 in[MLX5_ST_SZ_DW(pmlp_reg)];
+	int module_mapping;
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(pmlp_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_PMLP, 0, 0);
+	if (err)
+		return err;
+
+	module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping);
+	*module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK;
+
+	return 0;
+}
+
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data)
+{
+	u32 out[MLX5_ST_SZ_DW(mcia_reg)];
+	u32 in[MLX5_ST_SZ_DW(mcia_reg)];
+	int module_num;
+	u16 i2c_addr;
+	int status;
+	int err;
+	void *ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
+
+	err = mlx5_query_module_num(dev, &module_num);
+	if (err)
+		return err;
+
+	memset(in, 0, sizeof(in));
+	size = min_t(int, size, MLX5_EEPROM_MAX_BYTES);
+
+	if (offset < MLX5_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLX5_EEPROM_PAGE_LENGTH)
+		/* Cross pages read, read until offset 256 in low page */
+		size -= offset + size - MLX5_EEPROM_PAGE_LENGTH;
+
+	i2c_addr = MLX5_I2C_ADDR_LOW;
+	if (offset >= MLX5_EEPROM_PAGE_LENGTH) {
+		i2c_addr = MLX5_I2C_ADDR_HIGH;
+		offset -= MLX5_EEPROM_PAGE_LENGTH;
+	}
+
+	MLX5_SET(mcia_reg, in, l, 0);
+	MLX5_SET(mcia_reg, in, module, module_num);
+	MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr);
+	MLX5_SET(mcia_reg, in, page_number, 0);
+	MLX5_SET(mcia_reg, in, device_address, offset);
+	MLX5_SET(mcia_reg, in, size, size);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCIA, 0, 0);
+	if (err)
+		return err;
+
+	status = MLX5_GET(mcia_reg, out, status);
+	if (status) {
+		mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
+			      status);
+		return -EIO;
+	}
+
+	memcpy(data, ptr, size);
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom);
+
 static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
 				int pvlc_size,  u8 local_port)
 {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2e8758d1b19e..1a170672c656 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -113,9 +113,10 @@ enum {
 	MLX5_REG_PELC		 = 0x500e,
 	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PCMR		 = 0x5041,
-	MLX5_REG_PMLP		 = 0, /* TBD */
+	MLX5_REG_PMLP		 = 0x5002,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
 };
 
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index a364ab1737a0..7391eb833253 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -40,6 +40,19 @@ enum mlx5_beacon_duration {
 	MLX5_BEACON_DURATION_INF = 0xffff,
 };
 
+enum mlx5_module_id {
+	MLX5_MODULE_ID_SFP              = 0x3,
+	MLX5_MODULE_ID_QSFP             = 0xC,
+	MLX5_MODULE_ID_QSFP_PLUS        = 0xD,
+	MLX5_MODULE_ID_QSFP28           = 0x11,
+};
+
+#define MLX5_EEPROM_MAX_BYTES			32
+#define MLX5_EEPROM_IDENTIFIER_BYTE_MASK	0x000000ff
+#define MLX5_I2C_ADDR_LOW		0x50
+#define MLX5_I2C_ADDR_HIGH		0x51
+#define MLX5_EEPROM_PAGE_LENGTH		256
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
@@ -93,5 +106,7 @@ int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode);
 int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable);
 void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 			 bool *enabled);
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data);
 
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 363501145e3faa650193722fe7047b767ed87172 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:55 +0300
Subject: net/mlx5e: Add ethtool support for rxvlan-offload (vlan stripping)

Use ethtool -K <interface> rxvlan <on/off> to enable/disable
C-TAG vlan stripping by hardware.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 74 ++++++++++++++++++++++-
 include/linux/mlx5/driver.h                       |  4 ++
 3 files changed, 78 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e903eff2574f..8abc289ac1fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -166,6 +166,7 @@ struct mlx5e_params {
 	u8  rss_hfunc;
 	u8  toeplitz_hash_key[40];
 	u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE];
+	bool vlan_strip_disable;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	struct ieee_ets ets;
 #endif
@@ -575,6 +576,8 @@ int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto,
 void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
 void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
 
+int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd);
+
 int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix);
 void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ad0cb4aa593b..6c9c10c131a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -388,6 +388,7 @@ static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 	MLX5_SET(rqc,  rqc, cqn,		rq->cq.mcq.cqn);
 	MLX5_SET(rqc,  rqc, state,		MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc,  rqc, flush_in_error_en,	1);
+	MLX5_SET(rqc,  rqc, vsd, priv->params.vlan_strip_disable);
 	MLX5_SET(wq,   wq,  log_wq_pg_sz,	rq->wq_ctrl.buf.page_shift -
 						MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
@@ -402,7 +403,8 @@ static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 	return err;
 }
 
-static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
+static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state,
+				 int next_state)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
@@ -430,6 +432,36 @@ static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 	return err;
 }
 
+static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY);
+	MLX5_SET64(modify_rq_in, in, modify_bitmask, MLX5_RQ_BITMASK_VSD);
+	MLX5_SET(rqc, rqc, vsd, vsd);
+	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY);
+
+	err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 	mlx5_core_destroy_rq(rq->priv->mdev, rq->rqn);
@@ -468,7 +500,7 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 	if (err)
 		goto err_destroy_rq;
 
-	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err)
 		goto err_disable_rq;
 
@@ -493,7 +525,7 @@ static void mlx5e_close_rq(struct mlx5e_rq *rq)
 	clear_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state);
 	napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */
 
-	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
+	mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	while (!mlx5_wq_ll_is_empty(&rq->wq))
 		msleep(20);
 
@@ -1963,6 +1995,23 @@ static void mlx5e_destroy_tirs(struct mlx5e_priv *priv)
 		mlx5e_destroy_tir(priv, i);
 }
 
+int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd)
+{
+	int err = 0;
+	int i;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	for (i = 0; i < priv->params.num_channels; i++) {
+		err = mlx5e_modify_rq_vsd(&priv->channel[i]->rq, vsd);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int mlx5e_setup_tc(struct net_device *netdev, u8 tc)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -2147,6 +2196,23 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable)
 	return mlx5_set_port_fcs(mdev, !enable);
 }
 
+static int set_feature_rx_vlan(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->params.vlan_strip_disable = !enable;
+	err = mlx5e_modify_rqs_vsd(priv, !enable);
+	if (err)
+		priv->params.vlan_strip_disable = enable;
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
 static int mlx5e_handle_feature(struct net_device *netdev,
 				netdev_features_t wanted_features,
 				netdev_features_t feature,
@@ -2184,6 +2250,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 				    set_feature_tc_num_filters);
 	err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL,
 				    set_feature_rx_all);
+	err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_VLAN_CTAG_RX,
+				    set_feature_rx_vlan);
 
 	return err ? -EINVAL : 0;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1a170672c656..2cc5e9fd5913 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -45,6 +45,10 @@
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
 
+enum {
+	MLX5_RQ_BITMASK_VSD = 1 << 1,
+};
+
 enum {
 	MLX5_BOARD_ID_LEN = 64,
 	MLX5_MAX_NAME_LEN = 16,
-- 
cgit v1.2.3


From 1b223dd391622fde05e03829d813c3c6cc998685 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:56 +0300
Subject: net/mlx5e: Fix checksum handling for non-stripped vlan packets

Now as rx-vlan offload can be disabled, packets can be received
with vlan tag not stripped, which means is_first_ethertype_ip will
return false, for that we need to check if the hardware reported
csum OK so we will report CHECKSUM_UNNECESSARY for those packets.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 20 +++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  8 ++++++--
 include/linux/mlx5/device.h                        | 21 ++++++++++++++++-----
 4 files changed, 38 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6c9c10c131a0..5bad17d37d7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -109,6 +109,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 		s->lro_bytes	+= rq_stats->lro_bytes;
 		s->rx_csum_none	+= rq_stats->csum_none;
 		s->rx_csum_sw	+= rq_stats->csum_sw;
+		s->rx_csum_inner += rq_stats->csum_inner;
 		s->rx_wqe_err   += rq_stats->wqe_err;
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 918b7c7fd74f..23adfe2fcba9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -543,16 +543,26 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 
 	if (lro) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	} else if (likely(is_first_ethertype_ip(skb))) {
+		return;
+	}
+
+	if (is_first_ethertype_ip(skb)) {
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
 		rq->stats.csum_sw++;
-	} else {
-		goto csum_none;
+		return;
 	}
 
-	return;
-
+	if (likely((cqe->hds_ip_ext & CQE_L3_OK) &&
+		   (cqe->hds_ip_ext & CQE_L4_OK))) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (cqe_is_tunneled(cqe)) {
+			skb->csum_level = 1;
+			skb->encapsulation = 1;
+			rq->stats.csum_inner++;
+		}
+		return;
+	}
 csum_none:
 	skb->ip_summed = CHECKSUM_NONE;
 	rq->stats.csum_none++;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 7cd8cb44b2ab..115752b53d85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -62,6 +62,7 @@ struct mlx5e_sw_stats {
 	u64 rx_csum_good;
 	u64 rx_csum_none;
 	u64 rx_csum_sw;
+	u64 rx_csum_inner;
 	u64 tx_csum_offload;
 	u64 tx_csum_inner;
 	u64 tx_queue_stopped;
@@ -90,6 +91,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_good) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_offload) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
@@ -272,8 +274,9 @@ static const struct counter_desc pport_per_prio_pfc_stats_desc[] = {
 struct mlx5e_rq_stats {
 	u64 packets;
 	u64 bytes;
-	u64 csum_none;
 	u64 csum_sw;
+	u64 csum_inner;
+	u64 csum_none;
 	u64 lro_packets;
 	u64 lro_bytes;
 	u64 wqe_err;
@@ -285,8 +288,9 @@ struct mlx5e_rq_stats {
 static const struct counter_desc rq_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, bytes) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_bytes) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, wqe_err) },
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 942bccacd7b7..6bd429b53b77 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -645,8 +645,9 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8              rsvd0[2];
-	__be16          wqe_id;
+	u8		outer_l3_tunneled;
+	u8		rsvd0;
+	__be16		wqe_id;
 	u8		lro_tcppsh_abort_dupack;
 	u8		lro_min_ttl;
 	__be16		lro_tcp_win;
@@ -659,7 +660,7 @@ struct mlx5_cqe64 {
 	__be16		slid;
 	__be32		flags_rqpn;
 	u8		hds_ip_ext;
-	u8		l4_hdr_type_etc;
+	u8		l4_l3_hdr_type;
 	__be16		vlan_info;
 	__be32		srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */
 	__be32		imm_inval_pkey;
@@ -680,12 +681,22 @@ static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 
 static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe)
 {
-	return (cqe->l4_hdr_type_etc >> 4) & 0x7;
+	return (cqe->l4_l3_hdr_type >> 4) & 0x7;
+}
+
+static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->l4_l3_hdr_type >> 2) & 0x3;
+}
+
+static inline u8 cqe_is_tunneled(struct mlx5_cqe64 *cqe)
+{
+	return cqe->outer_l3_tunneled & 0x1;
 }
 
 static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe)
 {
-	return !!(cqe->l4_hdr_type_etc & 0x1);
+	return !!(cqe->l4_l3_hdr_type & 0x1);
 }
 
 static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
-- 
cgit v1.2.3


From 501e7ef569f4ea2dc7e50773cf6a5d757c94f9b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Apr 2016 15:30:07 -0700
Subject: net-rfs: fix false sharing accessing sd->input_queue_head

sd->input_queue_head is incremented for each processed packet
in process_backlog(), and read from other cpus performing
Out Of Order avoidance in get_rps_cpu()

Moving this field in a separate cache line keeps it mostly
hot for the cpu in process_backlog(), as other cpus will
only read it.

In a stress test, process_backlog() was consuming 6.80 % of cpu cycles,
and the patch reduced the cost to 0.65 %

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 18d8394f2e5d..934ca866562d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2747,11 +2747,15 @@ struct softnet_data {
 	struct sk_buff		*completion_queue;
 
 #ifdef CONFIG_RPS
-	/* Elements below can be accessed between CPUs for RPS */
+	/* input_queue_head should be written by cpu owning this struct,
+	 * and only read by other cpus. Worth using a cache line.
+	 */
+	unsigned int		input_queue_head ____cacheline_aligned_in_smp;
+
+	/* Elements below can be accessed between CPUs for RPS/RFS */
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
-	unsigned int		input_queue_head;
 	unsigned int		input_queue_tail;
 #endif
 	unsigned int		dropped;
-- 
cgit v1.2.3


From 0a2cf20c3fb62ad4717276b5303bf831f7b29d54 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Wed, 27 Apr 2016 23:39:01 -0400
Subject: tcp: remove SKBTX_ACK_TSTAMP since it is redundant

The SKBTX_ACK_TSTAMP flag is set in skb_shinfo->tx_flags when
the timestamp of the TCP acknowledgement should be reported on
error queue. Since accessing skb_shinfo is likely to incur a
cache-line miss at the time of receiving the ack, the
txstamp_ack bit was added in tcp_skb_cb, which is set iff
the SKBTX_ACK_TSTAMP flag is set for an skb. This makes
SKBTX_ACK_TSTAMP flag redundant.

Remove the SKBTX_ACK_TSTAMP and instead use the txstamp_ack bit
everywhere.

Note that this frees one bit in shinfo->tx_flags.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Suggested-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  6 +-----
 net/ipv4/tcp.c         |  5 +++--
 net/ipv4/tcp_input.c   |  3 +--
 net/ipv4/tcp_output.c  | 17 +++++++++++------
 net/socket.c           |  3 ---
 5 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a1ce63979ad8..c84a5a1078c5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -382,14 +382,10 @@ enum {
 
 	/* generate software time stamp when entering packet scheduling */
 	SKBTX_SCHED_TSTAMP = 1 << 6,
-
-	/* generate software timestamp on peer data acknowledgment */
-	SKBTX_ACK_TSTAMP = 1 << 7,
 };
 
 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
-				 SKBTX_SCHED_TSTAMP | \
-				 SKBTX_ACK_TSTAMP)
+				 SKBTX_SCHED_TSTAMP)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53890a730ff4..91993782a947 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -435,9 +435,10 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
-		if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+			tcb->txstamp_ack = 1;
+		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
 			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
-		tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d5239c283cb..70c370b93762 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3087,8 +3087,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 		return;
 
 	shinfo = skb_shinfo(skb);
-	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-	    !before(shinfo->tskey, prior_snd_una) &&
+	if (!before(shinfo->tskey, prior_snd_una) &&
 	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
 		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1b2045ac3a9..b3a31b4df57c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1111,11 +1111,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->txstamp_ack ||
+		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
 static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	if (unlikely(tcp_has_tx_tstamp(skb)) &&
 	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
 		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
 		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -2446,13 +2452,12 @@ u32 __tcp_select_window(struct sock *sk)
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb)
 {
-	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
-	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
-	if (unlikely(tsflags)) {
+	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+		const struct skb_shared_info *next_shinfo =
+			skb_shinfo(next_skb);
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		shinfo->tx_flags |= tsflags;
+		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
 		shinfo->tskey = next_shinfo->tskey;
 		TCP_SKB_CB(skb)->txstamp_ack |=
 			TCP_SKB_CB(next_skb)->txstamp_ack;
diff --git a/net/socket.c b/net/socket.c
index 5dbb0bbe12a7..7789d79609dd 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -600,9 +600,6 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
 	if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
 		flags |= SKBTX_SCHED_TSTAMP;
 
-	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
-		flags |= SKBTX_ACK_TSTAMP;
-
 	*tx_flags = flags;
 }
 EXPORT_SYMBOL(__sock_tx_timestamp);
-- 
cgit v1.2.3


From 92b4423e3a0bc5d43ecde4bcad871f8b5ba04efd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 29 Apr 2016 10:39:34 +0200
Subject: netfilter: fix IS_ERR_VALUE usage

This is a forward-port of the original patch from Andrzej Hajda,
he said:

"IS_ERR_VALUE should be used only with unsigned long type.
Otherwise it can work incorrectly. To achieve this function
xt_percpu_counter_alloc is modified to return unsigned long,
and its result is assigned to temporary variable to perform
error checking, before assigning to .pcnt field.

The patch follows conclusion from discussion on LKML [1][2].

[1]: http://permalink.gmane.org/gmane.linux.kernel/2120927
[2]: http://permalink.gmane.org/gmane.linux.kernel/2150581"

Original patch from Andrzej is here:

http://patchwork.ozlabs.org/patch/582970/

This patch has clashed with input validation fixes for x_tables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 6 +++---
 net/ipv4/netfilter/arp_tables.c    | 6 ++++--
 net/ipv4/netfilter/ip_tables.c     | 6 ++++--
 net/ipv6/netfilter/ip6_tables.c    | 6 ++++--
 4 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 4dd9306c9d56..dc4f58a3cdcc 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -380,16 +380,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
  * allows us to return 0 for single core systems without forcing
  * callers to deal with SMP vs. NONSMP issues.
  */
-static inline u64 xt_percpu_counter_alloc(void)
+static inline unsigned long xt_percpu_counter_alloc(void)
 {
 	if (nr_cpu_ids > 1) {
 		void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
 						    sizeof(struct xt_counters));
 
 		if (res == NULL)
-			return (u64) -ENOMEM;
+			return -ENOMEM;
 
-		return (u64) (__force unsigned long) res;
+		return (__force unsigned long) res;
 	}
 
 	return 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 60f5161abcb4..3355ed72051d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -513,11 +513,13 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
+	unsigned long pcnt;
 	int ret;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 735d1ee8c1ab..21ccc19e1e6f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -656,10 +656,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
+	unsigned long pcnt;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73e606c719ef..17874e83a950 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -669,10 +669,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
+	unsigned long pcnt;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
-- 
cgit v1.2.3


From f4b05d27ec6b032ca504591e2a157b058b6f172f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 28 Apr 2016 17:59:28 +0200
Subject: net: constify is_skb_forwardable's arguments

is_skb_forwardable is not supposed to change anything so constify its
arguments

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 net/core/dev.c            | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 934ca866562d..52914a854386 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3264,7 +3264,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb);
+bool is_skb_forwardable(const struct net_device *dev,
+			const struct sk_buff *skb);
 
 extern int		netdev_budget;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c2f3d5dbde56..d91dfbec0fc6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
 			__net_timestamp(SKB);		\
 	}						\
 
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 {
 	unsigned int len;
 
-- 
cgit v1.2.3


From d745098cedb3f5c6a554796d4a3a505abd4ebaa6 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:33 +0300
Subject: net/mlx5: Introduce modify flow rule destination

This API is used for modifying the flow rule destination.
This is needed for modifying the pointed flow table by the
traffic type classifier rules to point on the aRFS tables.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++--
 include/linux/mlx5/fs.h                           | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 89cce97d46c6..bb2c1cd35fd7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -615,8 +615,8 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
 	return err;
 }
 
-static int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-					struct mlx5_flow_destination *dest)
+int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+				 struct mlx5_flow_destination *dest)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 8dec5508d93d..28a5b662ab6a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -113,4 +113,7 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		   struct mlx5_flow_destination *dest);
 void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
 
+int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+				 struct mlx5_flow_destination *dest);
+
 #endif
-- 
cgit v1.2.3


From d63cd28608bb563d52e62990fa01c016e8dbdb75 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:35 +0300
Subject: net/mlx5: Add user chosen levels when allocating flow tables

Currently, consumers of the flow steering infrastructure can't
choose their own flow table levels and are limited to one
flow table per level. This just waste levels.
Instead, we introduce here the possibility to use multiple
flow tables in a level. The user is free to connect these
flow tables, while following the rule (FTEs in FT of level x
could only point to FTs of level y where y > x).

In addition this patch switch the order of the create/destroy
flow tables of the NIC(vlan and main).

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                 |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c   | 30 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 66 +++++++++++++++--------
 include/linux/mlx5/fs.h                           |  6 ++-
 6 files changed, 70 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 99eb1c1a3b7b..3ff663c35bac 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1438,7 +1438,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 	if (!ft) {
 		ft = mlx5_create_auto_grouped_flow_table(ns, priority,
 							 num_entries,
-							 num_groups);
+							 num_groups,
+							 0);
 
 		if (!IS_ERR(ft)) {
 			prio->refcount = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 4df49e660587..d61171ae0168 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -37,6 +37,11 @@
 #include <linux/mlx5/fs.h>
 #include "en.h"
 
+enum {
+	MLX5E_VLAN_FT_LEVEL = 0,
+	MLX5E_MAIN_FT_LEVEL
+};
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 enum {
@@ -1041,7 +1046,8 @@ static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv)
 	int err;
 
 	ft->num_groups = 0;
-	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_MAIN_TABLE_SIZE);
+	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_MAIN_TABLE_SIZE,
+				       MLX5E_MAIN_FT_LEVEL);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
@@ -1150,7 +1156,8 @@ static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv)
 	int err;
 
 	ft->num_groups = 0;
-	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_VLAN_TABLE_SIZE);
+	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_VLAN_TABLE_SIZE,
+				       MLX5E_VLAN_FT_LEVEL);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
@@ -1167,11 +1174,16 @@ static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv)
 	if (err)
 		goto err_free_g;
 
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+	if (err)
+		goto err_destroy_vlan_flow_groups;
+
 	return 0;
 
+err_destroy_vlan_flow_groups:
+	mlx5e_destroy_groups(ft);
 err_free_g:
 	kfree(ft->g);
-
 err_destroy_vlan_flow_table:
 	mlx5_destroy_flow_table(ft->t);
 	ft->t = NULL;
@@ -1194,15 +1206,11 @@ int mlx5e_create_flow_tables(struct mlx5e_priv *priv)
 	if (!priv->fts.ns)
 		return -EINVAL;
 
-	err = mlx5e_create_vlan_flow_table(priv);
-	if (err)
-		return err;
-
 	err = mlx5e_create_main_flow_table(priv);
 	if (err)
-		goto err_destroy_vlan_flow_table;
+		return err;
 
-	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+	err = mlx5e_create_vlan_flow_table(priv);
 	if (err)
 		goto err_destroy_main_flow_table;
 
@@ -1210,8 +1218,6 @@ int mlx5e_create_flow_tables(struct mlx5e_priv *priv)
 
 err_destroy_main_flow_table:
 	mlx5e_destroy_main_flow_table(priv);
-err_destroy_vlan_flow_table:
-	mlx5e_destroy_vlan_flow_table(priv);
 
 	return err;
 }
@@ -1219,6 +1225,6 @@ err_destroy_vlan_flow_table:
 void mlx5e_destroy_flow_tables(struct mlx5e_priv *priv)
 {
 	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
-	mlx5e_destroy_main_flow_table(priv);
 	mlx5e_destroy_vlan_flow_table(priv);
+	mlx5e_destroy_main_flow_table(priv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b3de09f13425..2137387c043d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -64,7 +64,8 @@ static struct mlx5_flow_rule *mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 		priv->fts.tc.t =
 			mlx5_create_auto_grouped_flow_table(priv->fts.ns, 0,
 							    MLX5E_TC_FLOW_TABLE_NUM_ENTRIES,
-							    MLX5E_TC_FLOW_TABLE_NUM_GROUPS);
+							    MLX5E_TC_FLOW_TABLE_NUM_GROUPS,
+							    0);
 		if (IS_ERR(priv->fts.tc.t)) {
 			netdev_err(priv->netdev,
 				   "Failed to create tc offload table\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index bc3d9f8a75c1..ff91bb5e1c43 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -401,7 +401,7 @@ static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
 	memset(flow_group_in, 0, inlen);
 
 	table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
-	fdb = mlx5_create_flow_table(root_ns, 0, table_size);
+	fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0);
 	if (IS_ERR_OR_NULL(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create FDB Table err %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cfb35c334cd1..ca55d7e30e5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -225,19 +225,6 @@ static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns,
 	return NULL;
 }
 
-static unsigned int find_next_free_level(struct fs_prio *prio)
-{
-	if (!list_empty(&prio->node.children)) {
-		struct mlx5_flow_table *ft;
-
-		ft = list_last_entry(&prio->node.children,
-				     struct mlx5_flow_table,
-				     node.list);
-		return ft->level + 1;
-	}
-	return prio->start_level;
-}
-
 static bool masked_memcmp(void *mask, void *val1, void *val2, size_t size)
 {
 	unsigned int i;
@@ -696,9 +683,23 @@ static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table
 	return err;
 }
 
+static void list_add_flow_table(struct mlx5_flow_table *ft,
+				struct fs_prio *prio)
+{
+	struct list_head *prev = &prio->node.children;
+	struct mlx5_flow_table *iter;
+
+	fs_for_each_ft(iter, prio) {
+		if (iter->level > ft->level)
+			break;
+		prev = &iter->node.list;
+	}
+	list_add(&ft->node.list, prev);
+}
+
 struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
-					       int prio,
-					       int max_fte)
+					       int prio, int max_fte,
+					       u32 level)
 {
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_table *ft;
@@ -719,12 +720,15 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		err = -EINVAL;
 		goto unlock_root;
 	}
-	if (fs_prio->num_ft == fs_prio->num_levels) {
+	if (level >= fs_prio->num_levels) {
 		err = -ENOSPC;
 		goto unlock_root;
 	}
-
-	ft = alloc_flow_table(find_next_free_level(fs_prio),
+	/* The level is related to the
+	 * priority level range.
+	 */
+	level += fs_prio->start_level;
+	ft = alloc_flow_table(level,
 			      roundup_pow_of_two(max_fte),
 			      root->table_type);
 	if (!ft) {
@@ -745,7 +749,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		goto destroy_ft;
 	lock_ref_node(&fs_prio->node);
 	tree_add_node(&ft->node, &fs_prio->node);
-	list_add_tail(&ft->node.list, &fs_prio->node.children);
+	list_add_flow_table(ft, fs_prio);
 	fs_prio->num_ft++;
 	unlock_ref_node(&fs_prio->node);
 	mutex_unlock(&root->chain_lock);
@@ -762,14 +766,15 @@ unlock_root:
 struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 							    int prio,
 							    int num_flow_table_entries,
-							    int max_num_groups)
+							    int max_num_groups,
+							    u32 level)
 {
 	struct mlx5_flow_table *ft;
 
 	if (max_num_groups > num_flow_table_entries)
 		return ERR_PTR(-EINVAL);
 
-	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries);
+	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries, level);
 	if (IS_ERR(ft))
 		return ft;
 
@@ -1068,6 +1073,20 @@ unlock_fg:
 	return rule;
 }
 
+static bool dest_is_valid(struct mlx5_flow_destination *dest,
+			  u32 action,
+			  struct mlx5_flow_table *ft)
+{
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
+		return true;
+
+	if (!dest || ((dest->type ==
+	    MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) &&
+	    (dest->ft->level <= ft->level)))
+		return false;
+	return true;
+}
+
 static struct mlx5_flow_rule *
 _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		    u8 match_criteria_enable,
@@ -1080,7 +1099,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_rule *rule;
 
-	if ((action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) && !dest)
+	if (!dest_is_valid(dest, action, ft))
 		return ERR_PTR(-EINVAL);
 
 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
@@ -1517,6 +1536,7 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns)
 
 #define ANCHOR_PRIO 0
 #define ANCHOR_SIZE 1
+#define ANCHOR_LEVEL 0
 static int create_anchor_flow_table(struct mlx5_core_dev
 							*dev)
 {
@@ -1526,7 +1546,7 @@ static int create_anchor_flow_table(struct mlx5_core_dev
 	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ANCHOR);
 	if (!ns)
 		return -EINVAL;
-	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE);
+	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL);
 	if (IS_ERR(ft)) {
 		mlx5_core_err(dev, "Failed to create last anchor flow table");
 		return PTR_ERR(ft);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 28a5b662ab6a..165ff4f9cc6a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -82,12 +82,14 @@ struct mlx5_flow_table *
 mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 				    int prio,
 				    int num_flow_table_entries,
-				    int max_num_groups);
+				    int max_num_groups,
+				    u32 level);
 
 struct mlx5_flow_table *
 mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
-		       int num_flow_table_entries);
+		       int num_flow_table_entries,
+		       u32 level);
 int mlx5_destroy_flow_table(struct mlx5_flow_table *ft);
 
 /* inbox should be set with the following values:
-- 
cgit v1.2.3


From 5a7b27eb9cf3986f487469b57a3a41286d2e7100 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:39 +0300
Subject: net/mlx5: Initializing CPU reverse mapping

Allocating CPU rmap and add entry for each IRQ.
CPU rmap is used in aRFS to get the RX queue number
of the RX completion interrupts.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 +++
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 18 ++++++++++++++++++
 include/linux/mlx5/driver.h                       |  3 +++
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 518192e59779..8fee224fb98c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1691,6 +1691,9 @@ int mlx5e_open_locked(struct net_device *netdev)
 	mlx5e_redirect_rqts(priv);
 	mlx5e_update_carrier(priv);
 	mlx5e_timestamp_init(priv);
+#ifdef CONFIG_RFS_ACCEL
+	priv->netdev->rx_cpu_rmap = priv->mdev->rmap;
+#endif
 
 	schedule_delayed_work(&priv->update_stats_work, 0);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6892746fd10d..6feef7fb9d6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -48,6 +48,9 @@
 #include <linux/kmod.h>
 #include <linux/delay.h>
 #include <linux/mlx5/mlx5_ifc.h>
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif
 #include "mlx5_core.h"
 #include "fs_core.h"
 #ifdef CONFIG_MLX5_CORE_EN
@@ -665,6 +668,12 @@ static void free_comp_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	struct mlx5_eq *eq, *n;
 
+#ifdef CONFIG_RFS_ACCEL
+	if (dev->rmap) {
+		free_irq_cpu_rmap(dev->rmap);
+		dev->rmap = NULL;
+	}
+#endif
 	spin_lock(&table->lock);
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
 		list_del(&eq->list);
@@ -691,6 +700,11 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 	INIT_LIST_HEAD(&table->comp_eqs_list);
 	ncomp_vec = table->num_comp_vectors;
 	nent = MLX5_COMP_EQ_SIZE;
+#ifdef CONFIG_RFS_ACCEL
+	dev->rmap = alloc_irq_cpu_rmap(ncomp_vec);
+	if (!dev->rmap)
+		return -ENOMEM;
+#endif
 	for (i = 0; i < ncomp_vec; i++) {
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
 		if (!eq) {
@@ -698,6 +712,10 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 			goto clean;
 		}
 
+#ifdef CONFIG_RFS_ACCEL
+		irq_cpu_rmap_add(dev->rmap,
+				 dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector);
+#endif
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 96a428dcac9f..d5529449ef47 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -560,6 +560,9 @@ struct mlx5_core_dev {
 	struct mlx5_profile	*profile;
 	atomic_t		num_qps;
 	u32			issi;
+#ifdef CONFIG_RFS_ACCEL
+	struct cpu_rmap         *rmap;
+#endif
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 03dc76ca1ee5d02401d5a22ed7ddf15b5e9dfe76 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Thu, 28 Apr 2016 20:20:52 -0400
Subject: qed: add infrastructure for device self tests.

This patch adds the functionality and APIs needed for selftests.
It adds the ability to configure the link-mode which is required for the
implementation of loopback tests. It adds the APIs for clock test,
register test, interrupt test and memory test.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |  3 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         | 13 ++++
 drivers/net/ethernet/qlogic/qed/qed_main.c        | 28 +++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         | 42 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         | 22 +++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.c    | 76 +++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.h    | 40 ++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sp.h          | 10 +++
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 21 +++++++
 include/linux/qed/qed_if.h                        | 47 ++++++++++++++
 10 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 5c2fd57236fe..aafa6692e62f 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
-	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o
+	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
+	 qed_selftest.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 5aa78a9ae17f..c4fae71bed11 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -3857,6 +3857,7 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_PHY_CORE_WRITE             0x000e0000
 #define DRV_MSG_CODE_SET_VERSION                0x000f0000
 
+#define DRV_MSG_CODE_BIST_TEST                  0x001e0000
 #define DRV_MSG_CODE_SET_LED_MODE               0x00200000
 
 #define DRV_MSG_SEQ_NUMBER_MASK                 0x0000ffff
@@ -3914,6 +3915,18 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_SET_LED_MODE_ON            0x1
 #define DRV_MB_PARAM_SET_LED_MODE_OFF           0x2
 
+#define DRV_MB_PARAM_BIST_UNKNOWN_TEST          0
+#define DRV_MB_PARAM_BIST_REGISTER_TEST         1
+#define DRV_MB_PARAM_BIST_CLOCK_TEST            2
+
+#define DRV_MB_PARAM_BIST_RC_UNKNOWN            0
+#define DRV_MB_PARAM_BIST_RC_PASSED             1
+#define DRV_MB_PARAM_BIST_RC_FAILED             2
+#define DRV_MB_PARAM_BIST_RC_INVALID_PARAMETER          3
+
+#define DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT      0
+#define DRV_MB_PARAM_BIST_TEST_INDEX_MASK       0x000000FF
+
 	u32 fw_mb_header;
 #define FW_MSG_CODE_MASK                        0xffff0000
 #define FW_MSG_CODE_DRV_LOAD_ENGINE             0x10100000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1918b83f0a97..1b758bdec587 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -28,6 +28,7 @@
 #include "qed_dev_api.h"
 #include "qed_mcp.h"
 #include "qed_hw.h"
+#include "qed_selftest.h"
 
 static char version[] =
 	"QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n";
@@ -976,6 +977,25 @@ static int qed_set_link(struct qed_dev *cdev,
 		else
 			link_params->pause.forced_tx = false;
 	}
+	if (params->override_flags & QED_LINK_OVERRIDE_LOOPBACK_MODE) {
+		switch (params->loopback_mode) {
+		case QED_LINK_LOOPBACK_INT_PHY:
+			link_params->loopback_mode = PMM_LOOPBACK_INT_PHY;
+			break;
+		case QED_LINK_LOOPBACK_EXT_PHY:
+			link_params->loopback_mode = PMM_LOOPBACK_EXT_PHY;
+			break;
+		case QED_LINK_LOOPBACK_EXT:
+			link_params->loopback_mode = PMM_LOOPBACK_EXT;
+			break;
+		case QED_LINK_LOOPBACK_MAC:
+			link_params->loopback_mode = PMM_LOOPBACK_MAC;
+			break;
+		default:
+			link_params->loopback_mode = PMM_LOOPBACK_NONE;
+			break;
+		}
+	}
 
 	rc = qed_mcp_set_link(hwfn, ptt, params->link_up);
 
@@ -1182,7 +1202,15 @@ static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode)
 	return status;
 }
 
+struct qed_selftest_ops qed_selftest_ops_pass = {
+	.selftest_memory = &qed_selftest_memory,
+	.selftest_interrupt = &qed_selftest_interrupt,
+	.selftest_register = &qed_selftest_register,
+	.selftest_clock = &qed_selftest_clock,
+};
+
 const struct qed_common_ops qed_common_ops_pass = {
+	.selftest = &qed_selftest_ops_pass,
 	.probe = &qed_probe,
 	.remove = &qed_remove,
 	.set_power_state = &qed_set_power_state,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index cb46dbdf47dd..2f8309d772c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1017,3 +1017,45 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 
 	return rc;
 }
+
+int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 drv_mb_param = 0, rsp, param;
+	int rc = 0;
+
+	drv_mb_param = (DRV_MB_PARAM_BIST_REGISTER_TEST <<
+			DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT);
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+			 drv_mb_param, &rsp, &param);
+
+	if (rc)
+		return rc;
+
+	if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) ||
+	    (param != DRV_MB_PARAM_BIST_RC_PASSED))
+		rc = -EAGAIN;
+
+	return rc;
+}
+
+int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 drv_mb_param, rsp, param;
+	int rc = 0;
+
+	drv_mb_param = (DRV_MB_PARAM_BIST_CLOCK_TEST <<
+			DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT);
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+			 drv_mb_param, &rsp, &param);
+
+	if (rc)
+		return rc;
+
+	if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) ||
+	    (param != DRV_MB_PARAM_BIST_RC_PASSED))
+		rc = -EAGAIN;
+
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 608bcb2403cb..5f218eed0541 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -245,6 +245,28 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    enum qed_led_mode mode);
 
+/**
+ * @brief Bist register test
+ *
+ *  @param p_hwfn    - hw function
+ *  @param p_ptt     - PTT required for register access
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt);
+
+/**
+ * @brief Bist clock test
+ *
+ *  @param p_hwfn    - hw function
+ *  @param p_ptt     - PTT required for register access
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn,
+			    struct qed_ptt *p_ptt);
+
 /* Using hwfn number (and not pf_num) is required since in CMT mode,
  * same pf_num may be used by two different hwfn
  * TODO - this shouldn't really be in .h file, but until all fields
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.c b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
new file mode 100644
index 000000000000..a342bfe4280d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
@@ -0,0 +1,76 @@
+#include "qed.h"
+#include "qed_dev_api.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+
+int qed_selftest_memory(struct qed_dev *cdev)
+{
+	int rc = 0, i;
+
+	for_each_hwfn(cdev, i) {
+		rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]);
+		if (rc)
+			return rc;
+	}
+
+	return rc;
+}
+
+int qed_selftest_interrupt(struct qed_dev *cdev)
+{
+	int rc = 0, i;
+
+	for_each_hwfn(cdev, i) {
+		rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]);
+		if (rc)
+			return rc;
+	}
+
+	return rc;
+}
+
+int qed_selftest_register(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn;
+	struct qed_ptt *p_ptt;
+	int rc = 0, i;
+
+	/* although performed by MCP, this test is per engine */
+	for_each_hwfn(cdev, i) {
+		p_hwfn = &cdev->hwfns[i];
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt) {
+			DP_ERR(p_hwfn, "failed to acquire ptt\n");
+			return -EBUSY;
+		}
+		rc = qed_mcp_bist_register_test(p_hwfn, p_ptt);
+		qed_ptt_release(p_hwfn, p_ptt);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+int qed_selftest_clock(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn;
+	struct qed_ptt *p_ptt;
+	int rc = 0, i;
+
+	/* although performed by MCP, this test is per engine */
+	for_each_hwfn(cdev, i) {
+		p_hwfn = &cdev->hwfns[i];
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt) {
+			DP_ERR(p_hwfn, "failed to acquire ptt\n");
+			return -EBUSY;
+		}
+		rc = qed_mcp_bist_clock_test(p_hwfn, p_ptt);
+		qed_ptt_release(p_hwfn, p_ptt);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.h b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
new file mode 100644
index 000000000000..50eb0b49950f
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
@@ -0,0 +1,40 @@
+#ifndef _QED_SELFTEST_API_H
+#define _QED_SELFTEST_API_H
+#include <linux/types.h>
+
+/**
+ * @brief qed_selftest_memory - Perform memory test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_memory(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_interrupt - Perform interrupt test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_interrupt(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_register - Perform register test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_register(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_clock - Perform clock test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_clock(struct qed_dev *cdev);
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 4b91cb32f317..eec137f40895 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -369,4 +369,14 @@ int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_tunn_update_params *p_tunn,
 			      enum spq_mode comp_mode,
 			      struct qed_spq_comp_cb *p_comp_data);
+/**
+ * @brief qed_sp_heartbeat_ramrod - Send empty Ramrod
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+
+int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn);
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 7ccd96e5802b..9f9bc10d0f6c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -428,3 +428,24 @@ int qed_sp_pf_stop(struct qed_hwfn *p_hwfn)
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
+
+int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_EMPTY, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index e5de42b62976..d72c832a9397 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -110,6 +110,7 @@ struct qed_link_params {
 #define QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS      BIT(1)
 #define QED_LINK_OVERRIDE_SPEED_FORCED_SPEED    BIT(2)
 #define QED_LINK_OVERRIDE_PAUSE_CONFIG          BIT(3)
+#define QED_LINK_OVERRIDE_LOOPBACK_MODE         BIT(4)
 	u32	override_flags;
 	bool	autoneg;
 	u32	adv_speeds;
@@ -118,6 +119,12 @@ struct qed_link_params {
 #define QED_LINK_PAUSE_RX_ENABLE                BIT(1)
 #define QED_LINK_PAUSE_TX_ENABLE                BIT(2)
 	u32	pause_config;
+#define QED_LINK_LOOPBACK_NONE                  BIT(0)
+#define QED_LINK_LOOPBACK_INT_PHY               BIT(1)
+#define QED_LINK_LOOPBACK_EXT_PHY               BIT(2)
+#define QED_LINK_LOOPBACK_EXT                   BIT(3)
+#define QED_LINK_LOOPBACK_MAC                   BIT(4)
+	u32	loopback_mode;
 };
 
 struct qed_link_output {
@@ -158,7 +165,47 @@ struct qed_common_cb_ops {
 			       struct qed_link_output	*link);
 };
 
+struct qed_selftest_ops {
+/**
+ * @brief selftest_interrupt - Perform interrupt test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_interrupt)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_memory - Perform memory test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_memory)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_register - Perform register test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_register)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_clock - Perform clock test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_clock)(struct qed_dev *cdev);
+};
+
 struct qed_common_ops {
+	struct qed_selftest_ops *selftest;
+
 	struct qed_dev*	(*probe)(struct pci_dev *dev,
 				 enum qed_protocol protocol,
 				 u32 dp_module, u8 dp_level);
-- 
cgit v1.2.3


From c0ef079ca791ef9e057ac748051425a768c9e192 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 03:29:09 +0200
Subject: netdevice: shrink size of struct netdev_queue

- trans_timeout is incremented when tx queue timed out (tx watchdog).
- tx_maxrate is set via sysfs

Moving tx_maxrate to read-mostly part shrinks the struct by 64 bytes.
While at it, also move trans_timeout (it is out-of-place in the
'write-mostly' part).

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 52914a854386..f2182594160e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -569,6 +569,12 @@ struct netdev_queue {
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
 	int			numa_node;
 #endif
+	unsigned long		tx_maxrate;
+	/*
+	 * Number of TX timeouts for this queue
+	 * (/sys/class/net/DEV/Q/trans_timeout)
+	 */
+	unsigned long		trans_timeout;
 /*
  * write-mostly part
  */
@@ -579,18 +585,11 @@ struct netdev_queue {
 	 */
 	unsigned long		trans_start;
 
-	/*
-	 * Number of TX timeouts for this queue
-	 * (/sys/class/net/DEV/Q/trans_timeout)
-	 */
-	unsigned long		trans_timeout;
-
 	unsigned long		state;
 
 #ifdef CONFIG_BQL
 	struct dql		dql;
 #endif
-	unsigned long		tx_maxrate;
 } ____cacheline_aligned_in_smp;
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
-- 
cgit v1.2.3


From 9580bf2edb402b3afaf9c5a4efb6953f993ef52e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 30 Apr 2016 10:19:29 -0700
Subject: net: relax expensive skb_unclone() in iptunnel_handle_offloads()

Locally generated TCP GSO packets having to go through a GRE/SIT/IPIP
tunnel have to go through an expensive skb_unclone()

Reallocating skb->head is a lot of work.

Test should really check if a 'real clone' of the packet was done.

TCP does not care if the original gso_type is changed while the packet
travels in the stack.

This adds skb_header_unclone() which is a variant of skb_clone()
using skb_header_cloned() check instead of skb_cloned().

This variant can probably be used from other points.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 10 ++++++++++
 net/ipv4/ip_tunnel_core.c |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c84a5a1078c5..c413c588a24f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1325,6 +1325,16 @@ static inline int skb_header_cloned(const struct sk_buff *skb)
 	return dataref != 1;
 }
 
+static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
+{
+	might_sleep_if(gfpflags_allow_blocking(pri));
+
+	if (skb_header_cloned(skb))
+		return pskb_expand_head(skb, 0, 0, pri);
+
+	return 0;
+}
+
 /**
  *	skb_header_release - release reference to header
  *	@skb: buffer to operate on
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 786fa7ca28e0..9118b0e640ba 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -157,7 +157,7 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 	}
 
 	if (skb_is_gso(skb)) {
-		err = skb_unclone(skb, GFP_ATOMIC);
+		err = skb_header_unclone(skb, GFP_ATOMIC);
 		if (unlikely(err))
 			return err;
 		skb_shinfo(skb)->gso_type |= gso_type_mask;
-- 
cgit v1.2.3


From efdc810ba39dae0ccce9cb9c1c84ff9b0157ca43 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Tue, 3 May 2016 17:13:54 +0300
Subject: net/mlx5: Flow steering, Add vport ACL support

Update the relevant flow steering device structs and commands to
support vport.
Update the flow steering core API to receive vport number.
Add ingress and egress ACL flow table name spaces.
Add ACL flow table support:
* ACL (Access Control List) flow table is a table that contains
only allow/drop steering rules.

* We have two types of ACL flow tables - ingress and egress.

* ACLs handle traffic sent from/to E-Switch FDB table, Ingress refers to
traffic sent from Vport to E-Switch and Egress refers to traffic sent
from E-Switch to vport.

* Ingress ACL flow table allow/drop rules is checked against traffic
sent from VF.

* Egress ACL flow table allow/drop rules is checked against traffic sent
to VF.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   | 33 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h   |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 85 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  7 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  2 +
 include/linux/mlx5/device.h                        | 12 +++
 include/linux/mlx5/driver.h                        |  2 +
 include/linux/mlx5/fs.h                            |  7 ++
 9 files changed, 142 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index ff91bb5e1c43..dd066199d172 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -845,7 +845,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
 	int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
-	int total_vports = 1 + pci_sriov_get_totalvfs(dev->pdev);
+	int total_vports = MLX5_TOTAL_VPORTS(dev);
 	struct mlx5_eswitch *esw;
 	int vport_num;
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index f46f1db0fc00..9797768891ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -50,6 +50,10 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_SET_FLOW_TABLE_ROOT);
 	MLX5_SET(set_flow_table_root_in, in, table_type, ft->type);
 	MLX5_SET(set_flow_table_root_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport);
+		MLX5_SET(set_flow_table_root_in, in, other_vport, 1);
+	}
 
 	memset(out, 0, sizeof(out));
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
@@ -57,6 +61,7 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 }
 
 int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
+			       u16 vport,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
 			       *next_ft, unsigned int *table_id)
@@ -77,6 +82,10 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 	MLX5_SET(create_flow_table_in, in, table_type, type);
 	MLX5_SET(create_flow_table_in, in, level, level);
 	MLX5_SET(create_flow_table_in, in, log_size, log_size);
+	if (vport) {
+		MLX5_SET(create_flow_table_in, in, vport_number, vport);
+		MLX5_SET(create_flow_table_in, in, other_vport, 1);
+	}
 
 	memset(out, 0, sizeof(out));
 	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
@@ -101,6 +110,10 @@ int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_DESTROY_FLOW_TABLE);
 	MLX5_SET(destroy_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(destroy_flow_table_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_table_in, in, other_vport, 1);
+	}
 
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
 					  sizeof(out));
@@ -120,6 +133,10 @@ int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_MODIFY_FLOW_TABLE);
 	MLX5_SET(modify_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(modify_flow_table_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport);
+		MLX5_SET(modify_flow_table_in, in, other_vport, 1);
+	}
 	MLX5_SET(modify_flow_table_in, in, modify_field_select,
 		 MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID);
 	if (next_ft) {
@@ -148,6 +165,10 @@ int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_CREATE_FLOW_GROUP);
 	MLX5_SET(create_flow_group_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_group_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(create_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(create_flow_group_in, in, other_vport, 1);
+	}
 
 	err = mlx5_cmd_exec_check_status(dev, in,
 					 inlen, out,
@@ -174,6 +195,10 @@ int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev,
 	MLX5_SET(destroy_flow_group_in, in, table_type, ft->type);
 	MLX5_SET(destroy_flow_group_in, in, table_id, ft->id);
 	MLX5_SET(destroy_flow_group_in, in, group_id, group_id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_group_in, in, other_vport, 1);
+	}
 
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
 					  sizeof(out));
@@ -207,6 +232,10 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(set_fte_in, in, table_type, ft->type);
 	MLX5_SET(set_fte_in, in, table_id,   ft->id);
 	MLX5_SET(set_fte_in, in, flow_index, fte->index);
+	if (ft->vport) {
+		MLX5_SET(set_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(set_fte_in, in, other_vport, 1);
+	}
 
 	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
@@ -285,6 +314,10 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(delete_fte_in, in, table_type, ft->type);
 	MLX5_SET(delete_fte_in, in, table_id, ft->id);
 	MLX5_SET(delete_fte_in, in, flow_index, index);
+	if (ft->vport) {
+		MLX5_SET(delete_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(delete_fte_in, in, other_vport, 1);
+	}
 
 	err =  mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 9814d4784803..c97b4a03eeed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -34,6 +34,7 @@
 #define _MLX5_FS_CMD_
 
 int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
+			       u16 vport,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
 			       *next_ft, unsigned int *table_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 4d78d5a48af3..659a6980cda2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -457,7 +457,7 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
 	return fg;
 }
 
-static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte,
+static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte,
 						enum fs_flow_table_type table_type)
 {
 	struct mlx5_flow_table *ft;
@@ -469,6 +469,7 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte,
 	ft->level = level;
 	ft->node.type = FS_TYPE_FLOW_TABLE;
 	ft->type = table_type;
+	ft->vport = vport;
 	ft->max_fte = max_fte;
 	INIT_LIST_HEAD(&ft->fwd_rules);
 	mutex_init(&ft->lock);
@@ -700,9 +701,9 @@ static void list_add_flow_table(struct mlx5_flow_table *ft,
 	list_add(&ft->node.list, prev);
 }
 
-struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
-					       int prio, int max_fte,
-					       u32 level)
+static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+							u16 vport, int prio,
+							int max_fte, u32 level)
 {
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_table *ft;
@@ -732,6 +733,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 	 */
 	level += fs_prio->start_level;
 	ft = alloc_flow_table(level,
+			      vport,
 			      roundup_pow_of_two(max_fte),
 			      root->table_type);
 	if (!ft) {
@@ -742,7 +744,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 	tree_init_node(&ft->node, 1, del_flow_table);
 	log_table_sz = ilog2(ft->max_fte);
 	next_ft = find_next_chained_ft(fs_prio);
-	err = mlx5_cmd_create_flow_table(root->dev, ft->type, ft->level,
+	err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->type, ft->level,
 					 log_table_sz, next_ft, &ft->id);
 	if (err)
 		goto free_ft;
@@ -766,6 +768,20 @@ unlock_root:
 	return ERR_PTR(err);
 }
 
+struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+					       int prio, int max_fte,
+					       u32 level)
+{
+	return __mlx5_create_flow_table(ns, 0, prio, max_fte, level);
+}
+
+struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+						     int prio, int max_fte,
+						     u32 level, u16 vport)
+{
+	return __mlx5_create_flow_table(ns, vport, prio, max_fte, level);
+}
+
 struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 							    int prio,
 							    int num_flow_table_entries,
@@ -1319,6 +1335,16 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			return &dev->priv.fdb_root_ns->ns;
 		else
 			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
+		if (dev->priv.esw_egress_root_ns)
+			return &dev->priv.esw_egress_root_ns->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
+		if (dev->priv.esw_ingress_root_ns)
+			return &dev->priv.esw_ingress_root_ns->ns;
+		else
+			return NULL;
 	default:
 		return NULL;
 	}
@@ -1699,6 +1725,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 {
 	cleanup_root_ns(dev);
 	cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns);
+	cleanup_single_prio_root_ns(dev, dev->priv.esw_egress_root_ns);
+	cleanup_single_prio_root_ns(dev, dev->priv.esw_ingress_root_ns);
 }
 
 static int init_fdb_root_ns(struct mlx5_core_dev *dev)
@@ -1719,6 +1747,38 @@ static int init_fdb_root_ns(struct mlx5_core_dev *dev)
 	}
 }
 
+static int init_egress_acl_root_ns(struct mlx5_core_dev *dev)
+{
+	struct fs_prio *prio;
+
+	dev->priv.esw_egress_root_ns = create_root_ns(dev, FS_FT_ESW_EGRESS_ACL);
+	if (!dev->priv.esw_egress_root_ns)
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&dev->priv.esw_egress_root_ns->ns, 0, MLX5_TOTAL_VPORTS(dev));
+	if (IS_ERR(prio))
+		return PTR_ERR(prio);
+	else
+		return 0;
+}
+
+static int init_ingress_acl_root_ns(struct mlx5_core_dev *dev)
+{
+	struct fs_prio *prio;
+
+	dev->priv.esw_ingress_root_ns = create_root_ns(dev, FS_FT_ESW_INGRESS_ACL);
+	if (!dev->priv.esw_ingress_root_ns)
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&dev->priv.esw_ingress_root_ns->ns, 0, MLX5_TOTAL_VPORTS(dev));
+	if (IS_ERR(prio))
+		return PTR_ERR(prio);
+	else
+		return 0;
+}
+
 int mlx5_init_fs(struct mlx5_core_dev *dev)
 {
 	int err = 0;
@@ -1731,8 +1791,21 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
 		err = init_fdb_root_ns(dev);
 		if (err)
-			cleanup_root_ns(dev);
+			goto err;
+	}
+	if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
+		err = init_egress_acl_root_ns(dev);
+		if (err)
+			goto err;
+	}
+	if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
+		err = init_ingress_acl_root_ns(dev);
+		if (err)
+			goto err;
 	}
 
+	return 0;
+err:
+	mlx5_cleanup_fs(dev);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index d607e564f454..8e76cc505f5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -45,8 +45,10 @@ enum fs_node_type {
 };
 
 enum fs_flow_table_type {
-	FS_FT_NIC_RX	 = 0x0,
-	FS_FT_FDB	 = 0X4,
+	FS_FT_NIC_RX          = 0x0,
+	FS_FT_ESW_EGRESS_ACL  = 0x2,
+	FS_FT_ESW_INGRESS_ACL = 0x3,
+	FS_FT_FDB             = 0X4,
 };
 
 enum fs_fte_status {
@@ -79,6 +81,7 @@ struct mlx5_flow_rule {
 struct mlx5_flow_table {
 	struct fs_node			node;
 	u32				id;
+	u16				vport;
 	unsigned int			max_fte;
 	unsigned int			level;
 	enum fs_flow_table_type		type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 0b0b226c789e..482604bd051c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -42,6 +42,8 @@
 #define DRIVER_VERSION "3.0-1"
 #define DRIVER_RELDATE  "January 2015"
 
+#define MLX5_TOTAL_VPORTS(mdev) (1 + pci_sriov_get_totalvfs(mdev->pdev))
+
 extern int mlx5_core_debug_mask;
 
 #define mlx5_core_dbg(__dev, format, ...)				\
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8fecd6d6f814..ee0d5a937f02 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1349,6 +1349,18 @@ enum mlx5_cap_type {
 #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap)
 
+#define MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_egress.cap)
+
+#define MLX5_CAP_ESW_EGRESS_ACL_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_egress.cap)
+
+#define MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_ingress.cap)
+
+#define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap)
+
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
 		 mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d5529449ef47..9613143f0561 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -518,6 +518,8 @@ struct mlx5_priv {
 	unsigned long		pci_dev_data;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
+	struct mlx5_flow_root_namespace *esw_egress_root_ns;
+	struct mlx5_flow_root_namespace *esw_ingress_root_ns;
 };
 
 enum mlx5_device_state {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 165ff4f9cc6a..6467569ad76e 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -58,6 +58,8 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_LEFTOVERS,
 	MLX5_FLOW_NAMESPACE_ANCHOR,
 	MLX5_FLOW_NAMESPACE_FDB,
+	MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+	MLX5_FLOW_NAMESPACE_ESW_INGRESS,
 };
 
 struct mlx5_flow_table;
@@ -90,6 +92,11 @@ mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
 		       int num_flow_table_entries,
 		       u32 level);
+struct mlx5_flow_table *
+mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+			     int prio,
+			     int num_flow_table_entries,
+			     u32 level, u16 vport);
 int mlx5_destroy_flow_table(struct mlx5_flow_table *ft);
 
 /* inbox should be set with the following values:
-- 
cgit v1.2.3


From ba162f8eed61a7e71e26455ce1cff5b5898a3579 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 16:31:00 +0200
Subject: netdevice: add helper to update trans_start

trans_start exists twice:
- as member of net_device (legacy)
- as member of netdev_queue

In order to get rid of the legacy case, add a helper for the
dev->trans_update (this patch), then convert spots that do

dev->trans_start = jiffies

to use this helper (next patch).

This would then allow us to change the helper so that it updates the
trans_stamp of netdev queue 0 instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcf012637d10..f53412cccbaa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3481,6 +3481,12 @@ static inline void txq_trans_update(struct netdev_queue *txq)
 		txq->trans_start = jiffies;
 }
 
+/* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
+static inline void netif_trans_update(struct net_device *dev)
+{
+	dev->trans_start = jiffies;
+}
+
 /**
  *	netif_tx_lock - grab network device transmit lock
  *	@dev: network device
-- 
cgit v1.2.3


From 9b36627acecd5792e81daf1a3bff8eab39ed45fb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 16:33:14 +0200
Subject: net: remove dev->trans_start

previous patches removed all direct accesses to dev->trans_start,
so change the netif_trans_update helper to update trans_start of
netdev queue 0 instead and then remove trans_start from struct net_device.

AFAICS a lot of the netif_trans_update() invocations are now useless
because they occur in ndo_start_xmit and driver doesn't set LLTX
(i.e. stack already took care of the update).

As I can't test any of them it seems better to just leave them alone.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  2 +-
 include/linux/netdevice.h                   | 15 +++++----------
 net/sched/sch_generic.c                     | 10 +++-------
 3 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8e6c0f2487d7..f6da6b76e678 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -328,7 +328,7 @@ static void i40e_tx_timeout(struct net_device *netdev)
 		unsigned long trans_start;
 
 		q = netdev_get_tx_queue(netdev, i);
-		trans_start = q->trans_start ? : netdev->trans_start;
+		trans_start = q->trans_start;
 		if (netif_xmit_stopped(q) &&
 		    time_after(jiffies,
 			       (trans_start + netdev->watchdog_timeo))) {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f53412cccbaa..63580e6d0df4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -581,7 +581,7 @@ struct netdev_queue {
 	spinlock_t		_xmit_lock ____cacheline_aligned_in_smp;
 	int			xmit_lock_owner;
 	/*
-	 * please use this field instead of dev->trans_start
+	 * Time (in jiffies) of last Tx
 	 */
 	unsigned long		trans_start;
 
@@ -1545,7 +1545,6 @@ enum netdev_priv_flags {
  *
  *	@offload_fwd_mark:	Offload device fwding mark
  *
- *	@trans_start:		Time (in jiffies) of last Tx
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog (see dev_watchdog())
  *	@watchdog_timer:	List of timers
@@ -1794,13 +1793,6 @@ struct net_device {
 #endif
 
 	/* These may be needed for future network-power-down code. */
-
-	/*
-	 * trans_start here is expensive for high speed devices on SMP,
-	 * please use netdev_queue->trans_start instead.
-	 */
-	unsigned long		trans_start;
-
 	struct timer_list	watchdog_timer;
 
 	int __percpu		*pcpu_refcnt;
@@ -3484,7 +3476,10 @@ static inline void txq_trans_update(struct netdev_queue *txq)
 /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
 static inline void netif_trans_update(struct net_device *dev)
 {
-	dev->trans_start = jiffies;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+
+	if (txq->trans_start != jiffies)
+		txq->trans_start = jiffies;
 }
 
 /**
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 70182cfe119c..269dd71b3828 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -227,13 +227,12 @@ unsigned long dev_trans_start(struct net_device *dev)
 
 	if (is_vlan_dev(dev))
 		dev = vlan_dev_real_dev(dev);
-	res = dev->trans_start;
-	for (i = 0; i < dev->num_tx_queues; i++) {
+	res = netdev_get_tx_queue(dev, 0)->trans_start;
+	for (i = 1; i < dev->num_tx_queues; i++) {
 		val = netdev_get_tx_queue(dev, i)->trans_start;
 		if (val && time_after(val, res))
 			res = val;
 	}
-	dev->trans_start = res;
 
 	return res;
 }
@@ -256,10 +255,7 @@ static void dev_watchdog(unsigned long arg)
 				struct netdev_queue *txq;
 
 				txq = netdev_get_tx_queue(dev, i);
-				/*
-				 * old device drivers set dev->trans_start
-				 */
-				trans_start = txq->trans_start ? : dev->trans_start;
+				trans_start = txq->trans_start;
 				if (netif_xmit_stopped(txq) &&
 				    time_after(jiffies, (trans_start +
 							 dev->watchdog_timeo))) {
-- 
cgit v1.2.3


From 46cc6e4976e3d9058490f20d93bc7805f7f2d81e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 3 May 2016 16:56:03 -0700
Subject: tcp: fix lockdep splat in tcp_snd_una_update()

tcp_snd_una_update() and tcp_rcv_nxt_update() call
u64_stats_update_begin() either from process context or BH handler.

This triggers a lockdep splat on 32bit & SMP builds.

We could add u64_stats_update_begin_bh() variant but this would
slow down 32bit builds with useless local_disable_bh() and
local_enable_bh() pairs, since we own the socket lock at this point.

I add sock_owned_by_me() helper to have proper lockdep support
even on 64bit builds, and new u64_stats_update_begin_raw()
and u64_stats_update_end_raw methods.

Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible")
Reported-by: Fabio Estevam <festevam@gmail.com>
Diagnosed-by: Francois Romieu <romieu@fr.zoreil.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 14 ++++++++++++++
 include/net/sock.h             |  7 ++++++-
 net/ipv4/tcp_input.c           | 10 ++++++----
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index df89c9bcba7d..d3a2bb712af3 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -89,6 +89,20 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 #endif
 }
 
+static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	raw_write_seqcount_begin(&syncp->seq);
+#endif
+}
+
+static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	raw_write_seqcount_end(&syncp->seq);
+#endif
+}
+
 static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/include/net/sock.h b/include/net/sock.h
index 45f5b492c658..c9c8b19df27c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1421,11 +1421,16 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
  * accesses from user process context.
  */
 
-static inline bool sock_owned_by_user(const struct sock *sk)
+static inline void sock_owned_by_me(const struct sock *sk)
 {
 #ifdef CONFIG_LOCKDEP
 	WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
 #endif
+}
+
+static inline bool sock_owned_by_user(const struct sock *sk)
+{
+	sock_owned_by_me(sk);
 	return sk->sk_lock.owned;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6171f92be090..a914e0607895 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3355,9 +3355,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 {
 	u32 delta = ack - tp->snd_una;
 
-	u64_stats_update_begin(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
+	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end(&tp->syncp);
+	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3366,9 +3367,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 {
 	u32 delta = seq - tp->rcv_nxt;
 
-	u64_stats_update_begin(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
+	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_received += delta;
-	u64_stats_update_end(&tp->syncp);
+	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }
 
-- 
cgit v1.2.3


From 73898db0430125606c86c798c0627aefef9af9ed Mon Sep 17 00:00:00 2001
From: Haggai Abramovsky <hagaya@mellanox.com>
Date: Wed, 4 May 2016 14:50:15 +0300
Subject: net/mlx4: Avoid wrong virtual mappings

The dma_alloc_coherent() function returns a virtual address which can
be used for coherent access to the underlying memory.  On some
architectures, like arm64, undefined behavior results if this memory is
also accessed via virtual mappings that are not coherent.  Because of
their undefined nature, operations like virt_to_page() return garbage
when passed virtual addresses obtained from dma_alloc_coherent().  Any
subsequent mappings via vmap() of the garbage page values are unusable
and result in bad things like bus errors (synchronous aborts in ARM64
speak).

The mlx4 driver contains code that does the equivalent of:
vmap(virt_to_page(dma_alloc_coherent)), this results in an OOPs when the
device is opened.

Prevent Ethernet driver to run this problematic code by forcing it to
allocate contiguous memory. As for the Infiniband driver, at first we
are trying to allocate contiguous memory, but in case of failure roll
back to work with fragmented memory.

Signed-off-by: Haggai Abramovsky <hagaya@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reported-by: David Daney <david.daney@cavium.com>
Tested-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/qp.c                   | 27 +++++--
 drivers/net/ethernet/mellanox/mlx4/alloc.c        | 93 ++++++++++-------------
 drivers/net/ethernet/mellanox/mlx4/en_cq.c        |  9 +--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_resources.c | 31 --------
 drivers/net/ethernet/mellanox/mlx4/en_rx.c        | 11 +--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c        | 14 +---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h      |  2 -
 include/linux/mlx4/device.h                       |  4 +-
 9 files changed, 68 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index fd97534762b8..81b0e1fbec1d 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -419,7 +419,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 }
 
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp,
+			      bool shrink_wqe)
 {
 	int s;
 
@@ -477,7 +478,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	 * We set WQE size to at least 64 bytes, this way stamping
 	 * invalidates each WQE.
 	 */
-	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+	if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
 	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
 	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
 	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
@@ -642,6 +643,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 {
 	int qpn;
 	int err;
+	struct ib_qp_cap backup_cap;
 	struct mlx4_ib_sqp *sqp;
 	struct mlx4_ib_qp *qp;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -775,7 +777,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 				goto err;
 		}
 
-		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+		memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap));
+		err = set_kernel_sq_size(dev, &init_attr->cap,
+					 qp_type, qp, true);
 		if (err)
 			goto err;
 
@@ -787,9 +791,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			*qp->db.db = 0;
 		}
 
-		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
-			err = -ENOMEM;
-			goto err_db;
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size,
+				   &qp->buf, gfp)) {
+			memcpy(&init_attr->cap, &backup_cap,
+			       sizeof(backup_cap));
+			err = set_kernel_sq_size(dev, &init_attr->cap, qp_type,
+						 qp, false);
+			if (err)
+				goto err_db;
+
+			if (mlx4_buf_alloc(dev->dev, qp->buf_size,
+					   PAGE_SIZE * 2, &qp->buf, gfp)) {
+				err = -ENOMEM;
+				goto err_db;
+			}
 		}
 
 		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index 0c51c69f802f..249a4584401a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -576,41 +576,48 @@ out:
 
 	return res;
 }
-/*
- * Handling for queue buffers -- we allocate a bunch of memory and
- * register it in a memory region at HCA virtual address 0.  If the
- * requested size is > max_direct, we split the allocation into
- * multiple pages, so we don't require too much contiguous memory.
- */
 
-int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
-		   struct mlx4_buf *buf, gfp_t gfp)
+static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size,
+				 struct mlx4_buf *buf, gfp_t gfp)
 {
 	dma_addr_t t;
 
-	if (size <= max_direct) {
-		buf->nbufs        = 1;
-		buf->npages       = 1;
-		buf->page_shift   = get_order(size) + PAGE_SHIFT;
-		buf->direct.buf   = dma_alloc_coherent(&dev->persist->pdev->dev,
-						       size, &t, gfp);
-		if (!buf->direct.buf)
-			return -ENOMEM;
+	buf->nbufs        = 1;
+	buf->npages       = 1;
+	buf->page_shift   = get_order(size) + PAGE_SHIFT;
+	buf->direct.buf   =
+		dma_zalloc_coherent(&dev->persist->pdev->dev,
+				    size, &t, gfp);
+	if (!buf->direct.buf)
+		return -ENOMEM;
 
-		buf->direct.map = t;
+	buf->direct.map = t;
 
-		while (t & ((1 << buf->page_shift) - 1)) {
-			--buf->page_shift;
-			buf->npages *= 2;
-		}
+	while (t & ((1 << buf->page_shift) - 1)) {
+		--buf->page_shift;
+		buf->npages *= 2;
+	}
 
-		memset(buf->direct.buf, 0, size);
+	return 0;
+}
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ *  requested size is > max_direct, we split the allocation into
+ *  multiple pages, so we don't require too much contiguous memory.
+ */
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf, gfp_t gfp)
+{
+	if (size <= max_direct) {
+		return mlx4_buf_direct_alloc(dev, size, buf, gfp);
 	} else {
+		dma_addr_t t;
 		int i;
 
-		buf->direct.buf  = NULL;
-		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-		buf->npages      = buf->nbufs;
+		buf->direct.buf = NULL;
+		buf->nbufs	= (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages	= buf->nbufs;
 		buf->page_shift  = PAGE_SHIFT;
 		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
 					   gfp);
@@ -619,28 +626,12 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 
 		for (i = 0; i < buf->nbufs; ++i) {
 			buf->page_list[i].buf =
-				dma_alloc_coherent(&dev->persist->pdev->dev,
-						   PAGE_SIZE,
-						   &t, gfp);
+				dma_zalloc_coherent(&dev->persist->pdev->dev,
+						    PAGE_SIZE, &t, gfp);
 			if (!buf->page_list[i].buf)
 				goto err_free;
 
 			buf->page_list[i].map = t;
-
-			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
-		}
-
-		if (BITS_PER_LONG == 64) {
-			struct page **pages;
-			pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
-			if (!pages)
-				goto err_free;
-			for (i = 0; i < buf->nbufs; ++i)
-				pages[i] = virt_to_page(buf->page_list[i].buf);
-			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
-			kfree(pages);
-			if (!buf->direct.buf)
-				goto err_free;
 		}
 	}
 
@@ -655,15 +646,11 @@ EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
 
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
 {
-	int i;
-
-	if (buf->nbufs == 1)
+	if (buf->nbufs == 1) {
 		dma_free_coherent(&dev->persist->pdev->dev, size,
-				  buf->direct.buf,
-				  buf->direct.map);
-	else {
-		if (BITS_PER_LONG == 64)
-			vunmap(buf->direct.buf);
+				  buf->direct.buf, buf->direct.map);
+	} else {
+		int i;
 
 		for (i = 0; i < buf->nbufs; ++i)
 			if (buf->page_list[i].buf)
@@ -789,7 +776,7 @@ void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
 EXPORT_SYMBOL_GPL(mlx4_db_free);
 
 int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
-		       int size, int max_direct)
+		       int size)
 {
 	int err;
 
@@ -799,7 +786,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
 
 	*wqres->db.db = 0;
 
-	err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL);
+	err = mlx4_buf_direct_alloc(dev, size, &wqres->buf, GFP_KERNEL);
 	if (err)
 		goto err_db;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index af975a2b74c6..132cea655920 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -73,22 +73,16 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv,
 	 */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
 	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
-				cq->buf_size, 2 * PAGE_SIZE);
+				cq->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err)
 		goto err_cq;
 
-	err = mlx4_en_map_buffer(&cq->wqres.buf);
-	if (err)
-		goto err_res;
-
 	cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf;
 	*pcq = cq;
 
 	return 0;
 
-err_res:
-	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
 err_cq:
 	kfree(cq);
 	*pcq = NULL;
@@ -177,7 +171,6 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_cq *cq = *pcq;
 
-	mlx4_en_unmap_buffer(&cq->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
 	if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) &&
 	    cq->is_tx == RX)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6f28ac58251c..92e0624f4cf0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2928,7 +2928,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 
 	/* Allocate page for receive rings */
 	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
-				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+				MLX4_EN_PAGE_SIZE);
 	if (err) {
 		en_err(priv, "Failed to allocate page for rx qps\n");
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
index 02e925d6f734..a6b0db0e0383 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -107,37 +107,6 @@ int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
 	return ret;
 }
 
-int mlx4_en_map_buffer(struct mlx4_buf *buf)
-{
-	struct page **pages;
-	int i;
-
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
-		return 0;
-
-	pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	for (i = 0; i < buf->nbufs; ++i)
-		pages[i] = virt_to_page(buf->page_list[i].buf);
-
-	buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
-	kfree(pages);
-	if (!buf->direct.buf)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
-{
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
-		return;
-
-	vunmap(buf->direct.buf);
-}
-
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
 {
     return;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b723e3bcab39..8ef6875b6cf9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -394,17 +394,11 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 
 	/* Allocate HW buffers on provided NUMA node */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
-	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
-				 ring->buf_size, 2 * PAGE_SIZE);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err)
 		goto err_info;
 
-	err = mlx4_en_map_buffer(&ring->wqres.buf);
-	if (err) {
-		en_err(priv, "Failed to map RX buffer\n");
-		goto err_hwq;
-	}
 	ring->buf = ring->wqres.buf.direct.buf;
 
 	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;
@@ -412,8 +406,6 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 	*pring = ring;
 	return 0;
 
-err_hwq:
-	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 err_info:
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
@@ -517,7 +509,6 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring = *pring;
 
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 0f206a95429c..f6e61570cb2c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -94,20 +94,13 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
 	/* Allocate HW buffers on provided NUMA node */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
-	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
-				 2 * PAGE_SIZE);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err) {
 		en_err(priv, "Failed allocating hwq resources\n");
 		goto err_bounce;
 	}
 
-	err = mlx4_en_map_buffer(&ring->wqres.buf);
-	if (err) {
-		en_err(priv, "Failed to map TX buffer\n");
-		goto err_hwq_res;
-	}
-
 	ring->buf = ring->wqres.buf.direct.buf;
 
 	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
@@ -118,7 +111,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 				    MLX4_RESERVE_ETH_BF_QP);
 	if (err) {
 		en_err(priv, "failed reserving qp for TX ring\n");
-		goto err_map;
+		goto err_hwq_res;
 	}
 
 	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
@@ -155,8 +148,6 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
 err_reserve:
 	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
-err_map:
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 err_bounce:
@@ -183,7 +174,6 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 	mlx4_qp_remove(mdev->dev, &ring->qp);
 	mlx4_qp_free(mdev->dev, &ring->qp);
 	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 63b1aeae2c03..cc84e09f324a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -672,8 +672,6 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 		int is_tx, int rss, int qpn, int cqn, int user_prio,
 		struct mlx4_qp_context *context);
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
-int mlx4_en_map_buffer(struct mlx4_buf *buf);
-void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
 int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
 			    int loopback);
 void mlx4_en_calc_rx_buf(struct net_device *dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index d1f904c8b2cb..80dec87a94f8 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1058,7 +1058,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
 static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
 {
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+	if (buf->nbufs == 1)
 		return buf->direct.buf + offset;
 	else
 		return buf->page_list[offset >> PAGE_SHIFT].buf +
@@ -1098,7 +1098,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order,
 void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db);
 
 int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
-		       int size, int max_direct);
+		       int size);
 void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
 		       int size);
 
-- 
cgit v1.2.3


From db58ba45920255e967cc1d62a430cebd634b5046 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:12 -0700
Subject: bpf: wire in data and data_end for cls_act_bpf

allow cls_bpf and act_bpf programs access skb->data and skb->data_end pointers.
The bpf helpers that change skb->data need to update data_end pointer as well.
The verifier checks that programs always reload data, data_end pointers
after calls to such bpf helpers.
We cannot add 'data_end' pointer to struct qdisc_skb_cb directly,
since it's embedded as-is by infiniband ipoib, so wrapper struct is needed.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 16 ++++++++++++++++
 net/core/filter.c      | 51 ++++++++++++++++++++++++++++++++++++++++++++------
 net/sched/act_bpf.c    |  2 ++
 net/sched/cls_bpf.c    |  2 ++
 4 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43aa1f8855c7..ec1411c89105 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -352,6 +352,22 @@ struct sk_filter {
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
+struct bpf_skb_data_end {
+	struct qdisc_skb_cb qdisc_cb;
+	void *data_end;
+};
+
+/* compute the linear packet data range [data, data_end) which
+ * will be accessed by cls_bpf and act_bpf programs
+ */
+static inline void bpf_compute_data_end(struct sk_buff *skb)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
+	cb->data_end = skb->data + skb_headlen(skb);
+}
+
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
diff --git a/net/core/filter.c b/net/core/filter.c
index 218e5de8c402..71c2a1f473ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1344,6 +1344,21 @@ struct bpf_scratchpad {
 
 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+					unsigned int write_len)
+{
+	int err;
+
+	if (!skb_cloned(skb))
+		return 0;
+	if (skb_clone_writable(skb, write_len))
+		return 0;
+	err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+	if (!err)
+		bpf_compute_data_end(skb);
+	return err;
+}
+
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 {
 	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
@@ -1366,7 +1381,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 */
 	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + len)))
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, sp->buff);
@@ -1444,7 +1459,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1499,7 +1514,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1699,12 +1714,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	__be16 vlan_proto = (__force __be16) r2;
+	int ret;
 
 	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
 		     vlan_proto != htons(ETH_P_8021AD)))
 		vlan_proto = htons(ETH_P_8021Q);
 
-	return skb_vlan_push(skb, vlan_proto, vlan_tci);
+	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+	bpf_compute_data_end(skb);
+	return ret;
 }
 
 const struct bpf_func_proto bpf_skb_vlan_push_proto = {
@@ -1720,8 +1738,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
 static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	int ret;
 
-	return skb_vlan_pop(skb);
+	ret = skb_vlan_pop(skb);
+	bpf_compute_data_end(skb);
+	return ret;
 }
 
 const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
@@ -2066,8 +2087,12 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type)
 {
-	if (off == offsetof(struct __sk_buff, tc_classid))
+	switch (off) {
+	case offsetof(struct __sk_buff, tc_classid):
+	case offsetof(struct __sk_buff, data):
+	case offsetof(struct __sk_buff, data_end):
 		return false;
+	}
 
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -2215,6 +2240,20 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
 		break;
 
+	case offsetof(struct __sk_buff, data):
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+				      dst_reg, src_reg,
+				      offsetof(struct sk_buff, data));
+		break;
+
+	case offsetof(struct __sk_buff, data_end):
+		ctx_off -= offsetof(struct __sk_buff, data_end);
+		ctx_off += offsetof(struct sk_buff, cb);
+		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
+				      dst_reg, src_reg, ctx_off);
+		break;
+
 	case offsetof(struct __sk_buff, tc_index):
 #ifdef CONFIG_NET_SCHED
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 4fd703362563..c7123e01c2ca 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 425fe6a0eda3..7b342c779da7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
-- 
cgit v1.2.3


From 43315f31adc2bf3b35e04dcf2372c3bb08014ed1 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 6 May 2016 07:09:07 -0700
Subject: soc: qcom: smd: Introduce compile stubs

Introduce compile stubs for the SMD API, allowing consumers to be
compile tested.

Acked-by: Andy Gross <andy.gross@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/soc/qcom/smd.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index d0cb6d189a0a..46a984f5e3a3 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -45,13 +45,39 @@ struct qcom_smd_driver {
 	int (*callback)(struct qcom_smd_device *, const void *, size_t);
 };
 
+#if IS_ENABLED(CONFIG_QCOM_SMD)
+
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
 void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
+int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
+
+#else
+
+static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv)
+{
+	return -ENXIO;
+}
+
+static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+}
+
+static inline int qcom_smd_send(struct qcom_smd_channel *channel,
+				const void *data, int len)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+	return -ENXIO;
+}
+
+#endif
+
 #define module_qcom_smd_driver(__smd_driver) \
 	module_driver(__smd_driver, qcom_smd_driver_register, \
 		      qcom_smd_driver_unregister)
 
-int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
 #endif
-- 
cgit v1.2.3


From bdabad3e363d825ddf9679dd431cca0b2c30f881 Mon Sep 17 00:00:00 2001
From: Courtney Cavin <courtney.cavin@sonymobile.com>
Date: Fri, 6 May 2016 07:09:08 -0700
Subject: net: Add Qualcomm IPC router

Add an implementation of Qualcomm's IPC router protocol, used to
communicate with service providing remote processors.

Signed-off-by: Courtney Cavin <courtney.cavin@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
[bjorn: Cope with 0 being a valid node id and implement RTM_NEWADDR]
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h    |    4 +-
 include/uapi/linux/qrtr.h |   12 +
 net/Kconfig               |    1 +
 net/Makefile              |    1 +
 net/qrtr/Kconfig          |   24 ++
 net/qrtr/Makefile         |    2 +
 net/qrtr/qrtr.c           | 1007 +++++++++++++++++++++++++++++++++++++++++++++
 net/qrtr/qrtr.h           |   31 ++
 net/qrtr/smd.c            |  117 ++++++
 9 files changed, 1198 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/qrtr.h
 create mode 100644 net/qrtr/Kconfig
 create mode 100644 net/qrtr/Makefile
 create mode 100644 net/qrtr/qrtr.c
 create mode 100644 net/qrtr/qrtr.h
 create mode 100644 net/qrtr/smd.c

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 73bf6c6a833b..b5cc5a6d7011 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -201,8 +201,9 @@ struct ucred {
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
 #define AF_KCM		41	/* Kernel Connection Multiplexor*/
+#define AF_QIPCRTR	42	/* Qualcomm IPC Router          */
 
-#define AF_MAX		42	/* For now.. */
+#define AF_MAX		43	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -249,6 +250,7 @@ struct ucred {
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
 #define PF_KCM		AF_KCM
+#define PF_QIPCRTR	AF_QIPCRTR
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
new file mode 100644
index 000000000000..66c0748d26e2
--- /dev/null
+++ b/include/uapi/linux/qrtr.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_QRTR_H
+#define _LINUX_QRTR_H
+
+#include <linux/socket.h>
+
+struct sockaddr_qrtr {
+	__kernel_sa_family_t sq_family;
+	__u32 sq_node;
+	__u32 sq_port;
+};
+
+#endif /* _LINUX_QRTR_H */
diff --git a/net/Kconfig b/net/Kconfig
index a8934d8c8fda..b841c42e5c9b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -236,6 +236,7 @@ source "net/mpls/Kconfig"
 source "net/hsr/Kconfig"
 source "net/switchdev/Kconfig"
 source "net/l3mdev/Kconfig"
+source "net/qrtr/Kconfig"
 
 config RPS
 	bool
diff --git a/net/Makefile b/net/Makefile
index 81d14119eab5..bdd14553a774 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -78,3 +78,4 @@ endif
 ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
 obj-y				+= l3mdev/
 endif
+obj-$(CONFIG_QRTR)		+= qrtr/
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
new file mode 100644
index 000000000000..673fd1f86ebe
--- /dev/null
+++ b/net/qrtr/Kconfig
@@ -0,0 +1,24 @@
+# Qualcomm IPC Router configuration
+#
+
+config QRTR
+	tristate "Qualcomm IPC Router support"
+	depends on ARCH_QCOM || COMPILE_TEST
+	---help---
+	  Say Y if you intend to use Qualcomm IPC router protocol.  The
+	  protocol is used to communicate with services provided by other
+	  hardware blocks in the system.
+
+	  In order to do service lookups, a userspace daemon is required to
+	  maintain a service listing.
+
+if QRTR
+
+config QRTR_SMD
+	tristate "SMD IPC Router channels"
+	depends on QCOM_SMD || COMPILE_TEST
+	---help---
+	  Say Y here to support SMD based ipcrouter channels.  SMD is the
+	  most common transport for IPC Router.
+
+endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
new file mode 100644
index 000000000000..6c00dc623b7e
--- /dev/null
+++ b/net/qrtr/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_QRTR) := qrtr.o
+obj-$(CONFIG_QRTR_SMD) += smd.o
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
new file mode 100644
index 000000000000..c985ecbe9bd6
--- /dev/null
+++ b/net/qrtr/qrtr.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/qrtr.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+
+#include <net/sock.h>
+
+#include "qrtr.h"
+
+#define QRTR_PROTO_VER 1
+
+/* auto-bind range */
+#define QRTR_MIN_EPH_SOCKET 0x4000
+#define QRTR_MAX_EPH_SOCKET 0x7fff
+
+enum qrtr_pkt_type {
+	QRTR_TYPE_DATA		= 1,
+	QRTR_TYPE_HELLO		= 2,
+	QRTR_TYPE_BYE		= 3,
+	QRTR_TYPE_NEW_SERVER	= 4,
+	QRTR_TYPE_DEL_SERVER	= 5,
+	QRTR_TYPE_DEL_CLIENT	= 6,
+	QRTR_TYPE_RESUME_TX	= 7,
+	QRTR_TYPE_EXIT		= 8,
+	QRTR_TYPE_PING		= 9,
+};
+
+/**
+ * struct qrtr_hdr - (I|R)PCrouter packet header
+ * @version: protocol version
+ * @type: packet type; one of QRTR_TYPE_*
+ * @src_node_id: source node
+ * @src_port_id: source port
+ * @confirm_rx: boolean; whether a resume-tx packet should be send in reply
+ * @size: length of packet, excluding this header
+ * @dst_node_id: destination node
+ * @dst_port_id: destination port
+ */
+struct qrtr_hdr {
+	__le32 version;
+	__le32 type;
+	__le32 src_node_id;
+	__le32 src_port_id;
+	__le32 confirm_rx;
+	__le32 size;
+	__le32 dst_node_id;
+	__le32 dst_port_id;
+} __packed;
+
+#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr)
+#define QRTR_NODE_BCAST ((unsigned int)-1)
+#define QRTR_PORT_CTRL ((unsigned int)-2)
+
+struct qrtr_sock {
+	/* WARNING: sk must be the first member */
+	struct sock sk;
+	struct sockaddr_qrtr us;
+	struct sockaddr_qrtr peer;
+};
+
+static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
+{
+	BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0);
+	return container_of(sk, struct qrtr_sock, sk);
+}
+
+static unsigned int qrtr_local_nid = -1;
+
+/* for node ids */
+static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
+/* broadcast list */
+static LIST_HEAD(qrtr_all_nodes);
+/* lock for qrtr_nodes, qrtr_all_nodes and node reference */
+static DEFINE_MUTEX(qrtr_node_lock);
+
+/* local port allocation management */
+static DEFINE_IDR(qrtr_ports);
+static DEFINE_MUTEX(qrtr_port_lock);
+
+/**
+ * struct qrtr_node - endpoint node
+ * @ep_lock: lock for endpoint management and callbacks
+ * @ep: endpoint
+ * @ref: reference count for node
+ * @nid: node id
+ * @rx_queue: receive queue
+ * @work: scheduled work struct for recv work
+ * @item: list item for broadcast list
+ */
+struct qrtr_node {
+	struct mutex ep_lock;
+	struct qrtr_endpoint *ep;
+	struct kref ref;
+	unsigned int nid;
+
+	struct sk_buff_head rx_queue;
+	struct work_struct work;
+	struct list_head item;
+};
+
+/* Release node resources and free the node.
+ *
+ * Do not call directly, use qrtr_node_release.  To be used with
+ * kref_put_mutex.  As such, the node mutex is expected to be locked on call.
+ */
+static void __qrtr_node_release(struct kref *kref)
+{
+	struct qrtr_node *node = container_of(kref, struct qrtr_node, ref);
+
+	if (node->nid != QRTR_EP_NID_AUTO)
+		radix_tree_delete(&qrtr_nodes, node->nid);
+
+	list_del(&node->item);
+	mutex_unlock(&qrtr_node_lock);
+
+	skb_queue_purge(&node->rx_queue);
+	kfree(node);
+}
+
+/* Increment reference to node. */
+static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node)
+{
+	if (node)
+		kref_get(&node->ref);
+	return node;
+}
+
+/* Decrement reference to node and release as necessary. */
+static void qrtr_node_release(struct qrtr_node *node)
+{
+	if (!node)
+		return;
+	kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock);
+}
+
+/* Pass an outgoing packet socket buffer to the endpoint driver. */
+static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	int rc = -ENODEV;
+
+	mutex_lock(&node->ep_lock);
+	if (node->ep)
+		rc = node->ep->xmit(node->ep, skb);
+	else
+		kfree_skb(skb);
+	mutex_unlock(&node->ep_lock);
+
+	return rc;
+}
+
+/* Lookup node by id.
+ *
+ * callers must release with qrtr_node_release()
+ */
+static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
+{
+	struct qrtr_node *node;
+
+	mutex_lock(&qrtr_node_lock);
+	node = radix_tree_lookup(&qrtr_nodes, nid);
+	node = qrtr_node_acquire(node);
+	mutex_unlock(&qrtr_node_lock);
+
+	return node;
+}
+
+/* Assign node id to node.
+ *
+ * This is mostly useful for automatic node id assignment, based on
+ * the source id in the incoming packet.
+ */
+static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid)
+{
+	if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO)
+		return;
+
+	mutex_lock(&qrtr_node_lock);
+	radix_tree_insert(&qrtr_nodes, nid, node);
+	node->nid = nid;
+	mutex_unlock(&qrtr_node_lock);
+}
+
+/**
+ * qrtr_endpoint_post() - post incoming data
+ * @ep: endpoint handle
+ * @data: data pointer
+ * @len: size of data in bytes
+ *
+ * Return: 0 on success; negative error code on failure
+ */
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
+{
+	struct qrtr_node *node = ep->node;
+	const struct qrtr_hdr *phdr = data;
+	struct sk_buff *skb;
+	unsigned int psize;
+	unsigned int size;
+	unsigned int type;
+	unsigned int ver;
+	unsigned int dst;
+
+	if (len < QRTR_HDR_SIZE || len & 3)
+		return -EINVAL;
+
+	ver = le32_to_cpu(phdr->version);
+	size = le32_to_cpu(phdr->size);
+	type = le32_to_cpu(phdr->type);
+	dst = le32_to_cpu(phdr->dst_port_id);
+
+	psize = (size + 3) & ~3;
+
+	if (ver != QRTR_PROTO_VER)
+		return -EINVAL;
+
+	if (len != psize + QRTR_HDR_SIZE)
+		return -EINVAL;
+
+	if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA)
+		return -EINVAL;
+
+	skb = netdev_alloc_skb(NULL, len);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reset_transport_header(skb);
+	memcpy(skb_put(skb, len), data, len);
+
+	skb_queue_tail(&node->rx_queue, skb);
+	schedule_work(&node->work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_post);
+
+/* Allocate and construct a resume-tx packet. */
+static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
+					    u32 dst_node, u32 port)
+{
+	const int pkt_len = 20;
+	struct qrtr_hdr *hdr;
+	struct sk_buff *skb;
+	u32 *buf;
+
+	skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+	skb_reset_transport_header(skb);
+
+	hdr = (struct qrtr_hdr *)skb_put(skb, QRTR_HDR_SIZE);
+	hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+	hdr->type = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+	hdr->src_node_id = cpu_to_le32(src_node);
+	hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+	hdr->confirm_rx = cpu_to_le32(0);
+	hdr->size = cpu_to_le32(pkt_len);
+	hdr->dst_node_id = cpu_to_le32(dst_node);
+	hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+
+	buf = (u32 *)skb_put(skb, pkt_len);
+	memset(buf, 0, pkt_len);
+	buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+	buf[1] = cpu_to_le32(src_node);
+	buf[2] = cpu_to_le32(port);
+
+	return skb;
+}
+
+static struct qrtr_sock *qrtr_port_lookup(int port);
+static void qrtr_port_put(struct qrtr_sock *ipc);
+
+/* Handle and route a received packet.
+ *
+ * This will auto-reply with resume-tx packet as necessary.
+ */
+static void qrtr_node_rx_work(struct work_struct *work)
+{
+	struct qrtr_node *node = container_of(work, struct qrtr_node, work);
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&node->rx_queue)) != NULL) {
+		const struct qrtr_hdr *phdr;
+		u32 dst_node, dst_port;
+		struct qrtr_sock *ipc;
+		u32 src_node;
+		int confirm;
+
+		phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+		src_node = le32_to_cpu(phdr->src_node_id);
+		dst_node = le32_to_cpu(phdr->dst_node_id);
+		dst_port = le32_to_cpu(phdr->dst_port_id);
+		confirm = !!phdr->confirm_rx;
+
+		qrtr_node_assign(node, src_node);
+
+		ipc = qrtr_port_lookup(dst_port);
+		if (!ipc) {
+			kfree_skb(skb);
+		} else {
+			if (sock_queue_rcv_skb(&ipc->sk, skb))
+				kfree_skb(skb);
+
+			qrtr_port_put(ipc);
+		}
+
+		if (confirm) {
+			skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port);
+			if (!skb)
+				break;
+			if (qrtr_node_enqueue(node, skb))
+				break;
+		}
+	}
+}
+
+/**
+ * qrtr_endpoint_register() - register a new endpoint
+ * @ep: endpoint to register
+ * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment
+ * Return: 0 on success; negative error code on failure
+ *
+ * The specified endpoint must have the xmit function pointer set on call.
+ */
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid)
+{
+	struct qrtr_node *node;
+
+	if (!ep || !ep->xmit)
+		return -EINVAL;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	INIT_WORK(&node->work, qrtr_node_rx_work);
+	kref_init(&node->ref);
+	mutex_init(&node->ep_lock);
+	skb_queue_head_init(&node->rx_queue);
+	node->nid = QRTR_EP_NID_AUTO;
+	node->ep = ep;
+
+	qrtr_node_assign(node, nid);
+
+	mutex_lock(&qrtr_node_lock);
+	list_add(&node->item, &qrtr_all_nodes);
+	mutex_unlock(&qrtr_node_lock);
+	ep->node = node;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_register);
+
+/**
+ * qrtr_endpoint_unregister - unregister endpoint
+ * @ep: endpoint to unregister
+ */
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
+{
+	struct qrtr_node *node = ep->node;
+
+	mutex_lock(&node->ep_lock);
+	node->ep = NULL;
+	mutex_unlock(&node->ep_lock);
+
+	qrtr_node_release(node);
+	ep->node = NULL;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister);
+
+/* Lookup socket by port.
+ *
+ * Callers must release with qrtr_port_put()
+ */
+static struct qrtr_sock *qrtr_port_lookup(int port)
+{
+	struct qrtr_sock *ipc;
+
+	if (port == QRTR_PORT_CTRL)
+		port = 0;
+
+	mutex_lock(&qrtr_port_lock);
+	ipc = idr_find(&qrtr_ports, port);
+	if (ipc)
+		sock_hold(&ipc->sk);
+	mutex_unlock(&qrtr_port_lock);
+
+	return ipc;
+}
+
+/* Release acquired socket. */
+static void qrtr_port_put(struct qrtr_sock *ipc)
+{
+	sock_put(&ipc->sk);
+}
+
+/* Remove port assignment. */
+static void qrtr_port_remove(struct qrtr_sock *ipc)
+{
+	int port = ipc->us.sq_port;
+
+	if (port == QRTR_PORT_CTRL)
+		port = 0;
+
+	__sock_put(&ipc->sk);
+
+	mutex_lock(&qrtr_port_lock);
+	idr_remove(&qrtr_ports, port);
+	mutex_unlock(&qrtr_port_lock);
+}
+
+/* Assign port number to socket.
+ *
+ * Specify port in the integer pointed to by port, and it will be adjusted
+ * on return as necesssary.
+ *
+ * Port may be:
+ *   0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET]
+ *   <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN
+ *   >QRTR_MIN_EPH_SOCKET: Specified; available to all
+ */
+static int qrtr_port_assign(struct qrtr_sock *ipc, int *port)
+{
+	int rc;
+
+	mutex_lock(&qrtr_port_lock);
+	if (!*port) {
+		rc = idr_alloc(&qrtr_ports, ipc,
+			       QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1,
+			       GFP_ATOMIC);
+		if (rc >= 0)
+			*port = rc;
+	} else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) {
+		rc = -EACCES;
+	} else if (*port == QRTR_PORT_CTRL) {
+		rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC);
+	} else {
+		rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC);
+		if (rc >= 0)
+			*port = rc;
+	}
+	mutex_unlock(&qrtr_port_lock);
+
+	if (rc == -ENOSPC)
+		return -EADDRINUSE;
+	else if (rc < 0)
+		return rc;
+
+	sock_hold(&ipc->sk);
+
+	return 0;
+}
+
+/* Bind socket to address.
+ *
+ * Socket should be locked upon call.
+ */
+static int __qrtr_bind(struct socket *sock,
+		       const struct sockaddr_qrtr *addr, int zapped)
+{
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int port;
+	int rc;
+
+	/* rebinding ok */
+	if (!zapped && addr->sq_port == ipc->us.sq_port)
+		return 0;
+
+	port = addr->sq_port;
+	rc = qrtr_port_assign(ipc, &port);
+	if (rc)
+		return rc;
+
+	/* unbind previous, if any */
+	if (!zapped)
+		qrtr_port_remove(ipc);
+	ipc->us.sq_port = port;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	return 0;
+}
+
+/* Auto bind to an ephemeral port. */
+static int qrtr_autobind(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_qrtr addr;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return 0;
+
+	addr.sq_family = AF_QIPCRTR;
+	addr.sq_node = qrtr_local_nid;
+	addr.sq_port = 0;
+
+	return __qrtr_bind(sock, &addr, 1);
+}
+
+/* Bind socket to specified sockaddr. */
+static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int rc;
+
+	if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+		return -EINVAL;
+
+	if (addr->sq_node != ipc->us.sq_node)
+		return -EINVAL;
+
+	lock_sock(sk);
+	rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED));
+	release_sock(sk);
+
+	return rc;
+}
+
+/* Queue packet to local peer socket. */
+static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	const struct qrtr_hdr *phdr;
+	struct qrtr_sock *ipc;
+
+	phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+
+	ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id));
+	if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	if (sock_queue_rcv_skb(&ipc->sk, skb)) {
+		qrtr_port_put(ipc);
+		kfree_skb(skb);
+		return -ENOSPC;
+	}
+
+	qrtr_port_put(ipc);
+
+	return 0;
+}
+
+/* Queue packet for broadcast. */
+static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	struct sk_buff *skbn;
+
+	mutex_lock(&qrtr_node_lock);
+	list_for_each_entry(node, &qrtr_all_nodes, item) {
+		skbn = skb_clone(skb, GFP_KERNEL);
+		if (!skbn)
+			break;
+		skb_set_owner_w(skbn, skb->sk);
+		qrtr_node_enqueue(node, skbn);
+	}
+	mutex_unlock(&qrtr_node_lock);
+
+	qrtr_local_enqueue(node, skb);
+
+	return 0;
+}
+
+static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+	int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	struct qrtr_node *node;
+	struct qrtr_hdr *hdr;
+	struct sk_buff *skb;
+	size_t plen;
+	int rc;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT))
+		return -EINVAL;
+
+	if (len > 65535)
+		return -EMSGSIZE;
+
+	lock_sock(sk);
+
+	if (addr) {
+		if (msg->msg_namelen < sizeof(*addr)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+
+		if (addr->sq_family != AF_QIPCRTR) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+
+		rc = qrtr_autobind(sock);
+		if (rc) {
+			release_sock(sk);
+			return rc;
+		}
+	} else if (sk->sk_state == TCP_ESTABLISHED) {
+		addr = &ipc->peer;
+	} else {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	node = NULL;
+	if (addr->sq_node == QRTR_NODE_BCAST) {
+		enqueue_fn = qrtr_bcast_enqueue;
+	} else if (addr->sq_node == ipc->us.sq_node) {
+		enqueue_fn = qrtr_local_enqueue;
+	} else {
+		enqueue_fn = qrtr_node_enqueue;
+		node = qrtr_node_lookup(addr->sq_node);
+		if (!node) {
+			release_sock(sk);
+			return -ECONNRESET;
+		}
+	}
+
+	plen = (len + 3) & ~3;
+	skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE,
+				  msg->msg_flags & MSG_DONTWAIT, &rc);
+	if (!skb)
+		goto out_node;
+
+	skb_reset_transport_header(skb);
+	skb_put(skb, len + QRTR_HDR_SIZE);
+
+	hdr = (struct qrtr_hdr *)skb_transport_header(skb);
+	hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+	hdr->src_node_id = cpu_to_le32(ipc->us.sq_node);
+	hdr->src_port_id = cpu_to_le32(ipc->us.sq_port);
+	hdr->confirm_rx = cpu_to_le32(0);
+	hdr->size = cpu_to_le32(len);
+	hdr->dst_node_id = cpu_to_le32(addr->sq_node);
+	hdr->dst_port_id = cpu_to_le32(addr->sq_port);
+
+	rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE,
+					 &msg->msg_iter, len);
+	if (rc) {
+		kfree_skb(skb);
+		goto out_node;
+	}
+
+	if (plen != len) {
+		skb_pad(skb, plen - len);
+		skb_put(skb, plen - len);
+	}
+
+	if (ipc->us.sq_port == QRTR_PORT_CTRL) {
+		if (len < 4) {
+			rc = -EINVAL;
+			kfree_skb(skb);
+			goto out_node;
+		}
+
+		/* control messages already require the type as 'command' */
+		skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4);
+	} else {
+		hdr->type = cpu_to_le32(QRTR_TYPE_DATA);
+	}
+
+	rc = enqueue_fn(node, skb);
+	if (rc >= 0)
+		rc = len;
+
+out_node:
+	qrtr_node_release(node);
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t size, int flags)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+	const struct qrtr_hdr *phdr;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int copied, rc;
+
+	lock_sock(sk);
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		release_sock(sk);
+		return -EADDRNOTAVAIL;
+	}
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &rc);
+	if (!skb) {
+		release_sock(sk);
+		return rc;
+	}
+
+	phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+	copied = le32_to_cpu(phdr->size);
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied);
+	if (rc < 0)
+		goto out;
+	rc = copied;
+
+	if (addr) {
+		addr->sq_family = AF_QIPCRTR;
+		addr->sq_node = le32_to_cpu(phdr->src_node_id);
+		addr->sq_port = le32_to_cpu(phdr->src_port_id);
+		msg->msg_namelen = sizeof(*addr);
+	}
+
+out:
+	skb_free_datagram(sk, skb);
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_connect(struct socket *sock, struct sockaddr *saddr,
+			int len, int flags)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int rc;
+
+	if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	sk->sk_state = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	rc = qrtr_autobind(sock);
+	if (rc) {
+		release_sock(sk);
+		return rc;
+	}
+
+	ipc->peer = *addr;
+	sock->state = SS_CONNECTED;
+	sk->sk_state = TCP_ESTABLISHED;
+
+	release_sock(sk);
+
+	return 0;
+}
+
+static int qrtr_getname(struct socket *sock, struct sockaddr *saddr,
+			int *len, int peer)
+{
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sockaddr_qrtr qaddr;
+	struct sock *sk = sock->sk;
+
+	lock_sock(sk);
+	if (peer) {
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
+
+		qaddr = ipc->peer;
+	} else {
+		qaddr = ipc->us;
+	}
+	release_sock(sk);
+
+	*len = sizeof(qaddr);
+	qaddr.sq_family = AF_QIPCRTR;
+
+	memcpy(saddr, &qaddr, sizeof(qaddr));
+
+	return 0;
+}
+
+static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	struct sockaddr_qrtr *sq;
+	struct sk_buff *skb;
+	struct ifreq ifr;
+	long len = 0;
+	int rc = 0;
+
+	lock_sock(sk);
+
+	switch (cmd) {
+	case TIOCOUTQ:
+		len = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (len < 0)
+			len = 0;
+		rc = put_user(len, (int __user *)argp);
+		break;
+	case TIOCINQ:
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			len = skb->len - QRTR_HDR_SIZE;
+		rc = put_user(len, (int __user *)argp);
+		break;
+	case SIOCGIFADDR:
+		if (copy_from_user(&ifr, argp, sizeof(ifr))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		sq = (struct sockaddr_qrtr *)&ifr.ifr_addr;
+		*sq = ipc->us;
+		if (copy_to_user(argp, &ifr, sizeof(ifr))) {
+			rc = -EFAULT;
+			break;
+		}
+		break;
+	case SIOCGSTAMP:
+		rc = sock_get_timestamp(sk, argp);
+		break;
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct qrtr_sock *ipc;
+
+	if (!sk)
+		return 0;
+
+	lock_sock(sk);
+
+	ipc = qrtr_sk(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+
+	sock_set_flag(sk, SOCK_DEAD);
+	sock->sk = NULL;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		qrtr_port_remove(ipc);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static const struct proto_ops qrtr_proto_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_QIPCRTR,
+	.bind		= qrtr_bind,
+	.connect	= qrtr_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.listen		= sock_no_listen,
+	.sendmsg	= qrtr_sendmsg,
+	.recvmsg	= qrtr_recvmsg,
+	.getname	= qrtr_getname,
+	.ioctl		= qrtr_ioctl,
+	.poll		= datagram_poll,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.release	= qrtr_release,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+static struct proto qrtr_proto = {
+	.name		= "QIPCRTR",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct qrtr_sock),
+};
+
+static int qrtr_create(struct net *net, struct socket *sock,
+		       int protocol, int kern)
+{
+	struct qrtr_sock *ipc;
+	struct sock *sk;
+
+	if (sock->type != SOCK_DGRAM)
+		return -EPROTOTYPE;
+
+	sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	sock_init_data(sock, sk);
+	sock->ops = &qrtr_proto_ops;
+
+	ipc = qrtr_sk(sk);
+	ipc->us.sq_family = AF_QIPCRTR;
+	ipc->us.sq_node = qrtr_local_nid;
+	ipc->us.sq_port = 0;
+
+	return 0;
+}
+
+static const struct nla_policy qrtr_policy[IFA_MAX + 1] = {
+	[IFA_LOCAL] = { .type = NLA_U32 },
+};
+
+static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[IFA_MAX + 1];
+	struct ifaddrmsg *ifm;
+	int rc;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy);
+	if (rc < 0)
+		return rc;
+
+	ifm = nlmsg_data(nlh);
+	if (!tb[IFA_LOCAL])
+		return -EINVAL;
+
+	qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]);
+	return 0;
+}
+
+static const struct net_proto_family qrtr_family = {
+	.owner	= THIS_MODULE,
+	.family	= AF_QIPCRTR,
+	.create	= qrtr_create,
+};
+
+static int __init qrtr_proto_init(void)
+{
+	int rc;
+
+	rc = proto_register(&qrtr_proto, 1);
+	if (rc)
+		return rc;
+
+	rc = sock_register(&qrtr_family);
+	if (rc) {
+		proto_unregister(&qrtr_proto);
+		return rc;
+	}
+
+	rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL);
+
+	return 0;
+}
+module_init(qrtr_proto_init);
+
+static void __exit qrtr_proto_fini(void)
+{
+	rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR);
+	sock_unregister(qrtr_family.family);
+	proto_unregister(&qrtr_proto);
+}
+module_exit(qrtr_proto_fini);
+
+MODULE_DESCRIPTION("Qualcomm IPC-router driver");
+MODULE_LICENSE("GPL v2");
diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h
new file mode 100644
index 000000000000..2b848718f8fe
--- /dev/null
+++ b/net/qrtr/qrtr.h
@@ -0,0 +1,31 @@
+#ifndef __QRTR_H_
+#define __QRTR_H_
+
+#include <linux/types.h>
+
+struct sk_buff;
+
+/* endpoint node id auto assignment */
+#define QRTR_EP_NID_AUTO (-1)
+
+/**
+ * struct qrtr_endpoint - endpoint handle
+ * @xmit: Callback for outgoing packets
+ *
+ * The socket buffer passed to the xmit function becomes owned by the endpoint
+ * driver.  As such, when the driver is done with the buffer, it should
+ * call kfree_skb() on failure, or consume_skb() on success.
+ */
+struct qrtr_endpoint {
+	int (*xmit)(struct qrtr_endpoint *ep, struct sk_buff *skb);
+	/* private: not for endpoint use */
+	struct qrtr_node *node;
+};
+
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid);
+
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep);
+
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len);
+
+#endif
diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c
new file mode 100644
index 000000000000..84ebce73aa23
--- /dev/null
+++ b/net/qrtr/smd.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/soc/qcom/smd.h>
+
+#include "qrtr.h"
+
+struct qrtr_smd_dev {
+	struct qrtr_endpoint ep;
+	struct qcom_smd_channel *channel;
+};
+
+/* from smd to qrtr */
+static int qcom_smd_qrtr_callback(struct qcom_smd_device *sdev,
+				  const void *data, size_t len)
+{
+	struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev);
+	int rc;
+
+	if (!qdev)
+		return -EAGAIN;
+
+	rc = qrtr_endpoint_post(&qdev->ep, data, len);
+	if (rc == -EINVAL) {
+		dev_err(&sdev->dev, "invalid ipcrouter packet\n");
+		/* return 0 to let smd drop the packet */
+		rc = 0;
+	}
+
+	return rc;
+}
+
+/* from qrtr to smd */
+static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+	struct qrtr_smd_dev *qdev = container_of(ep, struct qrtr_smd_dev, ep);
+	int rc;
+
+	rc = skb_linearize(skb);
+	if (rc)
+		goto out;
+
+	rc = qcom_smd_send(qdev->channel, skb->data, skb->len);
+
+out:
+	if (rc)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
+	return rc;
+}
+
+static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev)
+{
+	struct qrtr_smd_dev *qdev;
+	int rc;
+
+	qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL);
+	if (!qdev)
+		return -ENOMEM;
+
+	qdev->channel = sdev->channel;
+	qdev->ep.xmit = qcom_smd_qrtr_send;
+
+	rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO);
+	if (rc)
+		return rc;
+
+	dev_set_drvdata(&sdev->dev, qdev);
+
+	dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n");
+
+	return 0;
+}
+
+static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev)
+{
+	struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev);
+
+	qrtr_endpoint_unregister(&qdev->ep);
+
+	dev_set_drvdata(&sdev->dev, NULL);
+}
+
+static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = {
+	{ "IPCRTR" },
+	{}
+};
+
+static struct qcom_smd_driver qcom_smd_qrtr_driver = {
+	.probe = qcom_smd_qrtr_probe,
+	.remove = qcom_smd_qrtr_remove,
+	.callback = qcom_smd_qrtr_callback,
+	.smd_match_table = qcom_smd_qrtr_smd_match,
+	.driver = {
+		.name = "qcom_smd_qrtr",
+		.owner = THIS_MODULE,
+	},
+};
+
+module_qcom_smd_driver(qcom_smd_qrtr_driver);
+
+MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From bb208f144cf3f59d8f89a09a80efd04389718907 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 21 Mar 2016 20:18:21 +0100
Subject: can: fix handling of unmodifiable configuration options

As described in 'can: m_can: tag current CAN FD controllers as non-ISO'
(6cfda7fbebe) it is possible to define fixed configuration options by
setting the according bit in 'ctrlmode' and clear it in 'ctrlmode_supported'.
This leads to the incovenience that the fixed configuration bits can not be
passed by netlink even when they have the correct values (e.g. non-ISO, FD).

This patch fixes that issue and not only allows fixed set bit values to be set
again but now requires(!) to provide these fixed values at configuration time.
A valid CAN FD configuration consists of a nominal/arbitration bittiming, a
data bittiming and a control mode with CAN_CTRLMODE_FD set - which is now
enforced by a new can_validate() function. This fix additionally removed the
inconsistency that was prohibiting the support of 'CANFD-only' controller
drivers, like the RCar CAN FD.

For this reason a new helper can_set_static_ctrlmode() has been introduced to
provide a proper interface to handle static enabled CAN controller options.

Reported-by: Ramesh Shanmugasundaram <ramesh.shanmugasundaram@bp.renesas.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Reviewed-by: Ramesh Shanmugasundaram  <ramesh.shanmugasundaram@bp.renesas.com>
Cc: <stable@vger.kernel.org> # >= 3.18
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c         | 56 +++++++++++++++++++++++++++++++++++++++----
 drivers/net/can/m_can/m_can.c |  2 +-
 include/linux/can/dev.h       | 22 +++++++++++++++--
 3 files changed, 73 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 141c2a42d7ed..910c12e2638e 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -696,11 +696,17 @@ int can_change_mtu(struct net_device *dev, int new_mtu)
 	/* allow change of MTU according to the CANFD ability of the device */
 	switch (new_mtu) {
 	case CAN_MTU:
+		/* 'CANFD-only' controllers can not switch to CAN_MTU */
+		if (priv->ctrlmode_static & CAN_CTRLMODE_FD)
+			return -EINVAL;
+
 		priv->ctrlmode &= ~CAN_CTRLMODE_FD;
 		break;
 
 	case CANFD_MTU:
-		if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD))
+		/* check for potential CANFD ability */
+		if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD) &&
+		    !(priv->ctrlmode_static & CAN_CTRLMODE_FD))
 			return -EINVAL;
 
 		priv->ctrlmode |= CAN_CTRLMODE_FD;
@@ -782,6 +788,35 @@ static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
 				= { .len = sizeof(struct can_bittiming_const) },
 };
 
+static int can_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	bool is_can_fd = false;
+
+	/* Make sure that valid CAN FD configurations always consist of
+	 * - nominal/arbitration bittiming
+	 * - data bittiming
+	 * - control mode with CAN_CTRLMODE_FD set
+	 */
+
+	if (data[IFLA_CAN_CTRLMODE]) {
+		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+
+		is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD;
+	}
+
+	if (is_can_fd) {
+		if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	if (data[IFLA_CAN_DATA_BITTIMING]) {
+		if (!is_can_fd || !data[IFLA_CAN_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int can_changelink(struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[])
 {
@@ -813,19 +848,31 @@ static int can_changelink(struct net_device *dev,
 
 	if (data[IFLA_CAN_CTRLMODE]) {
 		struct can_ctrlmode *cm;
+		u32 ctrlstatic;
+		u32 maskedflags;
 
 		/* Do not allow changing controller mode while running */
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
 		cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+		ctrlstatic = priv->ctrlmode_static;
+		maskedflags = cm->flags & cm->mask;
+
+		/* check whether provided bits are allowed to be passed */
+		if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic))
+			return -EOPNOTSUPP;
+
+		/* do not check for static fd-non-iso if 'fd' is disabled */
+		if (!(maskedflags & CAN_CTRLMODE_FD))
+			ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO;
 
-		/* check whether changed bits are allowed to be modified */
-		if (cm->mask & ~priv->ctrlmode_supported)
+		/* make sure static options are provided by configuration */
+		if ((maskedflags & ctrlstatic) != ctrlstatic)
 			return -EOPNOTSUPP;
 
 		/* clear bits to be modified and copy the flag values */
 		priv->ctrlmode &= ~cm->mask;
-		priv->ctrlmode |= (cm->flags & cm->mask);
+		priv->ctrlmode |= maskedflags;
 
 		/* CAN_CTRLMODE_FD can only be set when driver supports FD */
 		if (priv->ctrlmode & CAN_CTRLMODE_FD)
@@ -966,6 +1013,7 @@ static struct rtnl_link_ops can_link_ops __read_mostly = {
 	.maxtype	= IFLA_CAN_MAX,
 	.policy		= can_policy,
 	.setup		= can_setup,
+	.validate	= can_validate,
 	.newlink	= can_newlink,
 	.changelink	= can_changelink,
 	.get_size	= can_get_size,
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 39cf911f7a1e..195f15edb32e 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -955,7 +955,7 @@ static struct net_device *alloc_m_can_dev(void)
 	priv->can.do_get_berr_counter = m_can_get_berr_counter;
 
 	/* CAN_CTRLMODE_FD_NON_ISO is fixed with M_CAN IP v3.0.1 */
-	priv->can.ctrlmode = CAN_CTRLMODE_FD_NON_ISO;
+	can_set_static_ctrlmode(dev, CAN_CTRLMODE_FD_NON_ISO);
 
 	/* CAN_CTRLMODE_FD_NON_ISO can not be changed with M_CAN IP v3.0.1 */
 	priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK |
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 735f9f8c4e43..5261751f6bd4 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -40,8 +40,11 @@ struct can_priv {
 	struct can_clock clock;
 
 	enum can_state state;
-	u32 ctrlmode;
-	u32 ctrlmode_supported;
+
+	/* CAN controller features - see include/uapi/linux/can/netlink.h */
+	u32 ctrlmode;		/* current options setting */
+	u32 ctrlmode_supported;	/* options that can be modified by netlink */
+	u32 ctrlmode_static;	/* static enabled options for driver/hardware */
 
 	int restart_ms;
 	struct timer_list restart_timer;
@@ -108,6 +111,21 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb)
 	return skb->len == CANFD_MTU;
 }
 
+/* helper to define static CAN controller features at device creation time */
+static inline void can_set_static_ctrlmode(struct net_device *dev,
+					   u32 static_mode)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	/* alloc_candev() succeeded => netdev_priv() is valid at this point */
+	priv->ctrlmode = static_mode;
+	priv->ctrlmode_static = static_mode;
+
+	/* override MTU which was set by default in can_setup()? */
+	if (static_mode & CAN_CTRLMODE_FD)
+		dev->mtu = CANFD_MTU;
+}
+
 /* get data length from can_dlc with sanitized can_dlc */
 u8 can_dlc2len(u8 can_dlc);
 
-- 
cgit v1.2.3


From 9d9a77cee1ab53dc6419b1ab9da88c4e9342d26a Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Tue, 10 May 2016 00:19:41 +0200
Subject: net: phy: add phy_ethtool_{get|set}_link_ksettings

Ethtool callbacks {get|set}_link_ksettings are often the same, so
we add two generics functions phy_ethtool_{get|set}_link_ksettings
to avoid writing severals times the same function.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Acked-By: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 24 ++++++++++++++++++++++++
 include/linux/phy.h   |  4 ++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 6f221c8c2a7f..603e8db50162 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1347,3 +1347,27 @@ void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)
 		phydev->drv->get_wol(phydev, wol);
 }
 EXPORT_SYMBOL(phy_ethtool_get_wol);
+
+int phy_ethtool_get_link_ksettings(struct net_device *ndev,
+				   struct ethtool_link_ksettings *cmd)
+{
+	struct phy_device *phydev = ndev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_ksettings_get(phydev, cmd);
+}
+EXPORT_SYMBOL(phy_ethtool_get_link_ksettings);
+
+int phy_ethtool_set_link_ksettings(struct net_device *ndev,
+				   const struct ethtool_link_ksettings *cmd)
+{
+	struct phy_device *phydev = ndev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_ksettings_set(phydev, cmd);
+}
+EXPORT_SYMBOL(phy_ethtool_set_link_ksettings);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index be3f83bbdc0b..2d24b283aa2d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -829,6 +829,10 @@ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data);
 int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol);
 void phy_ethtool_get_wol(struct phy_device *phydev,
 			 struct ethtool_wolinfo *wol);
+int phy_ethtool_get_link_ksettings(struct net_device *ndev,
+				   struct ethtool_link_ksettings *cmd);
+int phy_ethtool_set_link_ksettings(struct net_device *ndev,
+				   const struct ethtool_link_ksettings *cmd);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-- 
cgit v1.2.3


From 1dee3f59a8d5e711797dc82628aaf94a64e99922 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 9 May 2016 11:40:20 +0200
Subject: block/drbd: align properly u64 in nl messages

The attribute 0 is never used in drbd, so let's use it as pad attribute
in netlink messages. This minimizes the patch.

Note that this patch is only compile-tested.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/drbd/drbd_nl.c      | 28 ++++++++++++++++------------
 include/linux/genl_magic_struct.h |  7 ++++++-
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 1fd1dccebb6b..0bac9c8246bc 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -3633,14 +3633,15 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 		goto nla_put_failure;
 	if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
 	    nla_put_u32(skb, T_current_state, device->state.i) ||
-	    nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
-	    nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
-	    nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
-	    nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
-	    nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
-	    nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
-	    nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
-	    nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_ed_uuid, device->ed_uuid) ||
+	    nla_put_u64_0pad(skb, T_capacity,
+			     drbd_get_capacity(device->this_bdev)) ||
+	    nla_put_u64_0pad(skb, T_send_cnt, device->send_cnt) ||
+	    nla_put_u64_0pad(skb, T_recv_cnt, device->recv_cnt) ||
+	    nla_put_u64_0pad(skb, T_read_cnt, device->read_cnt) ||
+	    nla_put_u64_0pad(skb, T_writ_cnt, device->writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
 	    nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
 	    nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
 	    nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
@@ -3657,13 +3658,16 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 			goto nla_put_failure;
 
 		if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
-		    nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
-		    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+		    nla_put_u64_0pad(skb, T_bits_total, drbd_bm_bits(device)) ||
+		    nla_put_u64_0pad(skb, T_bits_oos,
+				     drbd_bm_total_weight(device)))
 			goto nla_put_failure;
 		if (C_SYNC_SOURCE <= device->state.conn &&
 		    C_PAUSED_SYNC_T >= device->state.conn) {
-			if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
-			    nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+			if (nla_put_u64_0pad(skb, T_bits_rs_total,
+					     device->rs_total) ||
+			    nla_put_u64_0pad(skb, T_bits_rs_failed,
+					     device->rs_failed))
 				goto nla_put_failure;
 		}
 	}
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
index eecd19b37001..6270a56e5edc 100644
--- a/include/linux/genl_magic_struct.h
+++ b/include/linux/genl_magic_struct.h
@@ -62,6 +62,11 @@ extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
 
 /* MAGIC helpers							{{{2 */
 
+static inline int nla_put_u64_0pad(struct sk_buff *skb, int attrtype, u64 value)
+{
+	return nla_put_64bit(skb, attrtype, sizeof(u64), &value, 0);
+}
+
 /* possible field types */
 #define __flg_field(attr_nr, attr_flag, name) \
 	__field(attr_nr, attr_flag, name, NLA_U8, char, \
@@ -80,7 +85,7 @@ extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
 			nla_get_u32, nla_put_u32, true)
 #define __u64_field(attr_nr, attr_flag, name)	\
 	__field(attr_nr, attr_flag, name, NLA_U64, __u64, \
-			nla_get_u64, nla_put_u64, false)
+			nla_get_u64, nla_put_u64_0pad, false)
 #define __str_field(attr_nr, attr_flag, name, maxlen) \
 	__array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \
 			nla_strlcpy, nla_put, false)
-- 
cgit v1.2.3


From 74b20582ac389ee9f18a6fcc0eef244658ce8de0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 10 May 2016 11:19:50 -0700
Subject: net: l3mdev: Add hook in ip and ipv6

Currently the VRF driver uses the rx_handler to switch the skb device
to the VRF device. Switching the dev prior to the ip / ipv6 layer
means the VRF driver has to duplicate IP/IPv6 processing which adds
overhead and makes features such as retaining the ingress device index
more complicated than necessary.

This patch moves the hook to the L3 layer just after the first NF_HOOK
for PRE_ROUTING. This location makes exposing the original ingress device
trivial (next patch) and allows adding other NF_HOOKs to the VRF driver
in the future.

dev_queue_xmit_nit is exported so that the VRF driver can cycle the skb
with the switched device through the packet taps to maintain current
behavior (tcpdump can be used on either the vrf device or the enslaved
devices).

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c         | 189 ++++++++++++++++++++++------------------------
 include/linux/ipv6.h      |  17 ++++-
 include/linux/netdevice.h |   2 +
 include/net/l3mdev.h      |  42 +++++++++++
 include/net/tcp.h         |   4 +-
 net/core/dev.c            |   3 +-
 net/ipv4/ip_input.c       |   7 ++
 net/ipv6/ip6_input.c      |   7 ++
 8 files changed, 170 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index c8db55aa8280..0ea29345eb2e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -42,9 +42,6 @@
 #define DRV_NAME	"vrf"
 #define DRV_VERSION	"1.0"
 
-#define vrf_master_get_rcu(dev) \
-	((struct net_device *)rcu_dereference(dev->rx_handler_data))
-
 struct net_vrf {
 	struct rtable           *rth;
 	struct rt6_info		*rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-/* neighbor handling is done with actual device; do not want
- * to flip skb->dev for those ndisc packets. This really fails
- * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
- * a start.
- */
-#if IS_ENABLED(CONFIG_IPV6)
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	const struct ipv6hdr *ipv6h;
-	struct ipv6hdr _ipv6h;
-	bool rc = true;
-
-	ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
-	if (!ipv6h)
-		goto out;
-
-	if (ipv6h->nexthdr == NEXTHDR_ICMP) {
-		const struct icmp6hdr *icmph;
-		struct icmp6hdr _icmph;
-
-		icmph = skb_header_pointer(skb, sizeof(_ipv6h),
-					   sizeof(_icmph), &_icmph);
-		if (!icmph)
-			goto out;
-
-		switch (icmph->icmp6_type) {
-		case NDISC_ROUTER_SOLICITATION:
-		case NDISC_ROUTER_ADVERTISEMENT:
-		case NDISC_NEIGHBOUR_SOLICITATION:
-		case NDISC_NEIGHBOUR_ADVERTISEMENT:
-		case NDISC_REDIRECT:
-			rc = false;
-			break;
-		}
-	}
-
-out:
-	return rc;
-}
-#else
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	return false;
-}
-#endif
-
-static bool is_ip_rx_frame(struct sk_buff *skb)
-{
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		return true;
-	case htons(ETH_P_IPV6):
-		return check_ipv6_frame(skb);
-	}
-	return false;
-}
-
 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 {
 	vrf_dev->stats.tx_errors++;
 	kfree_skb(skb);
 }
 
-/* note: already called with rcu_read_lock */
-static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
-{
-	struct sk_buff *skb = *pskb;
-
-	if (is_ip_rx_frame(skb)) {
-		struct net_device *dev = vrf_master_get_rcu(skb->dev);
-		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
-
-		u64_stats_update_begin(&dstats->syncp);
-		dstats->rx_pkts++;
-		dstats->rx_bytes += skb->len;
-		u64_stats_update_end(&dstats->syncp);
-
-		skb->dev = dev;
-
-		return RX_HANDLER_ANOTHER;
-	}
-	return RX_HANDLER_PASS;
-}
-
 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 						 struct rtnl_link_stats64 *stats)
 {
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 {
 	int ret;
 
-	/* register the packet handler for slave ports */
-	ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
-	if (ret) {
-		netdev_err(port_dev,
-			   "Device %s failed to register rx_handler\n",
-			   port_dev->name);
-		goto out_fail;
-	}
-
 	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
 	if (ret < 0)
-		goto out_unregister;
+		return ret;
 
 	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
 	cycle_netdev(port_dev);
 
 	return 0;
-
-out_unregister:
-	netdev_rx_handler_unregister(port_dev);
-out_fail:
-	return ret;
 }
 
 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
 	netdev_upper_dev_unlink(port_dev, dev);
 	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
 
-	netdev_rx_handler_unregister(port_dev);
-
 	cycle_netdev(port_dev);
 
 	return 0;
@@ -669,6 +572,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 	return rc;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+static bool ipv6_ndisc_frame(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	bool rc = false;
+
+	if (iph->nexthdr == NEXTHDR_ICMP) {
+		const struct icmp6hdr *icmph;
+		struct icmp6hdr _icmph;
+
+		icmph = skb_header_pointer(skb, sizeof(*iph),
+					   sizeof(_icmph), &_icmph);
+		if (!icmph)
+			goto out;
+
+		switch (icmph->icmp6_type) {
+		case NDISC_ROUTER_SOLICITATION:
+		case NDISC_ROUTER_ADVERTISEMENT:
+		case NDISC_NEIGHBOUR_SOLICITATION:
+		case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		case NDISC_REDIRECT:
+			rc = true;
+			break;
+		}
+	}
+
+out:
+	return rc;
+}
+
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	/* if packet is NDISC keep the ingress interface */
+	if (!ipv6_ndisc_frame(skb)) {
+		skb->dev = vrf_dev;
+		skb->skb_iif = vrf_dev->ifindex;
+
+		skb_push(skb, skb->mac_len);
+		dev_queue_xmit_nit(skb, vrf_dev);
+		skb_pull(skb, skb->mac_len);
+
+		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
+	}
+
+	return skb;
+}
+
+#else
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	return skb;
+}
+#endif
+
+static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb)
+{
+	skb->dev = vrf_dev;
+	skb->skb_iif = vrf_dev->ifindex;
+
+	skb_push(skb, skb->mac_len);
+	dev_queue_xmit_nit(skb, vrf_dev);
+	skb_pull(skb, skb->mac_len);
+
+	return skb;
+}
+
+/* called with rcu lock held */
+static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb,
+				  u16 proto)
+{
+	switch (proto) {
+	case AF_INET:
+		return vrf_ip_rcv(vrf_dev, skb);
+	case AF_INET6:
+		return vrf_ip6_rcv(vrf_dev, skb);
+	}
+
+	return skb;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 					 const struct flowi6 *fl6)
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
 	.l3mdev_fib_table	= vrf_fib_table,
 	.l3mdev_get_rtable	= vrf_get_rtable,
 	.l3mdev_get_saddr	= vrf_get_saddr,
+	.l3mdev_l3_rcv		= vrf_l3_rcv,
 #if IS_ENABLED(CONFIG_IPV6)
 	.l3mdev_get_rt6_dst	= vrf_get_rt6_dst,
 #endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 58d6e158755f..5c91b0b055d4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
 #define IP6SKB_ROUTERALERT	8
 #define IP6SKB_FRAGMENTED      16
 #define IP6SKB_HOPBYHOP        32
+#define IP6SKB_L3SLAVE         64
 };
 
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return flags & IP6SKB_L3SLAVE;
+}
+#else
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return false;
+}
+#endif
+
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
 #define IP6CBMTU(skb)	((struct ip6_mtuinfo *)((skb)->cb))
 
 static inline int inet6_iif(const struct sk_buff *skb)
 {
-	return IP6CB(skb)->iif;
+	bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+
+	return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
 struct tcp6_request_sock {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 63580e6d0df4..c2f5112f08f7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);
 
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+
 extern int		netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 78872bd1dc2c..374388dc01c8 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -25,6 +25,8 @@
 
 struct l3mdev_ops {
 	u32		(*l3mdev_fib_table)(const struct net_device *dev);
+	struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
+					  struct sk_buff *skb, u16 proto);
 
 	/* IPv4 ops */
 	struct rtable *	(*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
 
 struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
 
+static inline
+struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
+{
+	struct net_device *master = NULL;
+
+	if (netif_is_l3_slave(skb->dev))
+		master = netdev_master_upper_dev_get_rcu(skb->dev);
+	else if (netif_is_l3_master(skb->dev))
+		master = skb->dev;
+
+	if (master && master->l3mdev_ops->l3mdev_l3_rcv)
+		skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
+
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET6);
+}
+
 #else
 
 static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
 {
 	return NULL;
 }
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
 #endif
 
 #endif /* _NET_L3MDEV_H_ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c9ab561387c4..0bcc70f4e1fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
  */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
 {
-	return TCP_SKB_CB(skb)->header.h6.iif;
+	bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
+
+	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
 }
 #endif
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c7490339315c..12436d1312ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  *	taps currently in use.
  */
 
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 	struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 
 /**
  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 751c0658e194..37375eedeef9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ed56012005d..f185cbcda114 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip6_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
 		const struct inet6_protocol *ipprot;
 
-- 
cgit v1.2.3


From 7219ab34f184b5d864be38f5ada7cdff1ab5b18a Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 11 May 2016 00:29:14 +0300
Subject: net/mlx5e: CQE compression

CQE compression feature is meant to save PCIe bandwidth by
compressing few CQEs into smaller amount of bytes on PCIe.
CQE compression can be selectively enabled per CQ.  By default
is disabled for now and will be enabled later on.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  10 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_clock.c |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 151 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   8 ++
 include/linux/mlx5/device.h                        |  34 +++++
 6 files changed, 211 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index bfa5daaaf5aa..19f0d8db27ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -157,6 +157,8 @@ struct mlx5e_params {
 	u8  log_rq_size;
 	u16 num_channels;
 	u8  num_tc;
+	bool rx_cqe_compress_admin;
+	bool rx_cqe_compress;
 	u16 rx_cq_moderation_usec;
 	u16 rx_cq_moderation_pkts;
 	u16 tx_cq_moderation_usec;
@@ -202,6 +204,13 @@ struct mlx5e_cq {
 	struct mlx5e_channel      *channel;
 	struct mlx5e_priv         *priv;
 
+	/* cqe decompression */
+	struct mlx5_cqe64          title;
+	struct mlx5_mini_cqe8      mini_arr[MLX5_MINI_CQE_ARRAY_SIZE];
+	u8                         mini_arr_idx;
+	u16                        decmprs_left;
+	u16                        decmprs_wqe_counter;
+
 	/* control */
 	struct mlx5_wq_ctrl        wq_ctrl;
 } ____cacheline_aligned_in_smp;
@@ -616,6 +625,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv);
 void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv);
 int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr);
 int mlx5e_hwstamp_get(struct net_device *dev, struct ifreq *ifr);
+void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val);
 
 int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
 			  u16 vid);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
index 2018eebe1531..847a8f3ac2b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
@@ -93,6 +93,8 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
 	/* RX HW timestamp */
 	switch (config.rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
+		/* Reset CQE compression to Admin default */
+		mlx5e_modify_rx_cqe_compression(priv, priv->params.rx_cqe_compress_admin);
 		break;
 	case HWTSTAMP_FILTER_ALL:
 	case HWTSTAMP_FILTER_SOME:
@@ -108,6 +110,8 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		/* Disable CQE compression */
+		mlx5e_modify_rx_cqe_compression(priv, false);
 		config.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1c70e518b5c5..0ea4c03a7946 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -114,6 +114,8 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
 		s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
+		s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
+		s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -1204,6 +1206,10 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 	}
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
+	if (priv->params.rx_cqe_compress) {
+		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_CSUM);
+		MLX5_SET(cqc, cqc, cqe_comp_en, 1);
+	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 23adfe2fcba9..c8b8d456268f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -42,6 +42,143 @@ static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp)
 	return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL;
 }
 
+static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc,
+				       void *data)
+{
+	u32 ci = cqcc & cq->wq.sz_m1;
+
+	memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64));
+}
+
+static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq,
+					 struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, &cq->title);
+	cq->decmprs_left        = be32_to_cpu(cq->title.byte_cnt);
+	cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter);
+	rq->stats.cqe_compress_blks++;
+}
+
+static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, cq->mini_arr);
+	cq->mini_arr_idx = 0;
+}
+
+static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n)
+{
+	u8 op_own = (cqcc >> cq->wq.log_sz) & 1;
+	u32 wq_sz = 1 << cq->wq.log_sz;
+	u32 ci = cqcc & cq->wq.sz_m1;
+	u32 ci_top = min_t(u32, wq_sz, ci + n);
+
+	for (; ci < ci_top; ci++, n--) {
+		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+		cqe->op_own = op_own;
+	}
+
+	if (unlikely(ci == wq_sz)) {
+		op_own = !op_own;
+		for (ci = 0; ci < n; ci++) {
+			struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+			cqe->op_own = op_own;
+		}
+	}
+}
+
+static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
+					struct mlx5e_cq *cq, u32 cqcc)
+{
+	u16 wqe_cnt_step;
+
+	cq->title.byte_cnt     = cq->mini_arr[cq->mini_arr_idx].byte_cnt;
+	cq->title.check_sum    = cq->mini_arr[cq->mini_arr_idx].checksum;
+	cq->title.op_own      &= 0xf0;
+	cq->title.op_own      |= 0x01 & (cqcc >> cq->wq.log_sz);
+	cq->title.wqe_counter  = cpu_to_be16(cq->decmprs_wqe_counter);
+
+	wqe_cnt_step =
+		rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
+		mpwrq_get_cqe_consumed_strides(&cq->title) : 1;
+	cq->decmprs_wqe_counter =
+		(cq->decmprs_wqe_counter + wqe_cnt_step) & rq->wq.sz_m1;
+}
+
+static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
+						struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_decompress_cqe(rq, cq, cqcc);
+	cq->title.rss_hash_type   = 0;
+	cq->title.rss_hash_result = 0;
+}
+
+static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
+					     struct mlx5e_cq *cq,
+					     int update_owner_only,
+					     int budget_rem)
+{
+	u32 cqcc = cq->wq.cc + update_owner_only;
+	u32 cqe_count;
+	u32 i;
+
+	cqe_count = min_t(u32, cq->decmprs_left, budget_rem);
+
+	for (i = update_owner_only; i < cqe_count;
+	     i++, cq->mini_arr_idx++, cqcc++) {
+		if (unlikely(cq->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE))
+			mlx5e_read_mini_arr_slot(cq, cqcc);
+
+		mlx5e_decompress_cqe_no_hash(rq, cq, cqcc);
+		rq->handle_rx_cqe(rq, &cq->title);
+	}
+	mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
+	cq->wq.cc = cqcc;
+	cq->decmprs_left -= cqe_count;
+	rq->stats.cqe_compress_pkts += cqe_count;
+
+	return cqe_count;
+}
+
+static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
+					      struct mlx5e_cq *cq,
+					      int budget_rem)
+{
+	mlx5e_read_title_slot(rq, cq, cq->wq.cc);
+	mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1);
+	mlx5e_decompress_cqe(rq, cq, cq->wq.cc);
+	rq->handle_rx_cqe(rq, &cq->title);
+	cq->mini_arr_idx++;
+
+	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
+}
+
+void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val)
+{
+	bool was_opened;
+
+	if (!MLX5_CAP_GEN(priv->mdev, cqe_compression))
+		return;
+
+	mutex_lock(&priv->state_lock);
+
+	if (priv->params.rx_cqe_compress == val)
+		goto unlock;
+
+	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	if (was_opened)
+		mlx5e_close_locked(priv->netdev);
+
+	priv->params.rx_cqe_compress = val;
+
+	if (was_opened)
+		mlx5e_open_locked(priv->netdev);
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+}
+
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
 	struct sk_buff *skb;
@@ -738,14 +875,24 @@ mpwrq_cqe_out:
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
-	int work_done;
+	int work_done = 0;
 
-	for (work_done = 0; work_done < budget; work_done++) {
+	if (cq->decmprs_left)
+		work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
+
+	for (; work_done < budget; work_done++) {
 		struct mlx5_cqe64 *cqe = mlx5e_get_cqe(cq);
 
 		if (!cqe)
 			break;
 
+		if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) {
+			work_done +=
+				mlx5e_decompress_cqes_start(rq, cq,
+							    budget - work_done);
+			continue;
+		}
+
 		mlx5_cqwq_pop(&cq->wq);
 
 		rq->handle_rx_cqe(rq, cqe);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 115752b53d85..83bc32b25849 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -72,6 +72,8 @@ struct mlx5e_sw_stats {
 	u64 rx_mpwqe_filler;
 	u64 rx_mpwqe_frag;
 	u64 rx_buff_alloc_err;
+	u64 rx_cqe_compress_blks;
+	u64 rx_cqe_compress_pkts;
 
 	/* Special handling counters */
 	u64 link_down_events;
@@ -101,6 +103,8 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events) },
 };
 
@@ -283,6 +287,8 @@ struct mlx5e_rq_stats {
 	u64 mpwqe_filler;
 	u64 mpwqe_frag;
 	u64 buff_alloc_err;
+	u64 cqe_compress_blks;
+	u64 cqe_compress_pkts;
 };
 
 static const struct counter_desc rq_stats_desc[] = {
@@ -297,6 +303,8 @@ static const struct counter_desc rq_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
 };
 
 struct mlx5e_sq_stats {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index ee0d5a937f02..035abdf62cfe 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -685,6 +685,40 @@ struct mlx5_cqe64 {
 	u8		op_own;
 };
 
+struct mlx5_mini_cqe8 {
+	union {
+		__be32 rx_hash_result;
+		struct {
+			__be16 checksum;
+			__be16 rsvd;
+		};
+		struct {
+			__be16 wqe_counter;
+			u8  s_wqe_opcode;
+			u8  reserved;
+		} s_wqe_info;
+	};
+	__be32 byte_cnt;
+};
+
+enum {
+	MLX5_NO_INLINE_DATA,
+	MLX5_INLINE_DATA32_SEG,
+	MLX5_INLINE_DATA64_SEG,
+	MLX5_COMPRESSED,
+};
+
+enum {
+	MLX5_CQE_FORMAT_CSUM = 0x1,
+};
+
+#define MLX5_MINI_CQE_ARRAY_SIZE 8
+
+static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->op_own >> 2) & 0x3;
+}
+
 static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 {
 	return (cqe->lro_tcppsh_abort_dupack >> 6) & 1;
-- 
cgit v1.2.3


From 37bff2b9c6addf6216c8d04e95be596678e8deff Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:13 +0300
Subject: qed: Add VF->PF channel infrastructure

Communication between VF and PF is based on a dedicated HW channel;
VF will prepare a messge, and by signaling the HW the PF would get a
notification of that message existance. The PF would then copy the
message, process it and DMA an answer back to the VF as a response.

The messages themselves are TLV-based - allowing easier backward/forward
compatibility.

This patch adds the infrastructure of the channel on the PF side -
starting with the arrival of the notification and ending with DMAing
the response back to the VF.

It also adds a dummy-response as reference, as it only lays the
groundwork of the communication; it doesn't really add support of any
actual messages.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h         |   6 +
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h |  27 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h     |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_hw.c      |  44 ++-
 drivers/net/ethernet/qlogic/qed/qed_main.c    |  10 +-
 drivers/net/ethernet/qlogic/qed/qed_spq.c     |  16 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c   | 385 ++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h   |  53 ++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h      |  52 ++++
 include/linux/qed/common_hsi.h                |   5 +
 10 files changed, 585 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 2e067c7e8f38..01f9b6c880bd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -378,6 +378,12 @@ struct qed_hwfn {
 
 	struct qed_simd_fp_handler	simd_proto_handler[64];
 
+#ifdef CONFIG_QED_SRIOV
+	struct workqueue_struct *iov_wq;
+	struct delayed_work iov_task;
+	unsigned long iov_task_flags;
+#endif
+
 	struct z_stream_s		*stream;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index 6aac3f855aa1..f567371fe304 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -182,11 +182,15 @@ enum qed_dmae_address_type_t {
  * used mostly to write a zeroed buffer to destination address
  * using DMA
  */
-#define QED_DMAE_FLAG_RW_REPL_SRC       0x00000001
-#define QED_DMAE_FLAG_COMPLETION_DST    0x00000008
+#define QED_DMAE_FLAG_RW_REPL_SRC	0x00000001
+#define QED_DMAE_FLAG_VF_SRC		0x00000002
+#define QED_DMAE_FLAG_VF_DST		0x00000004
+#define QED_DMAE_FLAG_COMPLETION_DST	0x00000008
 
 struct qed_dmae_params {
-	u32	flags; /* consists of QED_DMAE_FLAG_* values */
+	u32 flags; /* consists of QED_DMAE_FLAG_* values */
+	u8 src_vfid;
+	u8 dst_vfid;
 };
 
 /**
@@ -208,6 +212,23 @@ qed_dmae_host2grc(struct qed_hwfn *p_hwfn,
 		  u32 size_in_dwords,
 		  u32 flags);
 
+/**
+ * @brief qed_dmae_host2host - copy data from to source address
+ * to a destination adress (for SRIOV) using the given ptt
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param source_addr
+ * @param dest_addr
+ * @param size_in_dwords
+ * @param params
+ */
+int qed_dmae_host2host(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt,
+		       dma_addr_t source_addr,
+		       dma_addr_t dest_addr,
+		       u32 size_in_dwords, struct qed_dmae_params *p_params);
+
 /**
  * @brief qed_chain_alloc - Allocate and initialize a chain
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index c4fae71bed11..6ba197c107ed 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -31,7 +31,7 @@ enum common_event_opcode {
 	COMMON_EVENT_PF_STOP,
 	COMMON_EVENT_RESERVED,
 	COMMON_EVENT_RESERVED2,
-	COMMON_EVENT_RESERVED3,
+	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
 	COMMON_EVENT_RESERVED5,
 	COMMON_EVENT_RESERVED6,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index a8cf96c34bef..a9be5a422d2d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -355,8 +355,8 @@ static void qed_dmae_opcode(struct qed_hwfn *p_hwfn,
 			    const u8 is_dst_type_grc,
 			    struct qed_dmae_params *p_params)
 {
+	u16 opcode_b = 0;
 	u32 opcode = 0;
-	u16 opcodeB = 0;
 
 	/* Whether the source is the PCIe or the GRC.
 	 * 0- The source is the PCIe
@@ -398,14 +398,24 @@ static void qed_dmae_opcode(struct qed_hwfn *p_hwfn,
 	opcode |= (DMAE_CMD_DST_ADDR_RESET_MASK <<
 		   DMAE_CMD_DST_ADDR_RESET_SHIFT);
 
-	opcodeB |= (DMAE_CMD_SRC_VF_ID_MASK <<
-		    DMAE_CMD_SRC_VF_ID_SHIFT);
+	/* SRC/DST VFID: all 1's - pf, otherwise VF id */
+	if (p_params->flags & QED_DMAE_FLAG_VF_SRC) {
+		opcode |= 1 << DMAE_CMD_SRC_VF_ID_VALID_SHIFT;
+		opcode_b |= p_params->src_vfid << DMAE_CMD_SRC_VF_ID_SHIFT;
+	} else {
+		opcode_b |= DMAE_CMD_SRC_VF_ID_MASK <<
+			    DMAE_CMD_SRC_VF_ID_SHIFT;
+	}
 
-	opcodeB |= (DMAE_CMD_DST_VF_ID_MASK <<
-		    DMAE_CMD_DST_VF_ID_SHIFT);
+	if (p_params->flags & QED_DMAE_FLAG_VF_DST) {
+		opcode |= 1 << DMAE_CMD_DST_VF_ID_VALID_SHIFT;
+		opcode_b |= p_params->dst_vfid << DMAE_CMD_DST_VF_ID_SHIFT;
+	} else {
+		opcode_b |= DMAE_CMD_DST_VF_ID_MASK << DMAE_CMD_DST_VF_ID_SHIFT;
+	}
 
 	p_hwfn->dmae_info.p_dmae_cmd->opcode = cpu_to_le32(opcode);
-	p_hwfn->dmae_info.p_dmae_cmd->opcode_b = cpu_to_le16(opcodeB);
+	p_hwfn->dmae_info.p_dmae_cmd->opcode_b = cpu_to_le16(opcode_b);
 }
 
 u32 qed_dmae_idx_to_go_cmd(u8 idx)
@@ -753,6 +763,28 @@ int qed_dmae_host2grc(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+int
+qed_dmae_host2host(struct qed_hwfn *p_hwfn,
+		   struct qed_ptt *p_ptt,
+		   dma_addr_t source_addr,
+		   dma_addr_t dest_addr,
+		   u32 size_in_dwords, struct qed_dmae_params *p_params)
+{
+	int rc;
+
+	mutex_lock(&(p_hwfn->dmae_info.mutex));
+
+	rc = qed_dmae_execute_command(p_hwfn, p_ptt, source_addr,
+				      dest_addr,
+				      QED_DMAE_ADDRESS_HOST_PHYS,
+				      QED_DMAE_ADDRESS_HOST_PHYS,
+				      size_in_dwords, p_params);
+
+	mutex_unlock(&(p_hwfn->dmae_info.mutex));
+
+	return rc;
+}
+
 u16 qed_get_qm_pq(struct qed_hwfn *p_hwfn,
 		  enum protocol_type proto,
 		  union qed_qm_pq_params *p_params)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1b758bdec587..c209ed49deae 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -24,6 +24,7 @@
 #include <linux/qed/qed_if.h>
 
 #include "qed.h"
+#include "qed_sriov.h"
 #include "qed_sp.h"
 #include "qed_dev_api.h"
 #include "qed_mcp.h"
@@ -749,7 +750,10 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	struct qed_mcp_drv_version drv_version;
 	const u8 *data = NULL;
 	struct qed_hwfn *hwfn;
-	int rc;
+	int rc = -EINVAL;
+
+	if (qed_iov_wq_start(cdev))
+		goto err;
 
 	rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
 			      &cdev->pdev->dev);
@@ -826,6 +830,8 @@ err1:
 err:
 	release_firmware(cdev->firmware);
 
+	qed_iov_wq_stop(cdev, false);
+
 	return rc;
 }
 
@@ -842,6 +848,8 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	qed_disable_msix(cdev);
 	qed_nic_reset(cdev);
 
+	qed_iov_wq_stop(cdev, true);
+
 	release_firmware(cdev->firmware);
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index 89469d5aae25..0e439e46fbe9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -27,6 +27,7 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 /***************************************************************************
 * Structures & Definitions
@@ -242,10 +243,17 @@ static int
 qed_async_event_completion(struct qed_hwfn *p_hwfn,
 			   struct event_ring_entry *p_eqe)
 {
-	DP_NOTICE(p_hwfn,
-		  "Unknown Async completion for protocol: %d\n",
-		   p_eqe->protocol_id);
-	return -EINVAL;
+	switch (p_eqe->protocol_id) {
+	case PROTOCOLID_COMMON:
+		return qed_sriov_eqe_event(p_hwfn,
+					   p_eqe->opcode,
+					   p_eqe->echo, &p_eqe->data);
+	default:
+		DP_NOTICE(p_hwfn,
+			  "Unknown Async completion for protocol: %d\n",
+			  p_eqe->protocol_id);
+		return -EINVAL;
+	}
 }
 
 /***************************************************************************
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 685e3fa7bc52..4a6af4264141 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -31,6 +31,26 @@ bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 	return true;
 }
 
+static struct qed_vf_info *qed_iov_get_vf_info(struct qed_hwfn *p_hwfn,
+					       u16 relative_vf_id,
+					       bool b_enabled_only)
+{
+	struct qed_vf_info *vf = NULL;
+
+	if (!p_hwfn->pf_iov_info) {
+		DP_NOTICE(p_hwfn->cdev, "No iov info\n");
+		return NULL;
+	}
+
+	if (qed_iov_is_valid_vfid(p_hwfn, relative_vf_id, b_enabled_only))
+		vf = &p_hwfn->pf_iov_info->vfs_array[relative_vf_id];
+	else
+		DP_ERR(p_hwfn, "qed_iov_get_vf_info: VF[%d] is not enabled\n",
+		       relative_vf_id);
+
+	return vf;
+}
+
 static int qed_iov_pci_cfg_info(struct qed_dev *cdev)
 {
 	struct qed_hw_sriov_info *iov = cdev->p_iov_info;
@@ -349,6 +369,232 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 	return 0;
 }
 
+static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
+{
+	/* Check PF supports sriov */
+	if (!IS_QED_SRIOV(p_hwfn->cdev) || !IS_PF_SRIOV_ALLOC(p_hwfn))
+		return false;
+
+	/* Check VF validity */
+	if (!qed_iov_is_valid_vfid(p_hwfn, vfid, true))
+		return false;
+
+	return true;
+}
+
+static bool qed_iov_tlv_supported(u16 tlvtype)
+{
+	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
+}
+
+/* place a given tlv on the tlv buffer, continuing current tlv list */
+void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length)
+{
+	struct channel_tlv *tl = (struct channel_tlv *)*offset;
+
+	tl->type = type;
+	tl->length = length;
+
+	/* Offset should keep pointing to next TLV (the end of the last) */
+	*offset += length;
+
+	/* Return a pointer to the start of the added tlv */
+	return *offset - length;
+}
+
+/* list the types and lengths of the tlvs on the buffer */
+void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list)
+{
+	u16 i = 1, total_length = 0;
+	struct channel_tlv *tlv;
+
+	do {
+		tlv = (struct channel_tlv *)((u8 *)tlvs_list + total_length);
+
+		/* output tlv */
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "TLV number %d: type %d, length %d\n",
+			   i, tlv->type, tlv->length);
+
+		if (tlv->type == CHANNEL_TLV_LIST_END)
+			return;
+
+		/* Validate entry - protect against malicious VFs */
+		if (!tlv->length) {
+			DP_NOTICE(p_hwfn, "TLV of length 0 found\n");
+			return;
+		}
+
+		total_length += tlv->length;
+
+		if (total_length >= sizeof(struct tlv_buffer_size)) {
+			DP_NOTICE(p_hwfn, "TLV ==> Buffer overflow\n");
+			return;
+		}
+
+		i++;
+	} while (1);
+}
+
+static void qed_iov_send_response(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_vf_info *p_vf,
+				  u16 length, u8 status)
+{
+	struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx;
+	struct qed_dmae_params params;
+	u8 eng_vf_id;
+
+	mbx->reply_virt->default_resp.hdr.status = status;
+
+	qed_dp_tlv_list(p_hwfn, mbx->reply_virt);
+
+	eng_vf_id = p_vf->abs_vf_id;
+
+	memset(&params, 0, sizeof(struct qed_dmae_params));
+	params.flags = QED_DMAE_FLAG_VF_DST;
+	params.dst_vfid = eng_vf_id;
+
+	qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys + sizeof(u64),
+			   mbx->req_virt->first_tlv.reply_address +
+			   sizeof(u64),
+			   (sizeof(union pfvf_tlvs) - sizeof(u64)) / 4,
+			   &params);
+
+	qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys,
+			   mbx->req_virt->first_tlv.reply_address,
+			   sizeof(u64) / 4, &params);
+
+	REG_WR(p_hwfn,
+	       GTT_BAR0_MAP_REG_USDM_RAM +
+	       USTORM_VF_PF_CHANNEL_READY_OFFSET(eng_vf_id), 1);
+}
+
+static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 struct qed_vf_info *vf_info,
+				 u16 type, u16 length, u8 status)
+{
+	struct qed_iov_vf_mbx *mbx = &vf_info->vf_mbx;
+
+	mbx->offset = (u8 *)mbx->reply_virt;
+
+	qed_add_tlv(p_hwfn, &mbx->offset, type, length);
+	qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END,
+		    sizeof(struct channel_list_end_tlv));
+
+	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
+}
+
+static void qed_iov_process_mbx_dummy_resp(struct qed_hwfn *p_hwfn,
+					   struct qed_ptt *p_ptt,
+					   struct qed_vf_info *p_vf)
+{
+	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_NONE,
+			     sizeof(struct pfvf_def_resp_tlv),
+			     PFVF_STATUS_SUCCESS);
+}
+
+static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt, int vfid)
+{
+	struct qed_iov_vf_mbx *mbx;
+	struct qed_vf_info *p_vf;
+	int i;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf)
+		return;
+
+	mbx = &p_vf->vf_mbx;
+
+	/* qed_iov_process_mbx_request */
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "qed_iov_process_mbx_req vfid %d\n", p_vf->abs_vf_id);
+
+	mbx->first_tlv = mbx->req_virt->first_tlv;
+
+	/* check if tlv type is known */
+	if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
+		qed_iov_process_mbx_dummy_resp(p_hwfn, p_ptt, p_vf);
+	} else {
+		/* unknown TLV - this may belong to a VF driver from the future
+		 * - a version written after this PF driver was written, which
+		 * supports features unknown as of yet. Too bad since we don't
+		 * support them. Or this may be because someone wrote a crappy
+		 * VF driver and is sending garbage over the channel.
+		 */
+		DP_ERR(p_hwfn,
+		       "unknown TLV. type %d length %d. first 20 bytes of mailbox buffer:\n",
+		       mbx->first_tlv.tl.type, mbx->first_tlv.tl.length);
+
+		for (i = 0; i < 20; i++) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "%x ",
+				   mbx->req_virt->tlv_buf_size.tlv_buffer[i]);
+		}
+	}
+}
+
+void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid)
+{
+	u64 add_bit = 1ULL << (vfid % 64);
+
+	p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit;
+}
+
+static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn,
+						    u64 *events)
+{
+	u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events;
+
+	memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+	memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+}
+
+static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn,
+			      u16 abs_vfid, struct regpair *vf_msg)
+{
+	u8 min = (u8)p_hwfn->cdev->p_iov_info->first_vf_in_pf;
+	struct qed_vf_info *p_vf;
+
+	if (!qed_iov_pf_sanity_check(p_hwfn, (int)abs_vfid - min)) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV,
+			   "Got a message from VF [abs 0x%08x] that cannot be handled by PF\n",
+			   abs_vfid);
+		return 0;
+	}
+	p_vf = &p_hwfn->pf_iov_info->vfs_array[(u8)abs_vfid - min];
+
+	/* List the physical address of the request so that handler
+	 * could later on copy the message from it.
+	 */
+	p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo;
+
+	/* Mark the event and schedule the workqueue */
+	qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id);
+	qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG);
+
+	return 0;
+}
+
+int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+			u8 opcode, __le16 echo, union event_ring_data *data)
+{
+	switch (opcode) {
+	case COMMON_EVENT_VF_PF_CHANNEL:
+		return qed_sriov_vfpf_msg(p_hwfn, le16_to_cpu(echo),
+					  &data->vf_pf_channel.msg_addr);
+	default:
+		DP_INFO(p_hwfn->cdev, "Unknown sriov eqe event 0x%02x\n",
+			opcode);
+		return -EINVAL;
+	}
+}
+
 u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 {
 	struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
@@ -364,3 +610,142 @@ u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 out:
 	return MAX_NUM_VFS;
 }
+
+static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
+			       int vfid)
+{
+	struct qed_dmae_params params;
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return -EINVAL;
+
+	memset(&params, 0, sizeof(struct qed_dmae_params));
+	params.flags = QED_DMAE_FLAG_VF_SRC | QED_DMAE_FLAG_COMPLETION_DST;
+	params.src_vfid = vf_info->abs_vf_id;
+
+	if (qed_dmae_host2host(p_hwfn, ptt,
+			       vf_info->vf_mbx.pending_req,
+			       vf_info->vf_mbx.req_phys,
+			       sizeof(union vfpf_tlvs) / 4, &params)) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Failed to copy message from VF 0x%02x\n", vfid);
+
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * qed_schedule_iov - schedules IOV task for VF and PF
+ * @hwfn: hardware function pointer
+ * @flag: IOV flag for VF/PF
+ */
+void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag)
+{
+	smp_mb__before_atomic();
+	set_bit(flag, &hwfn->iov_task_flags);
+	smp_mb__after_atomic();
+	DP_VERBOSE(hwfn, QED_MSG_IOV, "Scheduling iov task [Flag: %d]\n", flag);
+	queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, 0);
+}
+
+static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
+{
+	u64 events[QED_VF_ARRAY_LENGTH];
+	struct qed_ptt *ptt;
+	int i;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt) {
+		DP_VERBOSE(hwfn, QED_MSG_IOV,
+			   "Can't acquire PTT; re-scheduling\n");
+		qed_schedule_iov(hwfn, QED_IOV_WQ_MSG_FLAG);
+		return;
+	}
+
+	qed_iov_pf_get_and_clear_pending_events(hwfn, events);
+
+	DP_VERBOSE(hwfn, QED_MSG_IOV,
+		   "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n",
+		   events[0], events[1], events[2]);
+
+	qed_for_each_vf(hwfn, i) {
+		/* Skip VFs with no pending messages */
+		if (!(events[i / 64] & (1ULL << (i % 64))))
+			continue;
+
+		DP_VERBOSE(hwfn, QED_MSG_IOV,
+			   "Handling VF message from VF 0x%02x [Abs 0x%02x]\n",
+			   i, hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+
+		/* Copy VF's message to PF's request buffer for that VF */
+		if (qed_iov_copy_vf_msg(hwfn, ptt, i))
+			continue;
+
+		qed_iov_process_mbx_req(hwfn, ptt, i);
+	}
+
+	qed_ptt_release(hwfn, ptt);
+}
+
+void qed_iov_pf_task(struct work_struct *work)
+{
+	struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn,
+					     iov_task.work);
+
+	if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags))
+		return;
+
+	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
+		qed_handle_vf_msg(hwfn);
+}
+
+void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
+{
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		if (!cdev->hwfns[i].iov_wq)
+			continue;
+
+		if (schedule_first) {
+			qed_schedule_iov(&cdev->hwfns[i],
+					 QED_IOV_WQ_STOP_WQ_FLAG);
+			cancel_delayed_work_sync(&cdev->hwfns[i].iov_task);
+		}
+
+		flush_workqueue(cdev->hwfns[i].iov_wq);
+		destroy_workqueue(cdev->hwfns[i].iov_wq);
+	}
+}
+
+int qed_iov_wq_start(struct qed_dev *cdev)
+{
+	char name[NAME_SIZE];
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		/* PFs needs a dedicated workqueue only if they support IOV. */
+		if (!IS_PF_SRIOV(p_hwfn))
+			continue;
+
+		snprintf(name, NAME_SIZE, "iov-%02x:%02x.%02x",
+			 cdev->pdev->bus->number,
+			 PCI_SLOT(cdev->pdev->devfn), p_hwfn->abs_pf_id);
+
+		p_hwfn->iov_wq = create_singlethread_workqueue(name);
+		if (!p_hwfn->iov_wq) {
+			DP_NOTICE(p_hwfn, "Cannot create iov workqueue\n");
+			return -ENOMEM;
+		}
+
+		INIT_DELAYED_WORK(&p_hwfn->iov_task, qed_iov_pf_task);
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 0ac561916e2c..112216812a12 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -50,6 +50,14 @@ struct qed_iov_vf_mbx {
 	dma_addr_t req_phys;
 	union pfvf_tlvs *reply_virt;
 	dma_addr_t reply_phys;
+
+	/* Address in VF where a pending message is located */
+	dma_addr_t pending_req;
+
+	u8 *offset;
+
+	/* saved VF request header */
+	struct vfpf_first_tlv first_tlv;
 };
 
 enum vf_state {
@@ -96,6 +104,14 @@ struct qed_pf_iov {
 	u32 bulletins_size;
 };
 
+enum qed_iov_wq_flag {
+	QED_IOV_WQ_MSG_FLAG,
+	QED_IOV_WQ_SET_UNICAST_FILTER_FLAG,
+	QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
+	QED_IOV_WQ_STOP_WQ_FLAG,
+	QED_IOV_WQ_FLR_FLAG,
+};
+
 #ifdef CONFIG_QED_SRIOV
 /**
  * @brief - Given a VF index, return index of next [including that] active VF.
@@ -147,6 +163,22 @@ void qed_iov_free(struct qed_hwfn *p_hwfn);
  * @param cdev
  */
 void qed_iov_free_hw_info(struct qed_dev *cdev);
+
+/**
+ * @brief qed_sriov_eqe_event - handle async sriov event arrived on eqe.
+ *
+ * @param p_hwfn
+ * @param opcode
+ * @param echo
+ * @param data
+ */
+int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+			u8 opcode, __le16 echo, union event_ring_data *data);
+
+void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
+int qed_iov_wq_start(struct qed_dev *cdev);
+
+void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -175,6 +207,27 @@ static inline void qed_iov_free(struct qed_hwfn *p_hwfn)
 static inline void qed_iov_free_hw_info(struct qed_dev *cdev)
 {
 }
+
+static inline int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+				      u8 opcode,
+				      __le16 echo, union event_ring_data *data)
+{
+	return -EINVAL;
+}
+
+static inline void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
+{
+}
+
+static inline int qed_iov_wq_start(struct qed_dev *cdev)
+{
+	return 0;
+}
+
+static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
+				    enum qed_iov_wq_flag flag)
+{
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 52cfa4889d88..f0d8de2be581 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -9,16 +9,62 @@
 #ifndef _QED_VF_H
 #define _QED_VF_H
 
+enum {
+	PFVF_STATUS_WAITING,
+	PFVF_STATUS_SUCCESS,
+	PFVF_STATUS_FAILURE,
+	PFVF_STATUS_NOT_SUPPORTED,
+	PFVF_STATUS_NO_RESOURCE,
+	PFVF_STATUS_FORCED,
+};
+
+/* vf pf channel tlvs */
+/* general tlv header (used for both vf->pf request and pf->vf response) */
+struct channel_tlv {
+	u16 type;
+	u16 length;
+};
+
+/* header of first vf->pf tlv carries the offset used to calculate reponse
+ * buffer address
+ */
+struct vfpf_first_tlv {
+	struct channel_tlv tl;
+	u32 padding;
+	u64 reply_address;
+};
+
+/* header of pf->vf tlvs, carries the status of handling the request */
+struct pfvf_tlv {
+	struct channel_tlv tl;
+	u8 status;
+	u8 padding[3];
+};
+
+/* response tlv used for most tlvs */
+struct pfvf_def_resp_tlv {
+	struct pfvf_tlv hdr;
+};
+
+/* used to terminate and pad a tlv list */
+struct channel_list_end_tlv {
+	struct channel_tlv tl;
+	u8 padding[4];
+};
+
 #define TLV_BUFFER_SIZE                 1024
 struct tlv_buffer_size {
 	u8 tlv_buffer[TLV_BUFFER_SIZE];
 };
 
 union vfpf_tlvs {
+	struct vfpf_first_tlv first_tlv;
+	struct channel_list_end_tlv list_end;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
 union pfvf_tlvs {
+	struct pfvf_def_resp_tlv default_resp;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
@@ -38,4 +84,10 @@ struct qed_bulletin {
 	u32 size;
 };
 
+enum {
+	CHANNEL_TLV_NONE,	/* ends tlv sequence */
+	CHANNEL_TLV_LIST_END,
+	CHANNEL_TLV_MAX
+};
+
 #endif
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 53ecb37ae563..8914d271ba73 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -327,9 +327,14 @@ struct regpair {
 	__le32	hi;
 };
 
+struct vf_pf_channel_eqe_data {
+	struct regpair msg_addr;
+};
+
 /* Event Data Union */
 union event_ring_data {
 	u8				bytes[8];
+	struct vf_pf_channel_eqe_data	vf_pf_channel;
 	struct async_data		async_info;
 };
 
-- 
cgit v1.2.3


From 1408cc1fa48c5450c0dc4b40cbd9718ecb09d1c9 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:14 +0300
Subject: qed: Introduce VFs

This adds the qed VFs for the first time -
The vfs are limited functions, with a very different PCI bar structure
[when compared with PFs] to better impose the related security demands
associated with them.

This patch includes the logic neccesary to allow VFs to successfully probe
[without actually adding the ability to enable iov].
This includes diverging all the flows that would occur as part of the pci
probe of the driver, preventing VF from accessing registers/memories it
can't and instead utilize the VF->PF channel to query the PF for needed
information.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h             |   5 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.c         | 186 +++++++++--
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |   3 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 182 +++++++++--
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  47 ++-
 drivers/net/ethernet/qlogic/qed/qed_hw.c          |  12 +-
 drivers/net/ethernet/qlogic/qed/qed_init_ops.c    |   4 +
 drivers/net/ethernet/qlogic/qed/qed_int.c         | 101 +++++-
 drivers/net/ethernet/qlogic/qed/qed_int.h         |  16 +
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  57 +++-
 drivers/net/ethernet/qlogic/qed/qed_main.c        | 192 +++++++----
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |  83 ++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         |  27 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h    |   6 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |   2 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   8 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c         |   3 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       | 379 +++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |  49 +++
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 357 ++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h          | 229 +++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c      |  13 +-
 include/linux/qed/common_hsi.h                    |  57 ++++
 include/linux/qed/qed_if.h                        |  10 +-
 25 files changed, 1842 insertions(+), 188 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_vf.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index e11a809034ea..a44874562cfd 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -3,4 +3,4 @@ obj-$(CONFIG_QED) := qed.o
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
 	 qed_selftest.o
-qed-$(CONFIG_QED_SRIOV) += qed_sriov.o
+qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 01f9b6c880bd..f9a3576305a1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -311,6 +311,8 @@ struct qed_hwfn {
 	bool				first_on_engine;
 	bool				hw_init_done;
 
+	u8				num_funcs_on_engine;
+
 	/* BAR access */
 	void __iomem			*regview;
 	void __iomem			*doorbells;
@@ -361,6 +363,7 @@ struct qed_hwfn {
 	/* True if the driver requests for the link */
 	bool				b_drv_link_init;
 
+	struct qed_vf_iov		*vf_iov_info;
 	struct qed_pf_iov		*pf_iov_info;
 	struct qed_mcp_info		*mcp_info;
 
@@ -497,6 +500,8 @@ struct qed_dev {
 #define IS_QED_SRIOV(cdev)              (!!(cdev)->p_iov_info)
 
 	unsigned long			tunn_mode;
+
+	bool				b_is_vf;
 	u32				drv_type;
 
 	struct qed_eth_stats		*reset_stats;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index fc767c07a264..ac284c58d8c2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -24,11 +24,13 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 /* Max number of connection types in HW (DQ/CDU etc.) */
 #define MAX_CONN_TYPES		PROTOCOLID_COMMON
 #define NUM_TASK_TYPES		2
 #define NUM_TASK_PF_SEGMENTS	4
+#define NUM_TASK_VF_SEGMENTS	1
 
 /* QM constants */
 #define QM_PQ_ELEMENT_SIZE	4 /* in bytes */
@@ -63,10 +65,12 @@ union conn_context {
 struct qed_conn_type_cfg {
 	u32 cid_count;
 	u32 cid_start;
+	u32 cids_per_vf;
 };
 
 /* ILT Client configuration, Per connection type (protocol) resources. */
 #define ILT_CLI_PF_BLOCKS	(1 + NUM_TASK_PF_SEGMENTS * 2)
+#define ILT_CLI_VF_BLOCKS       (1 + NUM_TASK_VF_SEGMENTS * 2)
 #define CDUC_BLK		(0)
 
 enum ilt_clients {
@@ -97,6 +101,10 @@ struct qed_ilt_client_cfg {
 	/* ILT client blocks for PF */
 	struct qed_ilt_cli_blk pf_blks[ILT_CLI_PF_BLOCKS];
 	u32 pf_total_lines;
+
+	/* ILT client blocks for VFs */
+	struct qed_ilt_cli_blk vf_blks[ILT_CLI_VF_BLOCKS];
+	u32 vf_total_lines;
 };
 
 /* Per Path -
@@ -123,6 +131,11 @@ struct qed_cxt_mngr {
 	/* computed ILT structure */
 	struct qed_ilt_client_cfg	clients[ILT_CLI_MAX];
 
+	/* total number of VFs for this hwfn -
+	 * ALL VFs are symmetric in terms of HW resources
+	 */
+	u32				vf_count;
+
 	/* Acquired CIDs */
 	struct qed_cid_acquired_map	acquired[MAX_CONN_TYPES];
 
@@ -131,37 +144,60 @@ struct qed_cxt_mngr {
 	u32				pf_start_line;
 };
 
-static u32 qed_cxt_cdu_iids(struct qed_cxt_mngr *p_mngr)
-{
-	u32 type, pf_cids = 0;
+/* counts the iids for the CDU/CDUC ILT client configuration */
+struct qed_cdu_iids {
+	u32 pf_cids;
+	u32 per_vf_cids;
+};
 
-	for (type = 0; type < MAX_CONN_TYPES; type++)
-		pf_cids += p_mngr->conn_cfg[type].cid_count;
+static void qed_cxt_cdu_iids(struct qed_cxt_mngr *p_mngr,
+			     struct qed_cdu_iids *iids)
+{
+	u32 type;
 
-	return pf_cids;
+	for (type = 0; type < MAX_CONN_TYPES; type++) {
+		iids->pf_cids += p_mngr->conn_cfg[type].cid_count;
+		iids->per_vf_cids += p_mngr->conn_cfg[type].cids_per_vf;
+	}
 }
 
 static void qed_cxt_qm_iids(struct qed_hwfn *p_hwfn,
 			    struct qed_qm_iids *iids)
 {
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
-	int type;
+	u32 vf_cids = 0, type;
 
-	for (type = 0; type < MAX_CONN_TYPES; type++)
+	for (type = 0; type < MAX_CONN_TYPES; type++) {
 		iids->cids += p_mngr->conn_cfg[type].cid_count;
+		vf_cids += p_mngr->conn_cfg[type].cids_per_vf;
+	}
 
-	DP_VERBOSE(p_hwfn, QED_MSG_ILT, "iids: CIDS %08x\n", iids->cids);
+	iids->vf_cids += vf_cids * p_mngr->vf_count;
+	DP_VERBOSE(p_hwfn, QED_MSG_ILT,
+		   "iids: CIDS %08x vf_cids %08x\n",
+		   iids->cids, iids->vf_cids);
 }
 
 /* set the iids count per protocol */
 static void qed_cxt_set_proto_cid_count(struct qed_hwfn *p_hwfn,
 					enum protocol_type type,
-					u32 cid_count)
+					u32 cid_count, u32 vf_cid_cnt)
 {
 	struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr;
 	struct qed_conn_type_cfg *p_conn = &p_mgr->conn_cfg[type];
 
 	p_conn->cid_count = roundup(cid_count, DQ_RANGE_ALIGN);
+	p_conn->cids_per_vf = roundup(vf_cid_cnt, DQ_RANGE_ALIGN);
+}
+
+u32 qed_cxt_get_proto_cid_count(struct qed_hwfn		*p_hwfn,
+				enum protocol_type	type,
+				u32			*vf_cid)
+{
+	if (vf_cid)
+		*vf_cid = p_hwfn->p_cxt_mngr->conn_cfg[type].cids_per_vf;
+
+	return p_hwfn->p_cxt_mngr->conn_cfg[type].cid_count;
 }
 
 static void qed_ilt_cli_blk_fill(struct qed_ilt_client_cfg *p_cli,
@@ -210,10 +246,12 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
 	struct qed_ilt_client_cfg *p_cli;
 	struct qed_ilt_cli_blk *p_blk;
-	u32 curr_line, total, pf_cids;
+	struct qed_cdu_iids cdu_iids;
 	struct qed_qm_iids qm_iids;
+	u32 curr_line, total, i;
 
 	memset(&qm_iids, 0, sizeof(qm_iids));
+	memset(&cdu_iids, 0, sizeof(cdu_iids));
 
 	p_mngr->pf_start_line = RESC_START(p_hwfn, QED_ILT);
 
@@ -224,14 +262,16 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	/* CDUC */
 	p_cli = &p_mngr->clients[ILT_CLI_CDUC];
 	curr_line = p_mngr->pf_start_line;
+
+	/* CDUC PF */
 	p_cli->pf_total_lines = 0;
 
 	/* get the counters for the CDUC and QM clients  */
-	pf_cids = qed_cxt_cdu_iids(p_mngr);
+	qed_cxt_cdu_iids(p_mngr, &cdu_iids);
 
 	p_blk = &p_cli->pf_blks[CDUC_BLK];
 
-	total = pf_cids * CONN_CXT_SIZE(p_hwfn);
+	total = cdu_iids.pf_cids * CONN_CXT_SIZE(p_hwfn);
 
 	qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line,
 			     total, CONN_CXT_SIZE(p_hwfn));
@@ -239,17 +279,36 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC);
 	p_cli->pf_total_lines = curr_line - p_blk->start_line;
 
+	/* CDUC VF */
+	p_blk = &p_cli->vf_blks[CDUC_BLK];
+	total = cdu_iids.per_vf_cids * CONN_CXT_SIZE(p_hwfn);
+
+	qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line,
+			     total, CONN_CXT_SIZE(p_hwfn));
+
+	qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC);
+	p_cli->vf_total_lines = curr_line - p_blk->start_line;
+
+	for (i = 1; i < p_mngr->vf_count; i++)
+		qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line,
+				     ILT_CLI_CDUC);
+
 	/* QM */
 	p_cli = &p_mngr->clients[ILT_CLI_QM];
 	p_blk = &p_cli->pf_blks[0];
 
 	qed_cxt_qm_iids(p_hwfn, &qm_iids);
-	total = qed_qm_pf_mem_size(p_hwfn->rel_pf_id, qm_iids.cids, 0, 0,
-				   p_hwfn->qm_info.num_pqs, 0);
-
-	DP_VERBOSE(p_hwfn, QED_MSG_ILT,
-		   "QM ILT Info, (cids=%d, num_pqs=%d, memory_size=%d)\n",
-		   qm_iids.cids, p_hwfn->qm_info.num_pqs, total);
+	total = qed_qm_pf_mem_size(p_hwfn->rel_pf_id, qm_iids.cids,
+				   qm_iids.vf_cids, 0,
+				   p_hwfn->qm_info.num_pqs,
+				   p_hwfn->qm_info.num_vf_pqs);
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_ILT,
+		   "QM ILT Info, (cids=%d, vf_cids=%d, num_pqs=%d, num_vf_pqs=%d, memory_size=%d)\n",
+		   qm_iids.cids,
+		   qm_iids.vf_cids,
+		   p_hwfn->qm_info.num_pqs, p_hwfn->qm_info.num_vf_pqs, total);
 
 	qed_ilt_cli_blk_fill(p_cli, p_blk,
 			     curr_line, total * 0x1000,
@@ -358,7 +417,7 @@ static int qed_ilt_shadow_alloc(struct qed_hwfn *p_hwfn)
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
 	struct qed_ilt_client_cfg *clients = p_mngr->clients;
 	struct qed_ilt_cli_blk *p_blk;
-	u32 size, i, j;
+	u32 size, i, j, k;
 	int rc;
 
 	size = qed_cxt_ilt_shadow_size(clients);
@@ -383,6 +442,16 @@ static int qed_ilt_shadow_alloc(struct qed_hwfn *p_hwfn)
 			if (rc != 0)
 				goto ilt_shadow_fail;
 		}
+		for (k = 0; k < p_mngr->vf_count; k++) {
+			for (j = 0; j < ILT_CLI_VF_BLOCKS; j++) {
+				u32 lines = clients[i].vf_total_lines * k;
+
+				p_blk = &clients[i].vf_blks[j];
+				rc = qed_ilt_blk_alloc(p_hwfn, p_blk, i, lines);
+				if (rc != 0)
+					goto ilt_shadow_fail;
+			}
+		}
 	}
 
 	return 0;
@@ -467,6 +536,9 @@ int qed_cxt_mngr_alloc(struct qed_hwfn *p_hwfn)
 	for (i = 0; i < ILT_CLI_MAX; i++)
 		p_mngr->clients[i].p_size.val = ILT_DEFAULT_HW_P_SIZE;
 
+	if (p_hwfn->cdev->p_iov_info)
+		p_mngr->vf_count = p_hwfn->cdev->p_iov_info->total_vfs;
+
 	/* Set the cxt mangr pointer priori to further allocations */
 	p_hwfn->p_cxt_mngr = p_mngr;
 
@@ -579,8 +651,10 @@ void qed_qm_init_pf(struct qed_hwfn *p_hwfn)
 	params.max_phys_tcs_per_port = qm_info->max_phys_tcs_per_port;
 	params.is_first_pf = p_hwfn->first_on_engine;
 	params.num_pf_cids = iids.cids;
+	params.num_vf_cids = iids.vf_cids;
 	params.start_pq = qm_info->start_pq;
-	params.num_pf_pqs = qm_info->num_pqs;
+	params.num_pf_pqs = qm_info->num_pqs - qm_info->num_vf_pqs;
+	params.num_vf_pqs = qm_info->num_vf_pqs;
 	params.start_vport = qm_info->start_vport;
 	params.num_vports = qm_info->num_vports;
 	params.pf_wfq = qm_info->pf_wfq;
@@ -610,26 +684,55 @@ static int qed_cm_init_pf(struct qed_hwfn *p_hwfn)
 static void qed_dq_init_pf(struct qed_hwfn *p_hwfn)
 {
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
-	u32 dq_pf_max_cid = 0;
+	u32 dq_pf_max_cid = 0, dq_vf_max_cid = 0;
 
 	dq_pf_max_cid += (p_mngr->conn_cfg[0].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_0_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[0].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_0_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[1].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_1_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[1].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_1_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[2].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_2_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[2].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_2_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[3].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_3_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[3].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_3_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[4].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_4_RT_OFFSET, dq_pf_max_cid);
 
-	/* 5 - PF */
+	dq_vf_max_cid += (p_mngr->conn_cfg[4].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_4_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[5].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_5_RT_OFFSET, dq_pf_max_cid);
+
+	dq_vf_max_cid += (p_mngr->conn_cfg[5].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_5_RT_OFFSET, dq_vf_max_cid);
+
+	/* Connection types 6 & 7 are not in use, yet they must be configured
+	 * as the highest possible connection. Not configuring them means the
+	 * defaults will be  used, and with a large number of cids a bug may
+	 * occur, if the defaults will be smaller than dq_pf_max_cid /
+	 * dq_vf_max_cid.
+	 */
+	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_6_RT_OFFSET, dq_pf_max_cid);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_6_RT_OFFSET, dq_vf_max_cid);
+
+	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_7_RT_OFFSET, dq_pf_max_cid);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_7_RT_OFFSET, dq_vf_max_cid);
 }
 
 static void qed_ilt_bounds_init(struct qed_hwfn *p_hwfn)
@@ -653,6 +756,38 @@ static void qed_ilt_bounds_init(struct qed_hwfn *p_hwfn)
 	}
 }
 
+static void qed_ilt_vf_bounds_init(struct qed_hwfn *p_hwfn)
+{
+	struct qed_ilt_client_cfg *p_cli;
+	u32 blk_factor;
+
+	/* For simplicty  we set the 'block' to be an ILT page */
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_VF_BASE_RT_OFFSET,
+			     p_iov->first_vf_in_pf);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET,
+			     p_iov->first_vf_in_pf + p_iov->total_vfs);
+	}
+
+	p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC];
+	blk_factor = ilog2(ILT_PAGE_IN_BYTES(p_cli->p_size.val) >> 10);
+	if (p_cli->active) {
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET,
+			     blk_factor);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET,
+			     p_cli->pf_total_lines);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET,
+			     p_cli->vf_total_lines);
+	}
+}
+
 /* ILT (PSWRQ2) PF */
 static void qed_ilt_init_pf(struct qed_hwfn *p_hwfn)
 {
@@ -662,6 +797,7 @@ static void qed_ilt_init_pf(struct qed_hwfn *p_hwfn)
 	u32 line, rt_offst, i;
 
 	qed_ilt_bounds_init(p_hwfn);
+	qed_ilt_vf_bounds_init(p_hwfn);
 
 	p_mngr = p_hwfn->p_cxt_mngr;
 	p_shdw = p_mngr->ilt_shadow;
@@ -839,10 +975,10 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn)
 	/* Set the number of required CORE connections */
 	u32 core_cids = 1; /* SPQ */
 
-	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids);
+	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids, 0);
 
 	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_ETH,
-				    p_params->num_cons);
+				    p_params->num_cons, 1);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index c8e1f5e5c42b..078ff3fd7920 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -51,6 +51,9 @@ enum qed_cxt_elem_type {
 	QED_ELEM_TASK
 };
 
+u32 qed_cxt_get_proto_cid_count(struct qed_hwfn *p_hwfn,
+				enum protocol_type type, u32 *vf_cid);
+
 /**
  * @brief qed_cxt_set_pf_params - Set the PF params for cxt init
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 7a359c45360f..362e8db2b374 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -41,10 +41,14 @@ enum BAR_ID {
 static u32 qed_hw_bar_size(struct qed_hwfn	*p_hwfn,
 			   enum BAR_ID		bar_id)
 {
-	u32	bar_reg = (bar_id == BAR_ID_0 ?
-			   PGLUE_B_REG_PF_BAR0_SIZE : PGLUE_B_REG_PF_BAR1_SIZE);
-	u32	val = qed_rd(p_hwfn, p_hwfn->p_main_ptt, bar_reg);
+	u32 bar_reg = (bar_id == BAR_ID_0 ?
+		       PGLUE_B_REG_PF_BAR0_SIZE : PGLUE_B_REG_PF_BAR1_SIZE);
+	u32 val;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 1 << 17;
+
+	val = qed_rd(p_hwfn, p_hwfn->p_main_ptt, bar_reg);
 	if (val)
 		return 1 << (val + 15);
 
@@ -114,6 +118,9 @@ void qed_resc_free(struct qed_dev *cdev)
 {
 	int i;
 
+	if (IS_VF(cdev))
+		return;
+
 	kfree(cdev->fw_data);
 	cdev->fw_data = NULL;
 
@@ -144,14 +151,19 @@ void qed_resc_free(struct qed_dev *cdev)
 
 static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 {
+	u8 num_vports, vf_offset = 0, i, vport_id, num_ports, curr_queue = 0;
 	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
 	struct init_qm_port_params *p_qm_port;
-	u8 num_vports, i, vport_id, num_ports;
 	u16 num_pqs, multi_cos_tcs = 1;
+	u16 num_vfs = 0;
 
+#ifdef CONFIG_QED_SRIOV
+	if (p_hwfn->cdev->p_iov_info)
+		num_vfs = p_hwfn->cdev->p_iov_info->total_vfs;
+#endif
 	memset(qm_info, 0, sizeof(*qm_info));
 
-	num_pqs = multi_cos_tcs + 1; /* The '1' is for pure-LB */
+	num_pqs = multi_cos_tcs + num_vfs + 1;	/* The '1' is for pure-LB */
 	num_vports = (u8)RESC_NUM(p_hwfn, QED_VPORT);
 
 	/* Sanity checking that setup requires legal number of resources */
@@ -187,8 +199,9 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
 
 	/* First init per-TC PQs */
-	for (i = 0; i < multi_cos_tcs; i++) {
-		struct init_qm_pq_params *params = &qm_info->qm_pq_params[i];
+	for (i = 0; i < multi_cos_tcs; i++, curr_queue++) {
+		struct init_qm_pq_params *params =
+		    &qm_info->qm_pq_params[curr_queue];
 
 		params->vport_id = vport_id;
 		params->tc_id = p_hwfn->hw_info.non_offload_tc;
@@ -196,13 +209,26 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	}
 
 	/* Then init pure-LB PQ */
-	qm_info->pure_lb_pq = i;
-	qm_info->qm_pq_params[i].vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
-	qm_info->qm_pq_params[i].tc_id = PURE_LB_TC;
-	qm_info->qm_pq_params[i].wrr_group = 1;
-	i++;
+	qm_info->pure_lb_pq = curr_queue;
+	qm_info->qm_pq_params[curr_queue].vport_id =
+	    (u8) RESC_START(p_hwfn, QED_VPORT);
+	qm_info->qm_pq_params[curr_queue].tc_id = PURE_LB_TC;
+	qm_info->qm_pq_params[curr_queue].wrr_group = 1;
+	curr_queue++;
 
 	qm_info->offload_pq = 0;
+	/* Then init per-VF PQs */
+	vf_offset = curr_queue;
+	for (i = 0; i < num_vfs; i++) {
+		/* First vport is used by the PF */
+		qm_info->qm_pq_params[curr_queue].vport_id = vport_id + i + 1;
+		qm_info->qm_pq_params[curr_queue].tc_id =
+		    p_hwfn->hw_info.non_offload_tc;
+		qm_info->qm_pq_params[curr_queue].wrr_group = 1;
+		curr_queue++;
+	}
+
+	qm_info->vf_queues_offset = vf_offset;
 	qm_info->num_pqs = num_pqs;
 	qm_info->num_vports = num_vports;
 
@@ -220,7 +246,8 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 
 	qm_info->start_pq = (u16)RESC_START(p_hwfn, QED_PQ);
 
-	qm_info->start_vport = (u8)RESC_START(p_hwfn, QED_VPORT);
+	qm_info->num_vf_pqs = num_vfs;
+	qm_info->start_vport = (u8) RESC_START(p_hwfn, QED_VPORT);
 
 	for (i = 0; i < qm_info->num_vports; i++)
 		qm_info->qm_vport_params[i].vport_wfq = 1;
@@ -244,6 +271,9 @@ int qed_resc_alloc(struct qed_dev *cdev)
 	struct qed_eq *p_eq;
 	int i, rc = 0;
 
+	if (IS_VF(cdev))
+		return rc;
+
 	cdev->fw_data = kzalloc(sizeof(*cdev->fw_data), GFP_KERNEL);
 	if (!cdev->fw_data)
 		return -ENOMEM;
@@ -364,6 +394,9 @@ void qed_resc_setup(struct qed_dev *cdev)
 {
 	int i;
 
+	if (IS_VF(cdev))
+		return;
+
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
@@ -508,7 +541,9 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
 	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
 	struct qed_qm_common_rt_init_params params;
 	struct qed_dev *cdev = p_hwfn->cdev;
+	u32 concrete_fid;
 	int rc = 0;
+	u8 vf_id;
 
 	qed_init_cau_rt_data(cdev);
 
@@ -558,6 +593,14 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
 	qed_wr(p_hwfn, p_ptt, 0x20b4,
 	       qed_rd(p_hwfn, p_ptt, 0x20b4) & ~0x10);
 
+	for (vf_id = 0; vf_id < MAX_NUM_VFS_BB; vf_id++) {
+		concrete_fid = qed_vfid_to_concrete(p_hwfn, vf_id);
+		qed_fid_pretend(p_hwfn, p_ptt, (u16) concrete_fid);
+		qed_wr(p_hwfn, p_ptt, CCFC_REG_STRONG_ENABLE_VF, 0x1);
+	}
+	/* pretend to original PF */
+	qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id);
+
 	return rc;
 }
 
@@ -698,13 +741,20 @@ int qed_hw_init(struct qed_dev *cdev,
 	u32 load_code, param;
 	int rc, mfw_rc, i;
 
-	rc = qed_init_fw_data(cdev, bin_fw_data);
-	if (rc != 0)
-		return rc;
+	if (IS_PF(cdev)) {
+		rc = qed_init_fw_data(cdev, bin_fw_data);
+		if (rc != 0)
+			return rc;
+	}
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			p_hwfn->b_int_enabled = 1;
+			continue;
+		}
+
 		/* Enable DMAE in PXP */
 		rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true);
 
@@ -829,6 +879,11 @@ int qed_hw_stop(struct qed_dev *cdev)
 
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Stopping hw/fw\n");
 
+		if (IS_VF(cdev)) {
+			/* To be implemented in a later patch */
+			continue;
+		}
+
 		/* mark the hw as uninitialized... */
 		p_hwfn->hw_init_done = false;
 
@@ -860,15 +915,16 @@ int qed_hw_stop(struct qed_dev *cdev)
 		usleep_range(1000, 2000);
 	}
 
-	/* Disable DMAE in PXP - in CMT, this should only be done for
-	 * first hw-function, and only after all transactions have
-	 * stopped for all active hw-functions.
-	 */
-	t_rc = qed_change_pci_hwfn(&cdev->hwfns[0],
-				   cdev->hwfns[0].p_main_ptt,
-				   false);
-	if (t_rc != 0)
-		rc = t_rc;
+	if (IS_PF(cdev)) {
+		/* Disable DMAE in PXP - in CMT, this should only be done for
+		 * first hw-function, and only after all transactions have
+		 * stopped for all active hw-functions.
+		 */
+		t_rc = qed_change_pci_hwfn(&cdev->hwfns[0],
+					   cdev->hwfns[0].p_main_ptt, false);
+		if (t_rc != 0)
+			rc = t_rc;
+	}
 
 	return rc;
 }
@@ -932,6 +988,11 @@ int qed_hw_reset(struct qed_dev *cdev)
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			/* Will be implemented in a later patch */
+			continue;
+		}
+
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Resetting hw/fw\n");
 
 		/* Check for incorrect states */
@@ -1027,11 +1088,10 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn)
 static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 {
 	u32 *resc_start = p_hwfn->hw_info.resc_start;
+	u8 num_funcs = p_hwfn->num_funcs_on_engine;
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	struct qed_sb_cnt_info sb_cnt_info;
-	int num_funcs, i;
-
-	num_funcs = MAX_NUM_PFS_BB;
+	int i;
 
 	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
 	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
@@ -1238,6 +1298,51 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn,
 	return qed_mcp_fill_shmem_func_info(p_hwfn, p_ptt);
 }
 
+static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 reg_function_hide, tmp, eng_mask;
+	u8 num_funcs;
+
+	num_funcs = MAX_NUM_PFS_BB;
+
+	/* Bit 0 of MISCS_REG_FUNCTION_HIDE indicates whether the bypass values
+	 * in the other bits are selected.
+	 * Bits 1-15 are for functions 1-15, respectively, and their value is
+	 * '0' only for enabled functions (function 0 always exists and
+	 * enabled).
+	 * In case of CMT, only the "even" functions are enabled, and thus the
+	 * number of functions for both hwfns is learnt from the same bits.
+	 */
+	reg_function_hide = qed_rd(p_hwfn, p_ptt, MISCS_REG_FUNCTION_HIDE);
+
+	if (reg_function_hide & 0x1) {
+		if (QED_PATH_ID(p_hwfn) && p_hwfn->cdev->num_hwfns == 1) {
+			num_funcs = 0;
+			eng_mask = 0xaaaa;
+		} else {
+			num_funcs = 1;
+			eng_mask = 0x5554;
+		}
+
+		/* Get the number of the enabled functions on the engine */
+		tmp = (reg_function_hide ^ 0xffffffff) & eng_mask;
+		while (tmp) {
+			if (tmp & 0x1)
+				num_funcs++;
+			tmp >>= 0x1;
+		}
+	}
+
+	p_hwfn->num_funcs_on_engine = num_funcs;
+
+	DP_VERBOSE(p_hwfn,
+		   NETIF_MSG_PROBE,
+		   "PF [rel_id %d, abs_id %d] within the %d enabled functions on the engine\n",
+		   p_hwfn->rel_pf_id,
+		   p_hwfn->abs_pf_id,
+		   p_hwfn->num_funcs_on_engine);
+}
+
 static int
 qed_get_hw_info(struct qed_hwfn *p_hwfn,
 		struct qed_ptt *p_ptt,
@@ -1296,6 +1401,8 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn,
 		p_hwfn->hw_info.personality = protocol;
 	}
 
+	qed_get_num_funcs(p_hwfn, p_ptt);
+
 	qed_hw_get_resc(p_hwfn);
 
 	return rc;
@@ -1361,6 +1468,9 @@ static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn,
 	p_hwfn->regview = p_regview;
 	p_hwfn->doorbells = p_doorbells;
 
+	if (IS_VF(p_hwfn->cdev))
+		return qed_vf_hw_prepare(p_hwfn);
+
 	/* Validate that chip access is feasible */
 	if (REG_RD(p_hwfn, PXP_PF_ME_OPAQUE_ADDR) == 0xffffffff) {
 		DP_ERR(p_hwfn,
@@ -1428,7 +1538,8 @@ int qed_hw_prepare(struct qed_dev *cdev,
 	int rc;
 
 	/* Store the precompiled init data ptrs */
-	qed_init_iro_array(cdev);
+	if (IS_PF(cdev))
+		qed_init_iro_array(cdev);
 
 	/* Initialize the first hwfn - will learn number of hwfns */
 	rc = qed_hw_prepare_single(p_hwfn,
@@ -1460,9 +1571,11 @@ int qed_hw_prepare(struct qed_dev *cdev,
 		 * initiliazed hwfn 0.
 		 */
 		if (rc) {
-			qed_init_free(p_hwfn);
-			qed_mcp_free(p_hwfn);
-			qed_hw_hwfn_free(p_hwfn);
+			if (IS_PF(cdev)) {
+				qed_init_free(p_hwfn);
+				qed_mcp_free(p_hwfn);
+				qed_hw_hwfn_free(p_hwfn);
+			}
 		}
 	}
 
@@ -1476,6 +1589,11 @@ void qed_hw_remove(struct qed_dev *cdev)
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			/* Will be implemented in a later patch */
+			continue;
+		}
+
 		qed_init_free(p_hwfn);
 		qed_hw_hwfn_free(p_hwfn);
 		qed_mcp_free(p_hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 6ba197c107ed..c511106870d0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -29,7 +29,7 @@ struct qed_ptt;
 enum common_event_opcode {
 	COMMON_EVENT_PF_START,
 	COMMON_EVENT_PF_STOP,
-	COMMON_EVENT_RESERVED,
+	COMMON_EVENT_VF_START,
 	COMMON_EVENT_RESERVED2,
 	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
@@ -44,7 +44,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_UNUSED,
 	COMMON_RAMROD_PF_START /* PF Function Start Ramrod */,
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
-	COMMON_RAMROD_RESERVED,
+	COMMON_RAMROD_VF_START,
 	COMMON_RAMROD_RESERVED2,
 	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
@@ -573,6 +573,14 @@ union event_ring_element {
 	struct event_ring_next_addr	next_addr;
 };
 
+struct mstorm_non_trigger_vf_zone {
+	struct eth_mstorm_per_queue_stat eth_queue_stat;
+};
+
+struct mstorm_vf_zone {
+	struct mstorm_non_trigger_vf_zone non_trigger;
+};
+
 enum personality_type {
 	BAD_PERSONALITY_TYP,
 	PERSONALITY_RESERVED,
@@ -671,6 +679,16 @@ enum ports_mode {
 	MAX_PORTS_MODE
 };
 
+struct pstorm_non_trigger_vf_zone {
+	struct eth_pstorm_per_queue_stat eth_queue_stat;
+	struct regpair reserved[2];
+};
+
+struct pstorm_vf_zone {
+	struct pstorm_non_trigger_vf_zone non_trigger;
+	struct regpair reserved[7];
+};
+
 /* Ramrod Header of SPQE */
 struct ramrod_header {
 	__le32	cid /* Slowpath Connection CID */;
@@ -700,6 +718,29 @@ struct tstorm_per_port_stat {
 	struct regpair	preroce_irregular_pkt;
 };
 
+struct ustorm_non_trigger_vf_zone {
+	struct eth_ustorm_per_queue_stat eth_queue_stat;
+	struct regpair vf_pf_msg_addr;
+};
+
+struct ustorm_trigger_vf_zone {
+	u8 vf_pf_msg_valid;
+	u8 reserved[7];
+};
+
+struct ustorm_vf_zone {
+	struct ustorm_non_trigger_vf_zone non_trigger;
+	struct ustorm_trigger_vf_zone trigger;
+};
+
+struct vf_start_ramrod_data {
+	u8 vf_id;
+	u8 enable_flr_ack;
+	__le16 opaque_fid;
+	u8 personality;
+	u8 reserved[3];
+};
+
 struct atten_status_block {
 	__le32	atten_bits;
 	__le32	atten_ack;
@@ -1026,7 +1067,7 @@ enum init_phases {
 	PHASE_ENGINE,
 	PHASE_PORT,
 	PHASE_PF,
-	PHASE_RESERVED,
+	PHASE_VF,
 	PHASE_QM_PF,
 	MAX_INIT_PHASES
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index a9be5a422d2d..0ada7fdb91bc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -23,6 +23,7 @@
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 #define QED_BAR_ACQUIRE_TIMEOUT 1000
 
@@ -236,8 +237,12 @@ static void qed_memcpy_hw(struct qed_hwfn *p_hwfn,
 		quota = min_t(size_t, n - done,
 			      PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE);
 
-		qed_ptt_set_win(p_hwfn, p_ptt, hw_addr + done);
-		hw_offset = qed_ptt_get_bar_addr(p_ptt);
+		if (IS_PF(p_hwfn->cdev)) {
+			qed_ptt_set_win(p_hwfn, p_ptt, hw_addr + done);
+			hw_offset = qed_ptt_get_bar_addr(p_ptt);
+		} else {
+			hw_offset = hw_addr + done;
+		}
 
 		dw_count = quota / 4;
 		host_addr = (u32 *)((u8 *)addr + done);
@@ -808,6 +813,9 @@ u16 qed_get_qm_pq(struct qed_hwfn *p_hwfn,
 		break;
 	case PROTOCOLID_ETH:
 		pq_id = p_params->eth.tc;
+		if (p_params->eth.is_vf)
+			pq_id += p_hwfn->qm_info.vf_queues_offset +
+				 p_params->eth.vf_id;
 		break;
 	default:
 		pq_id = 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
index 3269b3610e03..d358c3bb1308 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
@@ -18,6 +18,7 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 #define QED_INIT_MAX_POLL_COUNT 100
 #define QED_INIT_POLL_PERIOD_US 500
@@ -128,6 +129,9 @@ int qed_init_alloc(struct qed_hwfn *p_hwfn)
 {
 	struct qed_rt_data *rt_data = &p_hwfn->rt_data;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	rt_data->b_valid = kzalloc(sizeof(bool) * RUNTIME_ARRAY_SIZE,
 				   GFP_KERNEL);
 	if (!rt_data->b_valid)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c
index 2017b0121f5f..bbecfa579364 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -26,6 +26,8 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
+#include "qed_vf.h"
 
 struct qed_pi_info {
 	qed_int_comp_cb_t	comp_cb;
@@ -2513,6 +2515,9 @@ void qed_int_cau_conf_pi(struct qed_hwfn *p_hwfn,
 	u32 sb_offset;
 	u32 pi_offset;
 
+	if (IS_VF(p_hwfn->cdev))
+		return;
+
 	sb_offset = igu_sb_id * PIS_PER_SB;
 	memset(&pi_entry, 0, sizeof(struct cau_pi_entry));
 
@@ -2542,8 +2547,9 @@ void qed_int_sb_setup(struct qed_hwfn *p_hwfn,
 	sb_info->sb_ack = 0;
 	memset(sb_info->sb_virt, 0, sizeof(*sb_info->sb_virt));
 
-	qed_int_cau_conf_sb(p_hwfn, p_ptt, sb_info->sb_phys,
-			    sb_info->igu_sb_id, 0, 0);
+	if (IS_PF(p_hwfn->cdev))
+		qed_int_cau_conf_sb(p_hwfn, p_ptt, sb_info->sb_phys,
+				    sb_info->igu_sb_id, 0, 0);
 }
 
 /**
@@ -2563,8 +2569,10 @@ static u16 qed_get_igu_sb_id(struct qed_hwfn *p_hwfn,
 	/* Assuming continuous set of IGU SBs dedicated for given PF */
 	if (sb_id == QED_SP_SB_ID)
 		igu_sb_id = p_hwfn->hw_info.p_igu_info->igu_dsb_id;
-	else
+	else if (IS_PF(p_hwfn->cdev))
 		igu_sb_id = sb_id + p_hwfn->hw_info.p_igu_info->igu_base_sb;
+	else
+		igu_sb_id = qed_vf_get_igu_sb_id(p_hwfn, sb_id);
 
 	DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, "SB [%s] index is 0x%04x\n",
 		   (sb_id == QED_SP_SB_ID) ? "DSB" : "non-DSB", igu_sb_id);
@@ -2594,9 +2602,16 @@ int qed_int_sb_init(struct qed_hwfn *p_hwfn,
 	/* The igu address will hold the absolute address that needs to be
 	 * written to for a specific status block
 	 */
-	sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
-					  GTT_BAR0_MAP_REG_IGU_CMD +
-					  (sb_info->igu_sb_id << 3);
+	if (IS_PF(p_hwfn->cdev)) {
+		sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
+						  GTT_BAR0_MAP_REG_IGU_CMD +
+						  (sb_info->igu_sb_id << 3);
+	} else {
+		sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
+						  PXP_VF_BAR0_START_IGU +
+						  ((IGU_CMD_INT_ACK_BASE +
+						    sb_info->igu_sb_id) << 3);
+	}
 
 	sb_info->flags |= QED_SB_INFO_INIT;
 
@@ -2783,6 +2798,9 @@ void qed_int_igu_disable_int(struct qed_hwfn *p_hwfn,
 {
 	p_hwfn->b_int_enabled = 0;
 
+	if (IS_VF(p_hwfn->cdev))
+		return;
+
 	qed_wr(p_hwfn, p_ptt, IGU_REG_PF_CONFIGURATION, 0);
 }
 
@@ -2935,9 +2953,9 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt)
 {
 	struct qed_igu_info *p_igu_info;
+	u32 val, min_vf = 0, max_vf = 0;
+	u16 sb_id, last_iov_sb_id = 0;
 	struct qed_igu_block *blk;
-	u32 val;
-	u16 sb_id;
 	u16 prev_sb_id = 0xFF;
 
 	p_hwfn->hw_info.p_igu_info = kzalloc(sizeof(*p_igu_info), GFP_KERNEL);
@@ -2947,12 +2965,19 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 
 	p_igu_info = p_hwfn->hw_info.p_igu_info;
 
-	/* Initialize base sb / sb cnt for PFs */
+	/* Initialize base sb / sb cnt for PFs and VFs */
 	p_igu_info->igu_base_sb		= 0xffff;
 	p_igu_info->igu_sb_cnt		= 0;
 	p_igu_info->igu_dsb_id		= 0xffff;
 	p_igu_info->igu_base_sb_iov	= 0xffff;
 
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		min_vf	= p_iov->first_vf_in_pf;
+		max_vf	= p_iov->first_vf_in_pf + p_iov->total_vfs;
+	}
+
 	for (sb_id = 0; sb_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev);
 	     sb_id++) {
 		blk = &p_igu_info->igu_map.igu_blocks[sb_id];
@@ -2986,14 +3011,43 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 					(p_igu_info->igu_sb_cnt)++;
 				}
 			}
+		} else {
+			if ((blk->function_id >= min_vf) &&
+			    (blk->function_id < max_vf)) {
+				/* Available for VFs of this PF */
+				if (p_igu_info->igu_base_sb_iov == 0xffff) {
+					p_igu_info->igu_base_sb_iov = sb_id;
+				} else if (last_iov_sb_id != sb_id - 1) {
+					if (!val) {
+						DP_VERBOSE(p_hwfn->cdev,
+							   NETIF_MSG_INTR,
+							   "First uninitialized IGU CAM entry at index 0x%04x\n",
+							   sb_id);
+					} else {
+						DP_NOTICE(p_hwfn->cdev,
+							  "Consecutive igu vectors for HWFN %x vfs is broken [jumps from %04x to %04x]\n",
+							  p_hwfn->rel_pf_id,
+							  last_iov_sb_id,
+							  sb_id); }
+					break;
+				}
+				blk->status |= QED_IGU_STATUS_FREE;
+				p_hwfn->hw_info.p_igu_info->free_blks++;
+				last_iov_sb_id = sb_id;
+			}
 		}
 	}
-
-	DP_VERBOSE(p_hwfn, NETIF_MSG_INTR,
-		   "IGU igu_base_sb=0x%x igu_sb_cnt=%d igu_dsb_id=0x%x\n",
-		   p_igu_info->igu_base_sb,
-		   p_igu_info->igu_sb_cnt,
-		   p_igu_info->igu_dsb_id);
+	p_igu_info->igu_sb_cnt_iov = p_igu_info->free_blks;
+
+	DP_VERBOSE(
+		p_hwfn,
+		NETIF_MSG_INTR,
+		"IGU igu_base_sb=0x%x [IOV 0x%x] igu_sb_cnt=%d [IOV 0x%x] igu_dsb_id=0x%x\n",
+		p_igu_info->igu_base_sb,
+		p_igu_info->igu_base_sb_iov,
+		p_igu_info->igu_sb_cnt,
+		p_igu_info->igu_sb_cnt_iov,
+		p_igu_info->igu_dsb_id);
 
 	if (p_igu_info->igu_base_sb == 0xffff ||
 	    p_igu_info->igu_dsb_id == 0xffff ||
@@ -3116,6 +3170,23 @@ void qed_int_get_num_sbs(struct qed_hwfn	*p_hwfn,
 	p_sb_cnt_info->sb_free_blk	= info->free_blks;
 }
 
+u16 qed_int_queue_id_from_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info;
+
+	/* Determine origin of SB id */
+	if ((sb_id >= p_info->igu_base_sb) &&
+	    (sb_id < p_info->igu_base_sb + p_info->igu_sb_cnt)) {
+		return sb_id - p_info->igu_base_sb;
+	} else if ((sb_id >= p_info->igu_base_sb_iov) &&
+		   (sb_id < p_info->igu_base_sb_iov + p_info->igu_sb_cnt_iov)) {
+		return sb_id - p_info->igu_base_sb_iov + p_info->igu_sb_cnt;
+	} else {
+		DP_NOTICE(p_hwfn, "SB %d not in range for function\n", sb_id);
+		return 0;
+	}
+}
+
 void qed_int_disable_post_isr_release(struct qed_dev *cdev)
 {
 	int i;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h
index c57f2e680770..295df4451e31 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.h
@@ -20,6 +20,12 @@
 #define IGU_PF_CONF_ATTN_BIT_EN   (0x1 << 3)    /* attention enable       */
 #define IGU_PF_CONF_SINGLE_ISR_EN (0x1 << 4)    /* single ISR mode enable */
 #define IGU_PF_CONF_SIMD_MODE     (0x1 << 5)    /* simd all ones mode     */
+/* Fields of IGU VF CONFIGRATION REGISTER */
+#define IGU_VF_CONF_FUNC_EN        (0x1 << 0)	/* function enable        */
+#define IGU_VF_CONF_MSI_MSIX_EN    (0x1 << 1)	/* MSI/MSIX enable        */
+#define IGU_VF_CONF_SINGLE_ISR_EN  (0x1 << 4)	/* single ISR mode enable */
+#define IGU_VF_CONF_PARENT_MASK    (0xF)	/* Parent PF              */
+#define IGU_VF_CONF_PARENT_SHIFT   5		/* Parent PF              */
 
 /* Igu control commands
  */
@@ -364,6 +370,16 @@ void qed_int_free(struct qed_hwfn *p_hwfn);
 void qed_int_setup(struct qed_hwfn *p_hwfn,
 		   struct qed_ptt *p_ptt);
 
+/**
+ * @brief - Returns an Rx queue index appropriate for usage with given SB.
+ *
+ * @param p_hwfn
+ * @param sb_id - absolute index of SB
+ *
+ * @return index of Rx queue
+ */
+u16 qed_int_queue_id_from_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+
 /**
  * @brief - Enable Interrupt & Attention for hw function
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 31e1d510a991..8bcbf92b776f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -34,6 +34,7 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 struct qed_rss_params {
 	u8	update_rss_config;
@@ -1580,32 +1581,53 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 
 	info->num_tc = 1;
 
-	if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
-		for_each_hwfn(cdev, i)
-			info->num_queues += FEAT_NUM(&cdev->hwfns[i],
-						     QED_PF_L2_QUE);
-		if (cdev->int_params.fp_msix_cnt)
-			info->num_queues = min_t(u8, info->num_queues,
-						 cdev->int_params.fp_msix_cnt);
+	if (IS_PF(cdev)) {
+		if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
+			for_each_hwfn(cdev, i)
+			    info->num_queues +=
+			    FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE);
+			if (cdev->int_params.fp_msix_cnt)
+				info->num_queues =
+				    min_t(u8, info->num_queues,
+					  cdev->int_params.fp_msix_cnt);
+		} else {
+			info->num_queues = cdev->num_hwfns;
+		}
+
+		info->num_vlan_filters = RESC_NUM(&cdev->hwfns[0], QED_VLAN);
+		ether_addr_copy(info->port_mac,
+				cdev->hwfns[0].hw_info.hw_mac_addr);
 	} else {
-		info->num_queues = cdev->num_hwfns;
-	}
+		qed_vf_get_num_rxqs(QED_LEADING_HWFN(cdev), &info->num_queues);
+		if (cdev->num_hwfns > 1) {
+			u8 queues = 0;
 
-	info->num_vlan_filters = RESC_NUM(&cdev->hwfns[0], QED_VLAN);
-	ether_addr_copy(info->port_mac,
-			cdev->hwfns[0].hw_info.hw_mac_addr);
+			qed_vf_get_num_rxqs(&cdev->hwfns[1], &queues);
+			info->num_queues += queues;
+		}
+
+		qed_vf_get_num_vlan_filters(&cdev->hwfns[0],
+					    &info->num_vlan_filters);
+		qed_vf_get_port_mac(&cdev->hwfns[0], info->port_mac);
+	}
 
 	qed_fill_dev_info(cdev, &info->common);
 
+	if (IS_VF(cdev))
+		memset(info->common.hw_mac, 0, ETH_ALEN);
+
 	return 0;
 }
 
 static void qed_register_eth_ops(struct qed_dev *cdev,
-				 struct qed_eth_cb_ops *ops,
-				 void *cookie)
+				 struct qed_eth_cb_ops *ops, void *cookie)
 {
-	cdev->protocol_ops.eth	= ops;
-	cdev->ops_cookie	= cookie;
+	cdev->protocol_ops.eth = ops;
+	cdev->ops_cookie = cookie;
+
+	/* For VF, we start bulletin reading */
+	if (IS_VF(cdev))
+		qed_vf_start_iov_wq(cdev);
 }
 
 static int qed_start_vport(struct qed_dev *cdev,
@@ -1890,6 +1912,9 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 	struct qed_tunn_update_params tunn_info;
 	int i, rc;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	memset(&tunn_info, 0, sizeof(tunn_info));
 	if (tunn_params->update_vxlan_port == 1) {
 		tunn_info.update_vxlan_udp_port = 1;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c209ed49deae..898347bd2db7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -126,7 +126,7 @@ static int qed_init_pci(struct qed_dev *cdev,
 		goto err1;
 	}
 
-	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+	if (IS_PF(cdev) && !(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
 		DP_NOTICE(cdev, "No memory region found in bar #2\n");
 		rc = -EIO;
 		goto err1;
@@ -176,12 +176,14 @@ static int qed_init_pci(struct qed_dev *cdev,
 		goto err2;
 	}
 
-	cdev->db_phys_addr = pci_resource_start(cdev->pdev, 2);
-	cdev->db_size = pci_resource_len(cdev->pdev, 2);
-	cdev->doorbells = ioremap_wc(cdev->db_phys_addr, cdev->db_size);
-	if (!cdev->doorbells) {
-		DP_NOTICE(cdev, "Cannot map doorbell space\n");
-		return -ENOMEM;
+	if (IS_PF(cdev)) {
+			cdev->db_phys_addr = pci_resource_start(cdev->pdev, 2);
+		cdev->db_size = pci_resource_len(cdev->pdev, 2);
+		cdev->doorbells = ioremap_wc(cdev->db_phys_addr, cdev->db_size);
+		if (!cdev->doorbells) {
+			DP_NOTICE(cdev, "Cannot map doorbell space\n");
+			return -ENOMEM;
+		}
 	}
 
 	return 0;
@@ -208,20 +210,32 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]);
 	ether_addr_copy(dev_info->hw_mac, cdev->hwfns[0].hw_info.hw_mac_addr);
 
-	dev_info->fw_major = FW_MAJOR_VERSION;
-	dev_info->fw_minor = FW_MINOR_VERSION;
-	dev_info->fw_rev = FW_REVISION_VERSION;
-	dev_info->fw_eng = FW_ENGINEERING_VERSION;
-	dev_info->mf_mode = cdev->mf_mode;
+	if (IS_PF(cdev)) {
+		dev_info->fw_major = FW_MAJOR_VERSION;
+		dev_info->fw_minor = FW_MINOR_VERSION;
+		dev_info->fw_rev = FW_REVISION_VERSION;
+		dev_info->fw_eng = FW_ENGINEERING_VERSION;
+		dev_info->mf_mode = cdev->mf_mode;
+	} else {
+		qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major,
+				      &dev_info->fw_minor, &dev_info->fw_rev,
+				      &dev_info->fw_eng);
+	}
 
-	qed_mcp_get_mfw_ver(cdev, &dev_info->mfw_rev);
+	if (IS_PF(cdev)) {
+		ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev));
+		if (ptt) {
+			qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), ptt,
+					    &dev_info->mfw_rev, NULL);
 
-	ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev));
-	if (ptt) {
-		qed_mcp_get_flash_size(QED_LEADING_HWFN(cdev), ptt,
-				       &dev_info->flash_size);
+			qed_mcp_get_flash_size(QED_LEADING_HWFN(cdev), ptt,
+					       &dev_info->flash_size);
 
-		qed_ptt_release(QED_LEADING_HWFN(cdev), ptt);
+			qed_ptt_release(QED_LEADING_HWFN(cdev), ptt);
+		}
+	} else {
+		qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), NULL,
+				    &dev_info->mfw_rev, NULL);
 	}
 
 	return 0;
@@ -258,9 +272,7 @@ static int qed_set_power_state(struct qed_dev *cdev,
 
 /* probing */
 static struct qed_dev *qed_probe(struct pci_dev *pdev,
-				 enum qed_protocol protocol,
-				 u32 dp_module,
-				 u8 dp_level)
+				 struct qed_probe_params *params)
 {
 	struct qed_dev *cdev;
 	int rc;
@@ -269,9 +281,12 @@ static struct qed_dev *qed_probe(struct pci_dev *pdev,
 	if (!cdev)
 		goto err0;
 
-	cdev->protocol = protocol;
+	cdev->protocol = params->protocol;
 
-	qed_init_dp(cdev, dp_module, dp_level);
+	if (params->is_vf)
+		cdev->b_is_vf = true;
+
+	qed_init_dp(cdev, params->dp_module, params->dp_level);
 
 	rc = qed_init_pci(cdev, pdev);
 	if (rc) {
@@ -665,6 +680,35 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_slowpath_vf_setup_int(struct qed_dev *cdev)
+{
+	int rc;
+
+	memset(&cdev->int_params, 0, sizeof(struct qed_int_params));
+	cdev->int_params.in.int_mode = QED_INT_MODE_MSIX;
+
+	qed_vf_get_num_rxqs(QED_LEADING_HWFN(cdev),
+			    &cdev->int_params.in.num_vectors);
+	if (cdev->num_hwfns > 1) {
+		u8 vectors = 0;
+
+		qed_vf_get_num_rxqs(&cdev->hwfns[1], &vectors);
+		cdev->int_params.in.num_vectors += vectors;
+	}
+
+	/* We want a minimum of one fastpath vector per vf hwfn */
+	cdev->int_params.in.min_msix_cnt = cdev->num_hwfns;
+
+	rc = qed_set_int_mode(cdev, true);
+	if (rc)
+		return rc;
+
+	cdev->int_params.fp_msix_base = 0;
+	cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors;
+
+	return 0;
+}
+
 u32 qed_unzip_data(struct qed_hwfn *p_hwfn, u32 input_len,
 		   u8 *input_buf, u32 max_size, u8 *unzip_buf)
 {
@@ -755,32 +799,38 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	if (qed_iov_wq_start(cdev))
 		goto err;
 
-	rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
-			      &cdev->pdev->dev);
-	if (rc) {
-		DP_NOTICE(cdev,
-			  "Failed to find fw file - /lib/firmware/%s\n",
-			  QED_FW_FILE_NAME);
-		goto err;
+	if (IS_PF(cdev)) {
+		rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
+				      &cdev->pdev->dev);
+		if (rc) {
+			DP_NOTICE(cdev,
+				  "Failed to find fw file - /lib/firmware/%s\n",
+				  QED_FW_FILE_NAME);
+			goto err;
+		}
 	}
 
 	rc = qed_nic_setup(cdev);
 	if (rc)
 		goto err;
 
-	rc = qed_slowpath_setup_int(cdev, params->int_mode);
+	if (IS_PF(cdev))
+		rc = qed_slowpath_setup_int(cdev, params->int_mode);
+	else
+		rc = qed_slowpath_vf_setup_int(cdev);
 	if (rc)
 		goto err1;
 
-	/* Allocate stream for unzipping */
-	rc = qed_alloc_stream_mem(cdev);
-	if (rc) {
-		DP_NOTICE(cdev, "Failed to allocate stream memory\n");
-		goto err2;
-	}
+	if (IS_PF(cdev)) {
+		/* Allocate stream for unzipping */
+		rc = qed_alloc_stream_mem(cdev);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed to allocate stream memory\n");
+			goto err2;
+		}
 
-	/* Start the slowpath */
-	data = cdev->firmware->data;
+		data = cdev->firmware->data;
+	}
 
 	memset(&tunn_info, 0, sizeof(tunn_info));
 	tunn_info.tunn_mode |=  1 << QED_MODE_VXLAN_TUNN |
@@ -793,6 +843,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	tunn_info.tunn_clss_l2gre = QED_TUNN_CLSS_MAC_VLAN;
 	tunn_info.tunn_clss_ipgre = QED_TUNN_CLSS_MAC_VLAN;
 
+	/* Start the slowpath */
 	rc = qed_hw_init(cdev, &tunn_info, true,
 			 cdev->int_params.out.int_mode,
 			 true, data);
@@ -802,18 +853,20 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	DP_INFO(cdev,
 		"HW initialization and function start completed successfully\n");
 
-	hwfn = QED_LEADING_HWFN(cdev);
-	drv_version.version = (params->drv_major << 24) |
-			      (params->drv_minor << 16) |
-			      (params->drv_rev << 8) |
-			      (params->drv_eng);
-	strlcpy(drv_version.name, params->name,
-		MCP_DRV_VER_STR_SIZE - 4);
-	rc = qed_mcp_send_drv_version(hwfn, hwfn->p_main_ptt,
-				      &drv_version);
-	if (rc) {
-		DP_NOTICE(cdev, "Failed sending drv version command\n");
-		return rc;
+	if (IS_PF(cdev)) {
+		hwfn = QED_LEADING_HWFN(cdev);
+		drv_version.version = (params->drv_major << 24) |
+				      (params->drv_minor << 16) |
+				      (params->drv_rev << 8) |
+				      (params->drv_eng);
+		strlcpy(drv_version.name, params->name,
+			MCP_DRV_VER_STR_SIZE - 4);
+		rc = qed_mcp_send_drv_version(hwfn, hwfn->p_main_ptt,
+					      &drv_version);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed sending drv version command\n");
+			return rc;
+		}
 	}
 
 	qed_reset_vport_stats(cdev);
@@ -822,13 +875,15 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 
 err2:
 	qed_hw_timers_stop_all(cdev);
-	qed_slowpath_irq_free(cdev);
+	if (IS_PF(cdev))
+		qed_slowpath_irq_free(cdev);
 	qed_free_stream_mem(cdev);
 	qed_disable_msix(cdev);
 err1:
 	qed_resc_free(cdev);
 err:
-	release_firmware(cdev->firmware);
+	if (IS_PF(cdev))
+		release_firmware(cdev->firmware);
 
 	qed_iov_wq_stop(cdev, false);
 
@@ -840,17 +895,20 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	if (!cdev)
 		return -ENODEV;
 
-	qed_free_stream_mem(cdev);
+	if (IS_PF(cdev)) {
+		qed_free_stream_mem(cdev);
 
-	qed_nic_stop(cdev);
-	qed_slowpath_irq_free(cdev);
+		qed_nic_stop(cdev);
+		qed_slowpath_irq_free(cdev);
+	}
 
 	qed_disable_msix(cdev);
 	qed_nic_reset(cdev);
 
 	qed_iov_wq_stop(cdev, true);
 
-	release_firmware(cdev->firmware);
+	if (IS_PF(cdev))
+		release_firmware(cdev->firmware);
 
 	return 0;
 }
@@ -940,6 +998,9 @@ static int qed_set_link(struct qed_dev *cdev,
 	if (!cdev)
 		return -ENODEV;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	/* The link should be set only once per PF */
 	hwfn = &cdev->hwfns[0];
 
@@ -1051,10 +1112,16 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 	memset(if_link, 0, sizeof(*if_link));
 
 	/* Prepare source inputs */
-	memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
-	memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
-	memcpy(&link_caps, qed_mcp_get_link_capabilities(hwfn),
-	       sizeof(link_caps));
+	if (IS_PF(hwfn->cdev)) {
+		memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
+		memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
+		memcpy(&link_caps, qed_mcp_get_link_capabilities(hwfn),
+		       sizeof(link_caps));
+	} else {
+		memset(&params, 0, sizeof(params));
+		memset(&link, 0, sizeof(link));
+		memset(&link_caps, 0, sizeof(link_caps));
+	}
 
 	/* Set the link parameters to pass to protocol driver */
 	if (link.link_up)
@@ -1177,6 +1244,9 @@ static int qed_drain(struct qed_dev *cdev)
 	struct qed_ptt *ptt;
 	int i, rc;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	for_each_hwfn(cdev, i) {
 		hwfn = &cdev->hwfns[i];
 		ptt = qed_ptt_acquire(hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 2f8309d772c8..83175007f616 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -19,6 +19,8 @@
 #include "qed_hw.h"
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
+
 #define CHIP_MCP_RESP_ITER_US 10
 
 #define QED_DRV_MB_MAX_RETRIES	(500 * 1000)	/* Account for 5 sec */
@@ -787,26 +789,42 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-int qed_mcp_get_mfw_ver(struct qed_dev *cdev,
-			u32 *p_mfw_ver)
+int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn,
+			struct qed_ptt *p_ptt,
+			u32 *p_mfw_ver, u32 *p_running_bundle_id)
 {
-	struct qed_hwfn *p_hwfn = &cdev->hwfns[0];
-	struct qed_ptt *p_ptt;
 	u32 global_offsize;
 
-	p_ptt = qed_ptt_acquire(p_hwfn);
-	if (!p_ptt)
-		return -EBUSY;
+	if (IS_VF(p_hwfn->cdev)) {
+		if (p_hwfn->vf_iov_info) {
+			struct pfvf_acquire_resp_tlv *p_resp;
+
+			p_resp = &p_hwfn->vf_iov_info->acquire_resp;
+			*p_mfw_ver = p_resp->pfdev_info.mfw_ver;
+			return 0;
+		} else {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF requested MFW version prior to ACQUIRE\n");
+			return -EINVAL;
+		}
+	}
 
 	global_offsize = qed_rd(p_hwfn, p_ptt,
-				SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->
-						     public_base,
+				SECTION_OFFSIZE_ADDR(p_hwfn->
+						     mcp_info->public_base,
 						     PUBLIC_GLOBAL));
-	*p_mfw_ver = qed_rd(p_hwfn, p_ptt,
-			    SECTION_ADDR(global_offsize, 0) +
-			    offsetof(struct public_global, mfw_ver));
-
-	qed_ptt_release(p_hwfn, p_ptt);
+	*p_mfw_ver =
+	    qed_rd(p_hwfn, p_ptt,
+		   SECTION_ADDR(global_offsize,
+				0) + offsetof(struct public_global, mfw_ver));
+
+	if (p_running_bundle_id != NULL) {
+		*p_running_bundle_id = qed_rd(p_hwfn, p_ptt,
+					      SECTION_ADDR(global_offsize, 0) +
+					      offsetof(struct public_global,
+						       running_bundle_id));
+	}
 
 	return 0;
 }
@@ -817,6 +835,9 @@ int qed_mcp_get_media_type(struct qed_dev *cdev,
 	struct qed_hwfn *p_hwfn = &cdev->hwfns[0];
 	struct qed_ptt  *p_ptt;
 
+	if (IS_VF(cdev))
+		return -EINVAL;
+
 	if (!qed_mcp_is_init(p_hwfn)) {
 		DP_NOTICE(p_hwfn, "MFW is not initialized !\n");
 		return -EBUSY;
@@ -951,6 +972,9 @@ int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn,
 {
 	u32 flash_size;
 
+	if (IS_VF(p_hwfn->cdev))
+		return -EINVAL;
+
 	flash_size = qed_rd(p_hwfn, p_ptt, MCP_REG_NVM_CFG4);
 	flash_size = (flash_size & MCP_REG_NVM_CFG4_FLASH_SIZE) >>
 		      MCP_REG_NVM_CFG4_FLASH_SIZE_SHIFT;
@@ -961,6 +985,37 @@ int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, u8 vf_id, u8 num)
+{
+	u32 resp = 0, param = 0, rc_param = 0;
+	int rc;
+
+	/* Only Leader can configure MSIX, and need to take CMT into account */
+	if (!IS_LEAD_HWFN(p_hwfn))
+		return 0;
+	num *= p_hwfn->cdev->num_hwfns;
+
+	param |= (vf_id << DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT) &
+		 DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK;
+	param |= (num << DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT) &
+		 DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_MASK;
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_CFG_VF_MSIX, param,
+			 &resp, &rc_param);
+
+	if (resp != FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE) {
+		DP_NOTICE(p_hwfn, "VF[%d]: MFW failed to set MSI-X\n", vf_id);
+		rc = -EINVAL;
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Requested 0x%02x MSI-x interrupts from VF 0x%02x\n",
+			   num, vf_id);
+	}
+
+	return rc;
+}
+
 int
 qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 5f218eed0541..e3d5cdfe8e8d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -149,13 +149,16 @@ int qed_mcp_set_link(struct qed_hwfn   *p_hwfn,
 /**
  * @brief Get the management firmware version value
  *
- * @param cdev       - qed dev pointer
- * @param mfw_ver    - mfw version value
+ * @param p_hwfn
+ * @param p_ptt
+ * @param p_mfw_ver    - mfw version value
+ * @param p_running_bundle_id	- image id in nvram; Optional.
  *
- * @return int - 0 - operation was successul.
+ * @return int - 0 - operation was successful.
  */
-int qed_mcp_get_mfw_ver(struct qed_dev *cdev,
-			u32 *mfw_ver);
+int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn,
+			struct qed_ptt *p_ptt,
+			u32 *p_mfw_ver, u32 *p_running_bundle_id);
 
 /**
  * @brief Get media type value of the port.
@@ -418,6 +421,20 @@ int qed_mcp_reset(struct qed_hwfn *p_hwfn,
  * @return true iff MFW is running and mcp_info is initialized
  */
 bool qed_mcp_is_init(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief request MFW to configure MSI-X for a VF
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vf_id - absolute inside engine
+ * @param num_sbs - number of entries to request
+ *
+ * @return int
+ */
+int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, u8 vf_id, u8 num);
+
 int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw);
 int qed_configure_pf_max_bandwidth(struct qed_dev *cdev, u8 max_bw);
 int __qed_configure_pf_max_bandwidth(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index bf4d7ccd56bb..a508b6b7f1d4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -39,6 +39,8 @@
 	0x2aae04UL
 #define  PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER	\
 	0x2aa16cUL
+#define PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR \
+	0x2aa118UL
 #define  BAR0_MAP_REG_MSDM_RAM \
 	0x1d00000UL
 #define  BAR0_MAP_REG_USDM_RAM \
@@ -111,6 +113,8 @@
 	0x009778UL
 #define  MISCS_REG_CHIP_METAL \
 	0x009774UL
+#define MISCS_REG_FUNCTION_HIDE \
+	0x0096f0UL
 #define  BRB_REG_HEADER_SIZE \
 	0x340804UL
 #define  BTB_REG_HEADER_SIZE \
@@ -119,6 +123,8 @@
 	0x1c0708UL
 #define  CCFC_REG_ACTIVITY_COUNTER \
 	0x2e8800UL
+#define CCFC_REG_STRONG_ENABLE_VF \
+	0x2e070cUL
 #define  CDU_REG_CID_ADDR_PARAMS	\
 	0x580900UL
 #define  DBG_REG_CLIENT_ENABLE \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index eec137f40895..dde69090379f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -62,6 +62,8 @@ union ramrod_data {
 	struct vport_stop_ramrod_data vport_stop;
 	struct vport_update_ramrod_data vport_update;
 	struct vport_filter_update_ramrod_data vport_filter_update;
+
+	struct vf_start_ramrod_data vf_start;
 };
 
 #define EQ_MAX_CREDIT   0xffffffff
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index e1e2344b1906..ed90947c451d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -20,6 +20,7 @@
 #include "qed_int.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 			struct qed_spq_entry **pp_ent,
@@ -357,6 +358,13 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 				     &p_ramrod->tunnel_config);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		p_ramrod->base_vf_id = (u8) p_iov->first_vf_in_pf;
+		p_ramrod->num_vfs = (u8) p_iov->total_vfs;
+	}
+
 	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
 		   "Setting event_ring_sb [id %04x index %02x], outer_tag [%d]\n",
 		   sb, sb_index,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index 0e439e46fbe9..acac6626a1b2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -387,6 +387,9 @@ static int qed_cqe_completion(
 	struct eth_slow_path_rx_cqe *cqe,
 	enum protocol_type protocol)
 {
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	/* @@@tmp - it's possible we'll eventually want to handle some
 	 * actual commands that can arrive here, but for now this is only
 	 * used to complete the ramrod using the echo value on the cqe
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 4a6af4264141..699d96fb87f0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -6,12 +6,48 @@
  * this source tree.
  */
 
+#include "qed_cxt.h"
+#include "qed_hsi.h"
 #include "qed_hw.h"
+#include "qed_init_ops.h"
 #include "qed_int.h"
+#include "qed_mcp.h"
 #include "qed_reg_addr.h"
+#include "qed_sp.h"
 #include "qed_sriov.h"
 #include "qed_vf.h"
 
+/* IOV ramrods */
+static int qed_sp_vf_start(struct qed_hwfn *p_hwfn,
+			   u32 concrete_vfid, u16 opaque_vfid)
+{
+	struct vf_start_ramrod_data *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = opaque_vfid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_VF_START,
+				 PROTOCOLID_COMMON, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.vf_start;
+
+	p_ramrod->vf_id = GET_FIELD(concrete_vfid, PXP_CONCRETE_FID_VFID);
+	p_ramrod->opaque_fid = cpu_to_le16(opaque_vfid);
+
+	p_ramrod->personality = PERSONALITY_ETH;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 			   int rel_vf_id, bool b_enabled_only)
 {
@@ -321,6 +357,9 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 	int pos;
 	int rc;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	/* Learn the PCI configuration */
 	pos = pci_find_ext_capability(p_hwfn->cdev->pdev,
 				      PCI_EXT_CAP_ID_SRIOV);
@@ -376,12 +415,189 @@ static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
 		return false;
 
 	/* Check VF validity */
-	if (!qed_iov_is_valid_vfid(p_hwfn, vfid, true))
+	if (IS_VF(p_hwfn->cdev) || !IS_QED_SRIOV(p_hwfn->cdev) ||
+	    !IS_PF_SRIOV_ALLOC(p_hwfn))
 		return false;
 
 	return true;
 }
 
+static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
+				       struct qed_ptt *p_ptt, u8 abs_vfid)
+{
+	qed_wr(p_hwfn, p_ptt,
+	       PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR + (abs_vfid >> 5) * 4,
+	       1 << (abs_vfid & 0x1f));
+}
+
+static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt,
+				    struct qed_vf_info *vf)
+{
+	u32 igu_vf_conf = IGU_VF_CONF_FUNC_EN;
+	int rc;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "Enable internal access for vf %x [abs %x]\n",
+		   vf->abs_vf_id, QED_VF_ABS_ID(p_hwfn, vf));
+
+	qed_iov_vf_pglue_clear_err(p_hwfn, p_ptt, QED_VF_ABS_ID(p_hwfn, vf));
+
+	rc = qed_mcp_config_vf_msix(p_hwfn, p_ptt, vf->abs_vf_id, vf->num_sbs);
+	if (rc)
+		return rc;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid);
+
+	SET_FIELD(igu_vf_conf, IGU_VF_CONF_PARENT, p_hwfn->rel_pf_id);
+	STORE_RT_REG(p_hwfn, IGU_REG_VF_CONFIGURATION_RT_OFFSET, igu_vf_conf);
+
+	qed_init_run(p_hwfn, p_ptt, PHASE_VF, vf->abs_vf_id,
+		     p_hwfn->hw_info.hw_mode);
+
+	/* unpretend */
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+
+	if (vf->state != VF_STOPPED) {
+		DP_NOTICE(p_hwfn, "VF[%02x] is already started\n",
+			  vf->abs_vf_id);
+		return -EINVAL;
+	}
+
+	/* Start VF */
+	rc = qed_sp_vf_start(p_hwfn, vf->concrete_fid, vf->opaque_fid);
+	if (rc)
+		DP_NOTICE(p_hwfn, "Failed to start VF[%02x]\n", vf->abs_vf_id);
+
+	vf->state = VF_FREE;
+
+	return rc;
+}
+
+static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf, u16 num_rx_queues)
+{
+	struct qed_igu_block *igu_blocks;
+	int qid = 0, igu_id = 0;
+	u32 val = 0;
+
+	igu_blocks = p_hwfn->hw_info.p_igu_info->igu_map.igu_blocks;
+
+	if (num_rx_queues > p_hwfn->hw_info.p_igu_info->free_blks)
+		num_rx_queues = p_hwfn->hw_info.p_igu_info->free_blks;
+	p_hwfn->hw_info.p_igu_info->free_blks -= num_rx_queues;
+
+	SET_FIELD(val, IGU_MAPPING_LINE_FUNCTION_NUMBER, vf->abs_vf_id);
+	SET_FIELD(val, IGU_MAPPING_LINE_VALID, 1);
+	SET_FIELD(val, IGU_MAPPING_LINE_PF_VALID, 0);
+
+	while ((qid < num_rx_queues) &&
+	       (igu_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev))) {
+		if (igu_blocks[igu_id].status & QED_IGU_STATUS_FREE) {
+			struct cau_sb_entry sb_entry;
+
+			vf->igu_sbs[qid] = (u16)igu_id;
+			igu_blocks[igu_id].status &= ~QED_IGU_STATUS_FREE;
+
+			SET_FIELD(val, IGU_MAPPING_LINE_VECTOR_NUMBER, qid);
+
+			qed_wr(p_hwfn, p_ptt,
+			       IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_id,
+			       val);
+
+			/* Configure igu sb in CAU which were marked valid */
+			qed_init_cau_sb_entry(p_hwfn, &sb_entry,
+					      p_hwfn->rel_pf_id,
+					      vf->abs_vf_id, 1);
+			qed_dmae_host2grc(p_hwfn, p_ptt,
+					  (u64)(uintptr_t)&sb_entry,
+					  CAU_REG_SB_VAR_MEMORY +
+					  igu_id * sizeof(u64), 2, 0);
+			qid++;
+		}
+		igu_id++;
+	}
+
+	vf->num_sbs = (u8) num_rx_queues;
+
+	return vf->num_sbs;
+}
+
+static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  u16 rel_vf_id, u16 num_rx_queues)
+{
+	u8 num_of_vf_avaiable_chains = 0;
+	struct qed_vf_info *vf = NULL;
+	int rc = 0;
+	u32 cids;
+	u8 i;
+
+	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	if (!vf) {
+		DP_ERR(p_hwfn, "qed_iov_init_hw_for_vf : vf is NULL\n");
+		return -EINVAL;
+	}
+
+	if (vf->b_init) {
+		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n", rel_vf_id);
+		return -EINVAL;
+	}
+
+	/* Limit number of queues according to number of CIDs */
+	qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, &cids);
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF[%d] - requesting to initialize for 0x%04x queues [0x%04x CIDs available]\n",
+		   vf->relative_vf_id, num_rx_queues, (u16) cids);
+	num_rx_queues = min_t(u16, num_rx_queues, ((u16) cids));
+
+	num_of_vf_avaiable_chains = qed_iov_alloc_vf_igu_sbs(p_hwfn,
+							     p_ptt,
+							     vf,
+							     num_rx_queues);
+	if (!num_of_vf_avaiable_chains) {
+		DP_ERR(p_hwfn, "no available igu sbs\n");
+		return -ENOMEM;
+	}
+
+	/* Choose queue number and index ranges */
+	vf->num_rxqs = num_of_vf_avaiable_chains;
+	vf->num_txqs = num_of_vf_avaiable_chains;
+
+	for (i = 0; i < vf->num_rxqs; i++) {
+		u16 queue_id = qed_int_queue_id_from_sb_id(p_hwfn,
+							   vf->igu_sbs[i]);
+
+		if (queue_id > RESC_NUM(p_hwfn, QED_L2_QUEUE)) {
+			DP_NOTICE(p_hwfn,
+				  "VF[%d] will require utilizing of out-of-bounds queues - %04x\n",
+				  vf->relative_vf_id, queue_id);
+			return -EINVAL;
+		}
+
+		/* CIDs are per-VF, so no problem having them 0-based. */
+		vf->vf_queues[i].fw_rx_qid = queue_id;
+		vf->vf_queues[i].fw_tx_qid = queue_id;
+		vf->vf_queues[i].fw_cid = i;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF[%d] - [%d] SB %04x, Tx/Rx queue %04x CID %04x\n",
+			   vf->relative_vf_id, i, vf->igu_sbs[i], queue_id, i);
+	}
+	rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, vf);
+	if (!rc) {
+		vf->b_init = true;
+
+		if (IS_LEAD_HWFN(p_hwfn))
+			p_hwfn->cdev->p_iov_info->num_vfs++;
+	}
+
+	return rc;
+}
+
 static bool qed_iov_tlv_supported(u16 tlvtype)
 {
 	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
@@ -486,13 +702,147 @@ static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
 	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
 }
 
-static void qed_iov_process_mbx_dummy_resp(struct qed_hwfn *p_hwfn,
-					   struct qed_ptt *p_ptt,
-					   struct qed_vf_info *p_vf)
+static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf)
 {
-	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_NONE,
-			     sizeof(struct pfvf_def_resp_tlv),
-			     PFVF_STATUS_SUCCESS);
+	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
+	struct pfvf_acquire_resp_tlv *resp = &mbx->reply_virt->acquire_resp;
+	struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info;
+	struct vfpf_acquire_tlv *req = &mbx->req_virt->acquire;
+	u8 i, vfpf_status = PFVF_STATUS_SUCCESS;
+	struct pf_vf_resc *resc = &resp->resc;
+
+	/* Validate FW compatibility */
+	if (req->vfdev_info.fw_major != FW_MAJOR_VERSION ||
+	    req->vfdev_info.fw_minor != FW_MINOR_VERSION ||
+	    req->vfdev_info.fw_revision != FW_REVISION_VERSION ||
+	    req->vfdev_info.fw_engineering != FW_ENGINEERING_VERSION) {
+		DP_INFO(p_hwfn,
+			"VF[%d] is running an incompatible driver [VF needs FW %02x:%02x:%02x:%02x but Hypervisor is using %02x:%02x:%02x:%02x]\n",
+			vf->abs_vf_id,
+			req->vfdev_info.fw_major,
+			req->vfdev_info.fw_minor,
+			req->vfdev_info.fw_revision,
+			req->vfdev_info.fw_engineering,
+			FW_MAJOR_VERSION,
+			FW_MINOR_VERSION,
+			FW_REVISION_VERSION, FW_ENGINEERING_VERSION);
+		vfpf_status = PFVF_STATUS_NOT_SUPPORTED;
+		goto out;
+	}
+
+	/* On 100g PFs, prevent old VFs from loading */
+	if ((p_hwfn->cdev->num_hwfns > 1) &&
+	    !(req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_100G)) {
+		DP_INFO(p_hwfn,
+			"VF[%d] is running an old driver that doesn't support 100g\n",
+			vf->abs_vf_id);
+		vfpf_status = PFVF_STATUS_NOT_SUPPORTED;
+		goto out;
+	}
+
+	memset(resp, 0, sizeof(*resp));
+
+	/* Fill in vf info stuff */
+	vf->opaque_fid = req->vfdev_info.opaque_fid;
+	vf->num_mac_filters = 1;
+	vf->num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS;
+
+	vf->vf_bulletin = req->bulletin_addr;
+	vf->bulletin.size = (vf->bulletin.size < req->bulletin_size) ?
+			    vf->bulletin.size : req->bulletin_size;
+
+	/* fill in pfdev info */
+	pfdev_info->chip_num = p_hwfn->cdev->chip_num;
+	pfdev_info->db_size = 0;
+	pfdev_info->indices_per_sb = PIS_PER_SB;
+
+	pfdev_info->capabilities = PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED |
+				   PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE;
+	if (p_hwfn->cdev->num_hwfns > 1)
+		pfdev_info->capabilities |= PFVF_ACQUIRE_CAP_100G;
+
+	pfdev_info->stats_info.mstats.address =
+	    PXP_VF_BAR0_START_MSDM_ZONE_B +
+	    offsetof(struct mstorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.mstats.len =
+	    sizeof(struct eth_mstorm_per_queue_stat);
+
+	pfdev_info->stats_info.ustats.address =
+	    PXP_VF_BAR0_START_USDM_ZONE_B +
+	    offsetof(struct ustorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.ustats.len =
+	    sizeof(struct eth_ustorm_per_queue_stat);
+
+	pfdev_info->stats_info.pstats.address =
+	    PXP_VF_BAR0_START_PSDM_ZONE_B +
+	    offsetof(struct pstorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.pstats.len =
+	    sizeof(struct eth_pstorm_per_queue_stat);
+
+	pfdev_info->stats_info.tstats.address = 0;
+	pfdev_info->stats_info.tstats.len = 0;
+
+	memcpy(pfdev_info->port_mac, p_hwfn->hw_info.hw_mac_addr, ETH_ALEN);
+
+	pfdev_info->fw_major = FW_MAJOR_VERSION;
+	pfdev_info->fw_minor = FW_MINOR_VERSION;
+	pfdev_info->fw_rev = FW_REVISION_VERSION;
+	pfdev_info->fw_eng = FW_ENGINEERING_VERSION;
+	pfdev_info->os_type = VFPF_ACQUIRE_OS_LINUX;
+	qed_mcp_get_mfw_ver(p_hwfn, p_ptt, &pfdev_info->mfw_ver, NULL);
+
+	pfdev_info->dev_type = p_hwfn->cdev->type;
+	pfdev_info->chip_rev = p_hwfn->cdev->chip_rev;
+
+	resc->num_rxqs = vf->num_rxqs;
+	resc->num_txqs = vf->num_txqs;
+	resc->num_sbs = vf->num_sbs;
+	for (i = 0; i < resc->num_sbs; i++) {
+		resc->hw_sbs[i].hw_sb_id = vf->igu_sbs[i];
+		resc->hw_sbs[i].sb_qid = 0;
+	}
+
+	for (i = 0; i < resc->num_rxqs; i++) {
+		qed_fw_l2_queue(p_hwfn, vf->vf_queues[i].fw_rx_qid,
+				(u16 *)&resc->hw_qid[i]);
+		resc->cid[i] = vf->vf_queues[i].fw_cid;
+	}
+
+	resc->num_mac_filters = min_t(u8, vf->num_mac_filters,
+				      req->resc_request.num_mac_filters);
+	resc->num_vlan_filters = min_t(u8, vf->num_vlan_filters,
+				       req->resc_request.num_vlan_filters);
+
+	/* This isn't really required as VF isn't limited, but some VFs might
+	 * actually test this value, so need to provide it.
+	 */
+	resc->num_mc_filters = req->resc_request.num_mc_filters;
+
+	/* Fill agreed size of bulletin board in response */
+	resp->bulletin_size = vf->bulletin.size;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF[%d] ACQUIRE_RESPONSE: pfdev_info- chip_num=0x%x, db_size=%d, idx_per_sb=%d, pf_cap=0x%llx\n"
+		   "resources- n_rxq-%d, n_txq-%d, n_sbs-%d, n_macs-%d, n_vlans-%d\n",
+		   vf->abs_vf_id,
+		   resp->pfdev_info.chip_num,
+		   resp->pfdev_info.db_size,
+		   resp->pfdev_info.indices_per_sb,
+		   resp->pfdev_info.capabilities,
+		   resc->num_rxqs,
+		   resc->num_txqs,
+		   resc->num_sbs,
+		   resc->num_mac_filters,
+		   resc->num_vlan_filters);
+	vf->state = VF_ACQUIRED;
+
+	/* Prepare Response */
+out:
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_ACQUIRE,
+			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
@@ -517,7 +867,11 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 
 	/* check if tlv type is known */
 	if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
-		qed_iov_process_mbx_dummy_resp(p_hwfn, p_ptt, p_vf);
+		switch (mbx->first_tlv.tl.type) {
+		case CHANNEL_TLV_ACQUIRE:
+			qed_iov_vf_mbx_acquire(p_hwfn, p_ptt, p_vf);
+			break;
+		}
 	} else {
 		/* unknown TLV - this may belong to a VF driver from the future
 		 * - a version written after this PF driver was written, which
@@ -652,6 +1006,15 @@ void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag)
 	queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, 0);
 }
 
+void qed_vf_start_iov_wq(struct qed_dev *cdev)
+{
+	int i;
+
+	for_each_hwfn(cdev, i)
+	    queue_delayed_work(cdev->hwfns[i].iov_wq,
+			       &cdev->hwfns[i].iov_task, 0);
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 112216812a12..4f190d25ee14 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -21,6 +21,9 @@
 #endif
 #define IS_PF_SRIOV_ALLOC(p_hwfn)       (!!((p_hwfn)->pf_iov_info))
 
+#define QED_MAX_VF_CHAINS_PER_PF 16
+#define QED_ETH_VF_NUM_VLAN_FILTERS 2
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -60,7 +63,17 @@ struct qed_iov_vf_mbx {
 	struct vfpf_first_tlv first_tlv;
 };
 
+struct qed_vf_q_info {
+	u16 fw_rx_qid;
+	u16 fw_tx_qid;
+	u8 fw_cid;
+	u8 rxq_active;
+	u8 txq_active;
+};
+
 enum vf_state {
+	VF_FREE = 0,		/* VF ready to be acquired holds no resc */
+	VF_ACQUIRED,		/* VF, acquired, but not initalized */
 	VF_STOPPED		/* VF, Stopped */
 };
 
@@ -82,6 +95,17 @@ struct qed_vf_info {
 #define QED_VF_ABS_ID(p_hwfn, p_vf)	(QED_PATH_ID(p_hwfn) ?		      \
 					 (p_vf)->abs_vf_id + MAX_NUM_VFS_BB : \
 					 (p_vf)->abs_vf_id)
+
+	u8 num_rxqs;
+	u8 num_txqs;
+
+	u8 num_sbs;
+
+	u8 num_mac_filters;
+	u8 num_vlan_filters;
+	struct qed_vf_q_info vf_queues[QED_MAX_VF_CHAINS_PER_PF];
+	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
+
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
@@ -133,6 +157,26 @@ u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id);
  */
 int qed_iov_hw_info(struct qed_hwfn *p_hwfn);
 
+/**
+ * @brief qed_add_tlv - place a given tlv on the tlv buffer at next offset
+ *
+ * @param p_hwfn
+ * @param p_iov
+ * @param type
+ * @param length
+ *
+ * @return pointer to the newly placed tlv
+ */
+void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length);
+
+/**
+ * @brief list the types and lengths of the tlvs on the buffer
+ *
+ * @param p_hwfn
+ * @param tlvs_list
+ */
+void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list);
+
 /**
  * @brief qed_iov_alloc - allocate sriov related resources
  *
@@ -179,6 +223,7 @@ void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
 int qed_iov_wq_start(struct qed_dev *cdev);
 
 void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
+void qed_vf_start_iov_wq(struct qed_dev *cdev);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -228,6 +273,10 @@ static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
 				    enum qed_iov_wq_flag flag)
 {
 }
+
+static inline void qed_vf_start_iov_wq(struct qed_dev *cdev)
+{
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
new file mode 100644
index 000000000000..a3c8f4e1b9c1
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -0,0 +1,357 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include "qed.h"
+#include "qed_sriov.h"
+#include "qed_vf.h"
+
+static void *qed_vf_pf_prep(struct qed_hwfn *p_hwfn, u16 type, u16 length)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	void *p_tlv;
+
+	/* This lock is released when we receive PF's response
+	 * in qed_send_msg2pf().
+	 * So, qed_vf_pf_prep() and qed_send_msg2pf()
+	 * must come in sequence.
+	 */
+	mutex_lock(&(p_iov->mutex));
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "preparing to send 0x%04x tlv over vf pf channel\n",
+		   type);
+
+	/* Reset Requst offset */
+	p_iov->offset = (u8 *)p_iov->vf2pf_request;
+
+	/* Clear mailbox - both request and reply */
+	memset(p_iov->vf2pf_request, 0, sizeof(union vfpf_tlvs));
+	memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs));
+
+	/* Init type and length */
+	p_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, type, length);
+
+	/* Init first tlv header */
+	((struct vfpf_first_tlv *)p_tlv)->reply_address =
+	    (u64)p_iov->pf2vf_reply_phys;
+
+	return p_tlv;
+}
+
+static int qed_send_msg2pf(struct qed_hwfn *p_hwfn, u8 *done, u32 resp_size)
+{
+	union vfpf_tlvs *p_req = p_hwfn->vf_iov_info->vf2pf_request;
+	struct ustorm_trigger_vf_zone trigger;
+	struct ustorm_vf_zone *zone_data;
+	int rc = 0, time = 100;
+
+	zone_data = (struct ustorm_vf_zone *)PXP_VF_BAR0_START_USDM_ZONE_B;
+
+	/* output tlvs list */
+	qed_dp_tlv_list(p_hwfn, p_req);
+
+	/* need to add the END TLV to the message size */
+	resp_size += sizeof(struct channel_list_end_tlv);
+
+	/* Send TLVs over HW channel */
+	memset(&trigger, 0, sizeof(struct ustorm_trigger_vf_zone));
+	trigger.vf_pf_msg_valid = 1;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF -> PF [%02x] message: [%08x, %08x] --> %p, %08x --> %p\n",
+		   GET_FIELD(p_hwfn->hw_info.concrete_fid,
+			     PXP_CONCRETE_FID_PFID),
+		   upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys),
+		   lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys),
+		   &zone_data->non_trigger.vf_pf_msg_addr,
+		   *((u32 *)&trigger), &zone_data->trigger);
+
+	REG_WR(p_hwfn,
+	       (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.lo,
+	       lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys));
+
+	REG_WR(p_hwfn,
+	       (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.hi,
+	       upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys));
+
+	/* The message data must be written first, to prevent trigger before
+	 * data is written.
+	 */
+	wmb();
+
+	REG_WR(p_hwfn, (uintptr_t)&zone_data->trigger, *((u32 *)&trigger));
+
+	/* When PF would be done with the response, it would write back to the
+	 * `done' address. Poll until then.
+	 */
+	while ((!*done) && time) {
+		msleep(25);
+		time--;
+	}
+
+	if (!*done) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF <-- PF Timeout [Type %d]\n",
+			   p_req->first_tlv.tl.type);
+		rc = -EBUSY;
+		goto exit;
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "PF response: %d [Type %d]\n",
+			   *done, p_req->first_tlv.tl.type);
+	}
+
+exit:
+	mutex_unlock(&(p_hwfn->vf_iov_info->mutex));
+
+	return rc;
+}
+
+#define VF_ACQUIRE_THRESH 3
+#define VF_ACQUIRE_MAC_FILTERS 1
+
+static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_acquire_resp_tlv *resp = &p_iov->pf2vf_reply->acquire_resp;
+	struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info;
+	u8 rx_count = 1, tx_count = 1, num_sbs = 1;
+	u8 num_mac = VF_ACQUIRE_MAC_FILTERS;
+	bool resources_acquired = false;
+	struct vfpf_acquire_tlv *req;
+	int rc = 0, attempts = 0;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_ACQUIRE, sizeof(*req));
+
+	/* starting filling the request */
+	req->vfdev_info.opaque_fid = p_hwfn->hw_info.opaque_fid;
+
+	req->resc_request.num_rxqs = rx_count;
+	req->resc_request.num_txqs = tx_count;
+	req->resc_request.num_sbs = num_sbs;
+	req->resc_request.num_mac_filters = num_mac;
+	req->resc_request.num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS;
+
+	req->vfdev_info.os_type = VFPF_ACQUIRE_OS_LINUX;
+	req->vfdev_info.fw_major = FW_MAJOR_VERSION;
+	req->vfdev_info.fw_minor = FW_MINOR_VERSION;
+	req->vfdev_info.fw_revision = FW_REVISION_VERSION;
+	req->vfdev_info.fw_engineering = FW_ENGINEERING_VERSION;
+
+	/* Fill capability field with any non-deprecated config we support */
+	req->vfdev_info.capabilities |= VFPF_ACQUIRE_CAP_100G;
+
+	/* pf 2 vf bulletin board address */
+	req->bulletin_addr = p_iov->bulletin.phys;
+	req->bulletin_size = p_iov->bulletin.size;
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	while (!resources_acquired) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV, "attempting to acquire resources\n");
+
+		/* send acquire request */
+		rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+		if (rc)
+			return rc;
+
+		/* copy acquire response from buffer to p_hwfn */
+		memcpy(&p_iov->acquire_resp, resp, sizeof(p_iov->acquire_resp));
+
+		attempts++;
+
+		if (resp->hdr.status == PFVF_STATUS_SUCCESS) {
+			/* PF agrees to allocate our resources */
+			if (!(resp->pfdev_info.capabilities &
+			      PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE)) {
+				DP_INFO(p_hwfn,
+					"PF is using old incompatible driver; Either downgrade driver or request provider to update hypervisor version\n");
+				return -EINVAL;
+			}
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV, "resources acquired\n");
+			resources_acquired = true;
+		} else if (resp->hdr.status == PFVF_STATUS_NO_RESOURCE &&
+			   attempts < VF_ACQUIRE_THRESH) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "PF unwilling to fullfill resource request. Try PF recommended amount\n");
+
+			/* humble our request */
+			req->resc_request.num_txqs = resp->resc.num_txqs;
+			req->resc_request.num_rxqs = resp->resc.num_rxqs;
+			req->resc_request.num_sbs = resp->resc.num_sbs;
+			req->resc_request.num_mac_filters =
+			    resp->resc.num_mac_filters;
+			req->resc_request.num_vlan_filters =
+			    resp->resc.num_vlan_filters;
+
+			/* Clear response buffer */
+			memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs));
+		} else {
+			DP_ERR(p_hwfn,
+			       "PF returned error %d to VF acquisition request\n",
+			       resp->hdr.status);
+			return -EAGAIN;
+		}
+	}
+
+	/* Update bulletin board size with response from PF */
+	p_iov->bulletin.size = resp->bulletin_size;
+
+	/* get HW info */
+	p_hwfn->cdev->type = resp->pfdev_info.dev_type;
+	p_hwfn->cdev->chip_rev = resp->pfdev_info.chip_rev;
+
+	p_hwfn->cdev->chip_num = pfdev_info->chip_num & 0xffff;
+
+	/* Learn of the possibility of CMT */
+	if (IS_LEAD_HWFN(p_hwfn)) {
+		if (resp->pfdev_info.capabilities & PFVF_ACQUIRE_CAP_100G) {
+			DP_NOTICE(p_hwfn, "100g VF\n");
+			p_hwfn->cdev->num_hwfns = 2;
+		}
+	}
+
+	return 0;
+}
+
+int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov;
+	u32 reg;
+
+	/* Set number of hwfns - might be overriden once leading hwfn learns
+	 * actual configuration from PF.
+	 */
+	if (IS_LEAD_HWFN(p_hwfn))
+		p_hwfn->cdev->num_hwfns = 1;
+
+	/* Set the doorbell bar. Assumption: regview is set */
+	p_hwfn->doorbells = (u8 __iomem *)p_hwfn->regview +
+					  PXP_VF_BAR0_START_DQ;
+
+	reg = PXP_VF_BAR0_ME_OPAQUE_ADDRESS;
+	p_hwfn->hw_info.opaque_fid = (u16)REG_RD(p_hwfn, reg);
+
+	reg = PXP_VF_BAR0_ME_CONCRETE_ADDRESS;
+	p_hwfn->hw_info.concrete_fid = REG_RD(p_hwfn, reg);
+
+	/* Allocate vf sriov info */
+	p_iov = kzalloc(sizeof(*p_iov), GFP_KERNEL);
+	if (!p_iov) {
+		DP_NOTICE(p_hwfn, "Failed to allocate `struct qed_sriov'\n");
+		return -ENOMEM;
+	}
+
+	/* Allocate vf2pf msg */
+	p_iov->vf2pf_request = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						  sizeof(union vfpf_tlvs),
+						  &p_iov->vf2pf_request_phys,
+						  GFP_KERNEL);
+	if (!p_iov->vf2pf_request) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate `vf2pf_request' DMA memory\n");
+		goto free_p_iov;
+	}
+
+	p_iov->pf2vf_reply = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						sizeof(union pfvf_tlvs),
+						&p_iov->pf2vf_reply_phys,
+						GFP_KERNEL);
+	if (!p_iov->pf2vf_reply) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate `pf2vf_reply' DMA memory\n");
+		goto free_vf2pf_request;
+	}
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF's Request mailbox [%p virt 0x%llx phys], Response mailbox [%p virt 0x%llx phys]\n",
+		   p_iov->vf2pf_request,
+		   (u64) p_iov->vf2pf_request_phys,
+		   p_iov->pf2vf_reply, (u64)p_iov->pf2vf_reply_phys);
+
+	/* Allocate Bulletin board */
+	p_iov->bulletin.size = sizeof(struct qed_bulletin_content);
+	p_iov->bulletin.p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						    p_iov->bulletin.size,
+						    &p_iov->bulletin.phys,
+						    GFP_KERNEL);
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "VF's bulletin Board [%p virt 0x%llx phys 0x%08x bytes]\n",
+		   p_iov->bulletin.p_virt,
+		   (u64)p_iov->bulletin.phys, p_iov->bulletin.size);
+
+	mutex_init(&p_iov->mutex);
+
+	p_hwfn->vf_iov_info = p_iov;
+
+	p_hwfn->hw_info.personality = QED_PCI_ETH;
+
+	return qed_vf_pf_acquire(p_hwfn);
+
+free_vf2pf_request:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  sizeof(union vfpf_tlvs),
+			  p_iov->vf2pf_request, p_iov->vf2pf_request_phys);
+free_p_iov:
+	kfree(p_iov);
+
+	return -ENOMEM;
+}
+
+u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+
+	if (!p_iov) {
+		DP_NOTICE(p_hwfn, "vf_sriov_info isn't initialized\n");
+		return 0;
+	}
+
+	return p_iov->acquire_resp.resc.hw_sbs[sb_id].hw_sb_id;
+}
+
+void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
+{
+	*num_rxqs = p_hwfn->vf_iov_info->acquire_resp.resc.num_rxqs;
+}
+
+void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac)
+{
+	memcpy(port_mac,
+	       p_hwfn->vf_iov_info->acquire_resp.pfdev_info.port_mac, ETH_ALEN);
+}
+
+void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, u8 *num_vlan_filters)
+{
+	struct qed_vf_iov *p_vf;
+
+	p_vf = p_hwfn->vf_iov_info;
+	*num_vlan_filters = p_vf->acquire_resp.resc.num_vlan_filters;
+}
+
+void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+			   u16 *fw_major, u16 *fw_minor,
+			   u16 *fw_rev, u16 *fw_eng)
+{
+	struct pf_vf_pfdev_info *info;
+
+	info = &p_hwfn->vf_iov_info->acquire_resp.pfdev_info;
+
+	*fw_major = info->fw_major;
+	*fw_minor = info->fw_minor;
+	*fw_rev = info->fw_rev;
+	*fw_eng = info->fw_eng;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index f0d8de2be581..de9fe8501d21 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -9,6 +9,22 @@
 #ifndef _QED_VF_H
 #define _QED_VF_H
 
+struct vf_pf_resc_request {
+	u8 num_rxqs;
+	u8 num_txqs;
+	u8 num_sbs;
+	u8 num_mac_filters;
+	u8 num_vlan_filters;
+	u8 num_mc_filters;
+	u16 padding;
+};
+
+struct hw_sb_info {
+	u16 hw_sb_id;
+	u8 sb_qid;
+	u8 padding[5];
+};
+
 enum {
 	PFVF_STATUS_WAITING,
 	PFVF_STATUS_SUCCESS,
@@ -52,6 +68,107 @@ struct channel_list_end_tlv {
 	u8 padding[4];
 };
 
+#define VFPF_ACQUIRE_OS_LINUX (0)
+#define VFPF_ACQUIRE_OS_WINDOWS (1)
+#define VFPF_ACQUIRE_OS_ESX (2)
+#define VFPF_ACQUIRE_OS_SOLARIS (3)
+#define VFPF_ACQUIRE_OS_LINUX_USERSPACE (4)
+
+struct vfpf_acquire_tlv {
+	struct vfpf_first_tlv first_tlv;
+
+	struct vf_pf_vfdev_info {
+#define VFPF_ACQUIRE_CAP_OBSOLETE	(1 << 0)
+#define VFPF_ACQUIRE_CAP_100G		(1 << 1) /* VF can support 100g */
+		u64 capabilities;
+		u8 fw_major;
+		u8 fw_minor;
+		u8 fw_revision;
+		u8 fw_engineering;
+		u32 driver_version;
+		u16 opaque_fid;	/* ME register value */
+		u8 os_type;	/* VFPF_ACQUIRE_OS_* value */
+		u8 padding[5];
+	} vfdev_info;
+
+	struct vf_pf_resc_request resc_request;
+
+	u64 bulletin_addr;
+	u32 bulletin_size;
+	u32 padding;
+};
+
+struct pfvf_storm_stats {
+	u32 address;
+	u32 len;
+};
+
+struct pfvf_stats_info {
+	struct pfvf_storm_stats mstats;
+	struct pfvf_storm_stats pstats;
+	struct pfvf_storm_stats tstats;
+	struct pfvf_storm_stats ustats;
+};
+
+struct pfvf_acquire_resp_tlv {
+	struct pfvf_tlv hdr;
+
+	struct pf_vf_pfdev_info {
+		u32 chip_num;
+		u32 mfw_ver;
+
+		u16 fw_major;
+		u16 fw_minor;
+		u16 fw_rev;
+		u16 fw_eng;
+
+		u64 capabilities;
+#define PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED	BIT(0)
+#define PFVF_ACQUIRE_CAP_100G			BIT(1)	/* If set, 100g PF */
+/* There are old PF versions where the PF might mistakenly override the sanity
+ * mechanism [version-based] and allow a VF that can't be supported to pass
+ * the acquisition phase.
+ * To overcome this, PFs now indicate that they're past that point and the new
+ * VFs would fail probe on the older PFs that fail to do so.
+ */
+#define PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE	BIT(2)
+
+		u16 db_size;
+		u8 indices_per_sb;
+		u8 os_type;
+
+		/* These should match the PF's qed_dev values */
+		u16 chip_rev;
+		u8 dev_type;
+
+		u8 padding;
+
+		struct pfvf_stats_info stats_info;
+
+		u8 port_mac[ETH_ALEN];
+		u8 padding2[2];
+	} pfdev_info;
+
+	struct pf_vf_resc {
+#define PFVF_MAX_QUEUES_PER_VF		16
+#define PFVF_MAX_SBS_PER_VF		16
+		struct hw_sb_info hw_sbs[PFVF_MAX_SBS_PER_VF];
+		u8 hw_qid[PFVF_MAX_QUEUES_PER_VF];
+		u8 cid[PFVF_MAX_QUEUES_PER_VF];
+
+		u8 num_rxqs;
+		u8 num_txqs;
+		u8 num_sbs;
+		u8 num_mac_filters;
+		u8 num_vlan_filters;
+		u8 num_mc_filters;
+		u8 padding[2];
+	} resc;
+
+	u32 bulletin_size;
+	u32 padding;
+};
+
 #define TLV_BUFFER_SIZE                 1024
 struct tlv_buffer_size {
 	u8 tlv_buffer[TLV_BUFFER_SIZE];
@@ -59,12 +176,14 @@ struct tlv_buffer_size {
 
 union vfpf_tlvs {
 	struct vfpf_first_tlv first_tlv;
+	struct vfpf_acquire_tlv acquire;
 	struct channel_list_end_tlv list_end;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
 union pfvf_tlvs {
 	struct pfvf_def_resp_tlv default_resp;
+	struct pfvf_acquire_resp_tlv acquire_resp;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
@@ -86,8 +205,118 @@ struct qed_bulletin {
 
 enum {
 	CHANNEL_TLV_NONE,	/* ends tlv sequence */
+	CHANNEL_TLV_ACQUIRE,
 	CHANNEL_TLV_LIST_END,
 	CHANNEL_TLV_MAX
 };
 
+/* This data is held in the qed_hwfn structure for VFs only. */
+struct qed_vf_iov {
+	union vfpf_tlvs *vf2pf_request;
+	dma_addr_t vf2pf_request_phys;
+	union pfvf_tlvs *pf2vf_reply;
+	dma_addr_t pf2vf_reply_phys;
+
+	/* Should be taken whenever the mailbox buffers are accessed */
+	struct mutex mutex;
+	u8 *offset;
+
+	/* Bulletin Board */
+	struct qed_bulletin bulletin;
+	struct qed_bulletin_content bulletin_shadow;
+
+	/* we set aside a copy of the acquire response */
+	struct pfvf_acquire_resp_tlv acquire_resp;
+};
+
+#ifdef CONFIG_QED_SRIOV
+/**
+ * @brief Get number of Rx queues allocated for VF by qed
+ *
+ *  @param p_hwfn
+ *  @param num_rxqs - allocated RX queues
+ */
+void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs);
+
+/**
+ * @brief Get port mac address for VF
+ *
+ * @param p_hwfn
+ * @param port_mac - destination location for port mac
+ */
+void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac);
+
+/**
+ * @brief Get number of VLAN filters allocated for VF by qed
+ *
+ *  @param p_hwfn
+ *  @param num_rxqs - allocated VLAN filters
+ */
+void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
+				 u8 *num_vlan_filters);
+
+/**
+ * @brief Set firmware version information in dev_info from VFs acquire response tlv
+ *
+ * @param p_hwfn
+ * @param fw_major
+ * @param fw_minor
+ * @param fw_rev
+ * @param fw_eng
+ */
+void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+			   u16 *fw_major, u16 *fw_minor,
+			   u16 *fw_rev, u16 *fw_eng);
+
+/**
+ * @brief hw preparation for VF
+ *      sends ACQUIRE message
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief qed_vf_get_igu_sb_id - Get the IGU SB ID for a given
+ *        sb_id. For VFs igu sbs don't have to be contiguous
+ *
+ * @param p_hwfn
+ * @param sb_id
+ *
+ * @return INLINE u16
+ */
+u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+#else
+static inline void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
+{
+}
+
+static inline void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac)
+{
+}
+
+static inline void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
+					       u8 *num_vlan_filters)
+{
+}
+
+static inline void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+					 u16 *fw_major, u16 *fw_minor,
+					 u16 *fw_rev, u16 *fw_eng)
+{
+}
+
+static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
+static inline u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 075faa52eb48..2d5f2735dc0a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2283,8 +2283,9 @@ enum qede_probe_mode {
 };
 
 static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level,
-			enum qede_probe_mode mode)
+			bool is_vf, enum qede_probe_mode mode)
 {
+	struct qed_probe_params probe_params;
 	struct qed_slowpath_params params;
 	struct qed_dev_eth_info dev_info;
 	struct qede_dev *edev;
@@ -2294,8 +2295,12 @@ static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level,
 	if (unlikely(dp_level & QED_LEVEL_INFO))
 		pr_notice("Starting qede probe\n");
 
-	cdev = qed_ops->common->probe(pdev, QED_PROTOCOL_ETH,
-				      dp_module, dp_level);
+	memset(&probe_params, 0, sizeof(probe_params));
+	probe_params.protocol = QED_PROTOCOL_ETH;
+	probe_params.dp_module = dp_module;
+	probe_params.dp_level = dp_level;
+	probe_params.is_vf = is_vf;
+	cdev = qed_ops->common->probe(pdev, &probe_params);
 	if (!cdev) {
 		rc = -ENODEV;
 		goto err0;
@@ -2365,7 +2370,7 @@ static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	qede_config_debug(debug, &dp_module, &dp_level);
 
-	return __qede_probe(pdev, dp_module, dp_level,
+	return __qede_probe(pdev, dp_module, dp_level, false,
 			    QEDE_PROBE_NORMAL);
 }
 
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 8914d271ba73..3f14c7efe68f 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -285,6 +285,63 @@
 #define PXP_ILT_PAGE_SIZE_NUM_BITS_MIN	12
 #define PXP_ILT_BLOCK_FACTOR_MULTIPLIER	1024
 
+#define PXP_VF_BAR0_START_IGU                   0
+#define PXP_VF_BAR0_IGU_LENGTH                  0x3000
+#define PXP_VF_BAR0_END_IGU                     (PXP_VF_BAR0_START_IGU + \
+						 PXP_VF_BAR0_IGU_LENGTH - 1)
+
+#define PXP_VF_BAR0_START_DQ                    0x3000
+#define PXP_VF_BAR0_DQ_LENGTH                   0x200
+#define PXP_VF_BAR0_DQ_OPAQUE_OFFSET            0
+#define PXP_VF_BAR0_ME_OPAQUE_ADDRESS           (PXP_VF_BAR0_START_DQ +	\
+						 PXP_VF_BAR0_DQ_OPAQUE_OFFSET)
+#define PXP_VF_BAR0_ME_CONCRETE_ADDRESS         (PXP_VF_BAR0_ME_OPAQUE_ADDRESS \
+						 + 4)
+#define PXP_VF_BAR0_END_DQ                      (PXP_VF_BAR0_START_DQ +	\
+						 PXP_VF_BAR0_DQ_LENGTH - 1)
+
+#define PXP_VF_BAR0_START_TSDM_ZONE_B           0x3200
+#define PXP_VF_BAR0_SDM_LENGTH_ZONE_B           0x200
+#define PXP_VF_BAR0_END_TSDM_ZONE_B             (PXP_VF_BAR0_START_TSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_MSDM_ZONE_B           0x3400
+#define PXP_VF_BAR0_END_MSDM_ZONE_B             (PXP_VF_BAR0_START_MSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_USDM_ZONE_B           0x3600
+#define PXP_VF_BAR0_END_USDM_ZONE_B             (PXP_VF_BAR0_START_USDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_XSDM_ZONE_B           0x3800
+#define PXP_VF_BAR0_END_XSDM_ZONE_B             (PXP_VF_BAR0_START_XSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_YSDM_ZONE_B           0x3a00
+#define PXP_VF_BAR0_END_YSDM_ZONE_B             (PXP_VF_BAR0_START_YSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_PSDM_ZONE_B           0x3c00
+#define PXP_VF_BAR0_END_PSDM_ZONE_B             (PXP_VF_BAR0_START_PSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_SDM_ZONE_A            0x4000
+#define PXP_VF_BAR0_END_SDM_ZONE_A              0x10000
+
+#define PXP_VF_BAR0_GRC_WINDOW_LENGTH           32
+
 /* ILT Records */
 #define PXP_NUM_ILT_RECORDS_BB 7600
 #define PXP_NUM_ILT_RECORDS_K2 11000
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index d72c832a9397..76a6f168a190 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -140,6 +140,13 @@ struct qed_link_output {
 	u32	pause_config;
 };
 
+struct qed_probe_params {
+	enum qed_protocol protocol;
+	u32 dp_module;
+	u8 dp_level;
+	bool is_vf;
+};
+
 #define QED_DRV_VER_STR_SIZE 12
 struct qed_slowpath_params {
 	u32	int_mode;
@@ -207,8 +214,7 @@ struct qed_common_ops {
 	struct qed_selftest_ops *selftest;
 
 	struct qed_dev*	(*probe)(struct pci_dev *dev,
-				 enum qed_protocol protocol,
-				 u32 dp_module, u8 dp_level);
+				 struct qed_probe_params *params);
 
 	void		(*remove)(struct qed_dev *cdev);
 
-- 
cgit v1.2.3


From 0b55e27d563f493665693b494735574e68c3c5b9 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:15 +0300
Subject: qed: IOV configure and FLR

While previous patches have already added the necessary logic to probe
VFs as well as enabling them in the HW, this patch adds the ability to
support VF FLR & SRIOV disable.

It then wraps both flows together into the first IOV callback to be
provided to the protocol driver - `configure'. This would later to be used
to enable and disable SRIOV in the adapter.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c      |  17 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h  |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h      |  11 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c       |   7 +
 drivers/net/ethernet/qlogic/qed/qed_main.c     |   1 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.c      |  72 +++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h      |  12 +
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |  10 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h       |   1 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 660 +++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h    |  33 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c       |  97 ++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h       |  45 ++
 include/linux/qed/qed_eth_if.h                 |   4 +
 include/linux/qed/qed_iov_if.h                 |  20 +
 15 files changed, 983 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/qed/qed_iov_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 362e8db2b374..78e25cf6836f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -31,6 +31,7 @@
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 #include "qed_sriov.h"
+#include "qed_vf.h"
 
 /* API common to all protocols */
 enum BAR_ID {
@@ -420,8 +421,7 @@ void qed_resc_setup(struct qed_dev *cdev)
 #define FINAL_CLEANUP_POLL_CNT          (100)
 #define FINAL_CLEANUP_POLL_TIME         (10)
 int qed_final_cleanup(struct qed_hwfn *p_hwfn,
-		      struct qed_ptt *p_ptt,
-		      u16 id)
+		      struct qed_ptt *p_ptt, u16 id, bool is_vf)
 {
 	u32 command = 0, addr, count = FINAL_CLEANUP_POLL_CNT;
 	int rc = -EBUSY;
@@ -429,6 +429,9 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn,
 	addr = GTT_BAR0_MAP_REG_USDM_RAM +
 		USTORM_FLR_FINAL_ACK_OFFSET(p_hwfn->rel_pf_id);
 
+	if (is_vf)
+		id += 0x10;
+
 	command |= X_FINAL_CLEANUP_AGG_INT <<
 		SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT;
 	command |= 1 << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT;
@@ -663,7 +666,7 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);
 
 	/* Cleanup chip from previous driver if such remains exist */
-	rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id);
+	rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id, false);
 	if (rc != 0)
 		return rc;
 
@@ -880,7 +883,7 @@ int qed_hw_stop(struct qed_dev *cdev)
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Stopping hw/fw\n");
 
 		if (IS_VF(cdev)) {
-			/* To be implemented in a later patch */
+			qed_vf_pf_int_cleanup(p_hwfn);
 			continue;
 		}
 
@@ -989,7 +992,9 @@ int qed_hw_reset(struct qed_dev *cdev)
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
 		if (IS_VF(cdev)) {
-			/* Will be implemented in a later patch */
+			rc = qed_vf_pf_reset(p_hwfn);
+			if (rc)
+				return rc;
 			continue;
 		}
 
@@ -1590,7 +1595,7 @@ void qed_hw_remove(struct qed_dev *cdev)
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
 		if (IS_VF(cdev)) {
-			/* Will be implemented in a later patch */
+			qed_vf_pf_release(p_hwfn);
 			continue;
 		}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index f567371fe304..dde364d6f502 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -303,11 +303,11 @@ int qed_fw_rss_eng(struct qed_hwfn *p_hwfn,
  * @param p_hwfn
  * @param p_ptt
  * @param id - For PF, engine-relative. For VF, PF-relative.
+ * @param is_vf - true iff cleanup is made for a VF.
  *
  * @return int
  */
 int qed_final_cleanup(struct qed_hwfn *p_hwfn,
-		      struct qed_ptt *p_ptt,
-		      u16 id);
+		      struct qed_ptt *p_ptt, u16 id, bool is_vf);
 
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index c511106870d0..82b7727d090b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -30,7 +30,7 @@ enum common_event_opcode {
 	COMMON_EVENT_PF_START,
 	COMMON_EVENT_PF_STOP,
 	COMMON_EVENT_VF_START,
-	COMMON_EVENT_RESERVED2,
+	COMMON_EVENT_VF_STOP,
 	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
 	COMMON_EVENT_RESERVED5,
@@ -45,7 +45,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_PF_START /* PF Function Start Ramrod */,
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
 	COMMON_RAMROD_VF_START,
-	COMMON_RAMROD_RESERVED2,
+	COMMON_RAMROD_VF_STOP,
 	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
 	MAX_COMMON_RAMROD_CMD_ID
@@ -741,6 +741,13 @@ struct vf_start_ramrod_data {
 	u8 reserved[3];
 };
 
+struct vf_stop_ramrod_data {
+	u8 vf_id;
+	u8 reserved0;
+	__le16 reserved1;
+	__le32 reserved2;
+};
+
 struct atten_status_block {
 	__le32	atten_bits;
 	__le32	atten_ack;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 8bcbf92b776f..5978bb57f883 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2066,8 +2066,15 @@ static int qed_fp_cqe_completion(struct qed_dev *dev,
 				      cqe);
 }
 
+#ifdef CONFIG_QED_SRIOV
+extern const struct qed_iov_hv_ops qed_iov_ops_pass;
+#endif
+
 static const struct qed_eth_ops qed_eth_ops_pass = {
 	.common = &qed_common_ops_pass,
+#ifdef CONFIG_QED_SRIOV
+	.iov = &qed_iov_ops_pass,
+#endif
 	.fill_dev_info = &qed_fill_eth_dev_info,
 	.register_ops = &qed_register_eth_ops,
 	.vport_start = &qed_start_vport,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 898347bd2db7..e98610e5bf70 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -897,6 +897,7 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 
 	if (IS_PF(cdev)) {
 		qed_free_stream_mem(cdev);
+		qed_sriov_disable(cdev, true);
 
 		qed_nic_stop(cdev);
 		qed_slowpath_irq_free(cdev);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 83175007f616..2be943b91916 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -442,6 +442,75 @@ int qed_mcp_load_req(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static void qed_mcp_handle_vf_flr(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt)
+{
+	u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base,
+					PUBLIC_PATH);
+	u32 mfw_path_offsize = qed_rd(p_hwfn, p_ptt, addr);
+	u32 path_addr = SECTION_ADDR(mfw_path_offsize,
+				     QED_PATH_ID(p_hwfn));
+	u32 disabled_vfs[VF_MAX_STATIC / 32];
+	int i;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_SP,
+		   "Reading Disabled VF information from [offset %08x], path_addr %08x\n",
+		   mfw_path_offsize, path_addr);
+
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++) {
+		disabled_vfs[i] = qed_rd(p_hwfn, p_ptt,
+					 path_addr +
+					 offsetof(struct public_path,
+						  mcp_vf_disabled) +
+					 sizeof(u32) * i);
+		DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV),
+			   "FLR-ed VFs [%08x,...,%08x] - %08x\n",
+			   i * 32, (i + 1) * 32 - 1, disabled_vfs[i]);
+	}
+
+	if (qed_iov_mark_vf_flr(p_hwfn, disabled_vfs))
+		qed_schedule_iov(p_hwfn, QED_IOV_WQ_FLR_FLAG);
+}
+
+int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt, u32 *vfs_to_ack)
+{
+	u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base,
+					PUBLIC_FUNC);
+	u32 mfw_func_offsize = qed_rd(p_hwfn, p_ptt, addr);
+	u32 func_addr = SECTION_ADDR(mfw_func_offsize,
+				     MCP_PF_ID(p_hwfn));
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data union_data;
+	int rc;
+	int i;
+
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV),
+			   "Acking VFs [%08x,...,%08x] - %08x\n",
+			   i * 32, (i + 1) * 32 - 1, vfs_to_ack[i]);
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = DRV_MSG_CODE_VF_DISABLED_DONE;
+	memcpy(&union_data.ack_vf_disabled, vfs_to_ack, VF_MAX_STATIC / 8);
+	mb_params.p_data_src = &union_data;
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc) {
+		DP_NOTICE(p_hwfn, "Failed to pass ACK for VF flr to MFW\n");
+		return -EBUSY;
+	}
+
+	/* Clear the ACK bits */
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		qed_wr(p_hwfn, p_ptt,
+		       func_addr +
+		       offsetof(struct public_func, drv_ack_vf_disabled) +
+		       i * sizeof(u32), 0);
+
+	return rc;
+}
+
 static void qed_mcp_handle_transceiver_change(struct qed_hwfn *p_hwfn,
 					      struct qed_ptt *p_ptt)
 {
@@ -753,6 +822,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_LINK_CHANGE:
 			qed_mcp_handle_link_change(p_hwfn, p_ptt, false);
 			break;
+		case MFW_DRV_MSG_VF_DISABLED:
+			qed_mcp_handle_vf_flr(p_hwfn, p_ptt);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index e3d5cdfe8e8d..6dd59eb7f4c6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -392,6 +392,18 @@ int qed_mcp_load_req(struct qed_hwfn *p_hwfn,
 void qed_mcp_read_mb(struct qed_hwfn *p_hwfn,
 		     struct qed_ptt *p_ptt);
 
+/**
+ * @brief Ack to mfw that driver finished FLR process for VFs
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vfs_to_ack - bit mask of all engine VFs for which the PF acks.
+ *
+ * @param return int - 0 upon success.
+ */
+int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt, u32 *vfs_to_ack);
+
 /**
  * @brief - calls during init to read shmem of all function-related info.
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index a508b6b7f1d4..80a621754f13 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -41,6 +41,8 @@
 	0x2aa16cUL
 #define PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR \
 	0x2aa118UL
+#define PSWHST_REG_ZONE_PERMISSION_TABLE \
+	0x2a0800UL
 #define  BAR0_MAP_REG_MSDM_RAM \
 	0x1d00000UL
 #define  BAR0_MAP_REG_USDM_RAM \
@@ -79,6 +81,8 @@
 	0x2f2eb0UL
 #define  DORQ_REG_PF_DB_ENABLE \
 	0x100508UL
+#define DORQ_REG_VF_USAGE_CNT \
+	0x1009c4UL
 #define  QM_REG_PF_EN \
 	0x2f2ea4UL
 #define  TCFC_REG_STRONG_ENABLE_PF \
@@ -167,6 +171,10 @@
 	0x040200UL
 #define  PBF_REG_INIT \
 	0xd80000UL
+#define PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 \
+	0xd806c8UL
+#define PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 \
+	0xd806ccUL
 #define  PTU_REG_ATC_INIT_ARRAY \
 	0x560000UL
 #define  PCM_REG_INIT \
@@ -391,6 +399,8 @@
 	0x1d0000UL
 #define  IGU_REG_PF_CONFIGURATION \
 	0x180800UL
+#define IGU_REG_VF_CONFIGURATION \
+	0x180804UL
 #define  MISC_REG_AEU_ENABLE1_IGU_OUT_0 \
 	0x00849cUL
 #define MISC_REG_AEU_AFTER_INVERT_1_IGU	\
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index dde69090379f..c2999cb5d1e2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -64,6 +64,7 @@ union ramrod_data {
 	struct vport_filter_update_ramrod_data vport_filter_update;
 
 	struct vf_start_ramrod_data vf_start;
+	struct vf_stop_ramrod_data vf_stop;
 };
 
 #define EQ_MAX_CREDIT   0xffffffff
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 699d96fb87f0..750166db57cd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -6,6 +6,7 @@
  * this source tree.
  */
 
+#include <linux/qed/qed_iov_if.h>
 #include "qed_cxt.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -48,6 +49,33 @@ static int qed_sp_vf_start(struct qed_hwfn *p_hwfn,
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+static int qed_sp_vf_stop(struct qed_hwfn *p_hwfn,
+			  u32 concrete_vfid, u16 opaque_vfid)
+{
+	struct vf_stop_ramrod_data *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = opaque_vfid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_VF_STOP,
+				 PROTOCOLID_COMMON, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.vf_stop;
+
+	p_ramrod->vf_id = GET_FIELD(concrete_vfid, PXP_CONCRETE_FID_VFID);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 			   int rel_vf_id, bool b_enabled_only)
 {
@@ -422,6 +450,34 @@ static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
 	return true;
 }
 
+static void qed_iov_set_vf_to_disable(struct qed_dev *cdev,
+				      u16 rel_vf_id, u8 to_disable)
+{
+	struct qed_vf_info *vf;
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+		if (!vf)
+			continue;
+
+		vf->to_disable = to_disable;
+	}
+}
+
+void qed_iov_set_vfs_to_disable(struct qed_dev *cdev, u8 to_disable)
+{
+	u16 i;
+
+	if (!IS_QED_SRIOV(cdev))
+		return;
+
+	for (i = 0; i < cdev->p_iov_info->total_vfs; i++)
+		qed_iov_set_vf_to_disable(cdev, i, to_disable);
+}
+
 static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt, u8 abs_vfid)
 {
@@ -430,6 +486,27 @@ static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
 	       1 << (abs_vfid & 0x1f));
 }
 
+static void qed_iov_vf_igu_set_int(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf, bool enable)
+{
+	u32 igu_vf_conf;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid);
+
+	igu_vf_conf = qed_rd(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION);
+
+	if (enable)
+		igu_vf_conf |= IGU_VF_CONF_MSI_MSIX_EN;
+	else
+		igu_vf_conf &= ~IGU_VF_CONF_MSI_MSIX_EN;
+
+	qed_wr(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION, igu_vf_conf);
+
+	/* unpretend */
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+}
+
 static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt,
 				    struct qed_vf_info *vf)
@@ -437,6 +514,9 @@ static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 	u32 igu_vf_conf = IGU_VF_CONF_FUNC_EN;
 	int rc;
 
+	if (vf->to_disable)
+		return 0;
+
 	DP_VERBOSE(p_hwfn,
 		   QED_MSG_IOV,
 		   "Enable internal access for vf %x [abs %x]\n",
@@ -475,6 +555,36 @@ static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+/**
+ * @brief qed_iov_config_perm_table - configure the permission
+ *      zone table.
+ *      In E4, queue zone permission table size is 320x9. There
+ *      are 320 VF queues for single engine device (256 for dual
+ *      engine device), and each entry has the following format:
+ *      {Valid, VF[7:0]}
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vf
+ * @param enable
+ */
+static void qed_iov_config_perm_table(struct qed_hwfn *p_hwfn,
+				      struct qed_ptt *p_ptt,
+				      struct qed_vf_info *vf, u8 enable)
+{
+	u32 reg_addr, val;
+	u16 qzone_id = 0;
+	int qid;
+
+	for (qid = 0; qid < vf->num_rxqs; qid++) {
+		qed_fw_l2_queue(p_hwfn, vf->vf_queues[qid].fw_rx_qid,
+				&qzone_id);
+
+		reg_addr = PSWHST_REG_ZONE_PERMISSION_TABLE + qzone_id * 4;
+		val = enable ? (vf->abs_vf_id | (1 << 8)) : 0;
+		qed_wr(p_hwfn, p_ptt, reg_addr, val);
+	}
+}
+
 static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 				   struct qed_ptt *p_ptt,
 				   struct qed_vf_info *vf, u16 num_rx_queues)
@@ -525,6 +635,32 @@ static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 	return vf->num_sbs;
 }
 
+static void qed_iov_free_vf_igu_sbs(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt,
+				    struct qed_vf_info *vf)
+{
+	struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info;
+	int idx, igu_id;
+	u32 addr, val;
+
+	/* Invalidate igu CAM lines and mark them as free */
+	for (idx = 0; idx < vf->num_sbs; idx++) {
+		igu_id = vf->igu_sbs[idx];
+		addr = IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_id;
+
+		val = qed_rd(p_hwfn, p_ptt, addr);
+		SET_FIELD(val, IGU_MAPPING_LINE_VALID, 0);
+		qed_wr(p_hwfn, p_ptt, addr, val);
+
+		p_info->igu_map.igu_blocks[igu_id].status |=
+		    QED_IGU_STATUS_FREE;
+
+		p_hwfn->hw_info.p_igu_info->free_blks++;
+	}
+
+	vf->num_sbs = 0;
+}
+
 static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 				  struct qed_ptt *p_ptt,
 				  u16 rel_vf_id, u16 num_rx_queues)
@@ -598,6 +734,54 @@ static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+static int qed_iov_release_hw_for_vf(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, u16 rel_vf_id)
+{
+	struct qed_vf_info *vf = NULL;
+	int rc = 0;
+
+	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!vf) {
+		DP_ERR(p_hwfn, "qed_iov_release_hw_for_vf : vf is NULL\n");
+		return -EINVAL;
+	}
+
+	if (vf->state != VF_STOPPED) {
+		/* Stopping the VF */
+		rc = qed_sp_vf_stop(p_hwfn, vf->concrete_fid, vf->opaque_fid);
+
+		if (rc != 0) {
+			DP_ERR(p_hwfn, "qed_sp_vf_stop returned error %d\n",
+			       rc);
+			return rc;
+		}
+
+		vf->state = VF_STOPPED;
+	}
+
+	/* disablng interrupts and resetting permission table was done during
+	 * vf-close, however, we could get here without going through vf_close
+	 */
+	/* Disable Interrupts for VF */
+	qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0);
+
+	/* Reset Permission table */
+	qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0);
+
+	vf->num_rxqs = 0;
+	vf->num_txqs = 0;
+	qed_iov_free_vf_igu_sbs(p_hwfn, p_ptt, vf);
+
+	if (vf->b_init) {
+		vf->b_init = false;
+
+		if (IS_LEAD_HWFN(p_hwfn))
+			p_hwfn->cdev->p_iov_info->num_vfs--;
+	}
+
+	return 0;
+}
+
 static bool qed_iov_tlv_supported(u16 tlvtype)
 {
 	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
@@ -702,6 +886,51 @@ static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
 	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
 }
 
+struct qed_public_vf_info *qed_iov_get_public_vf_info(struct qed_hwfn *p_hwfn,
+						      u16 relative_vf_id,
+						      bool b_enabled_only)
+{
+	struct qed_vf_info *vf = NULL;
+
+	vf = qed_iov_get_vf_info(p_hwfn, relative_vf_id, b_enabled_only);
+	if (!vf)
+		return NULL;
+
+	return &vf->p_vf_info;
+}
+
+void qed_iov_clean_vf(struct qed_hwfn *p_hwfn, u8 vfid)
+{
+	struct qed_public_vf_info *vf_info;
+
+	vf_info = qed_iov_get_public_vf_info(p_hwfn, vfid, false);
+
+	if (!vf_info)
+		return;
+
+	/* Clear the VF mac */
+	memset(vf_info->mac, 0, ETH_ALEN);
+}
+
+static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
+			       struct qed_vf_info *p_vf)
+{
+	u32 i;
+
+	p_vf->vf_bulletin = 0;
+	p_vf->num_mac_filters = 0;
+	p_vf->num_vlan_filters = 0;
+
+	/* If VF previously requested less resources, go back to default */
+	p_vf->num_rxqs = p_vf->num_sbs;
+	p_vf->num_txqs = p_vf->num_sbs;
+
+	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
+		p_vf->vf_queues[i].rxq_active = 0;
+
+	qed_iov_clean_vf(p_hwfn, p_vf->relative_vf_id);
+}
+
 static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn,
 				   struct qed_ptt *p_ptt,
 				   struct qed_vf_info *vf)
@@ -845,6 +1074,271 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static void qed_iov_vf_mbx_int_cleanup(struct qed_hwfn *p_hwfn,
+				       struct qed_ptt *p_ptt,
+				       struct qed_vf_info *vf)
+{
+	int i;
+
+	/* Reset the SBs */
+	for (i = 0; i < vf->num_sbs; i++)
+		qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt,
+						vf->igu_sbs[i],
+						vf->opaque_fid, false);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_INT_CLEANUP,
+			     sizeof(struct pfvf_def_resp_tlv),
+			     PFVF_STATUS_SUCCESS);
+}
+
+static void qed_iov_vf_mbx_close(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt, struct qed_vf_info *vf)
+{
+	u16 length = sizeof(struct pfvf_def_resp_tlv);
+	u8 status = PFVF_STATUS_SUCCESS;
+
+	/* Disable Interrupts for VF */
+	qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0);
+
+	/* Reset Permission table */
+	qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_CLOSE,
+			     length, status);
+}
+
+static void qed_iov_vf_mbx_release(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *p_vf)
+{
+	u16 length = sizeof(struct pfvf_def_resp_tlv);
+
+	qed_iov_vf_cleanup(p_hwfn, p_vf);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_RELEASE,
+			     length, PFVF_STATUS_SUCCESS);
+}
+
+static int
+qed_iov_vf_flr_poll_dorq(struct qed_hwfn *p_hwfn,
+			 struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	int cnt;
+	u32 val;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_vf->concrete_fid);
+
+	for (cnt = 0; cnt < 50; cnt++) {
+		val = qed_rd(p_hwfn, p_ptt, DORQ_REG_VF_USAGE_CNT);
+		if (!val)
+			break;
+		msleep(20);
+	}
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+
+	if (cnt == 50) {
+		DP_ERR(p_hwfn,
+		       "VF[%d] - dorq failed to cleanup [usage 0x%08x]\n",
+		       p_vf->abs_vf_id, val);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int
+qed_iov_vf_flr_poll_pbf(struct qed_hwfn *p_hwfn,
+			struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	u32 cons[MAX_NUM_VOQS], distance[MAX_NUM_VOQS];
+	int i, cnt;
+
+	/* Read initial consumers & producers */
+	for (i = 0; i < MAX_NUM_VOQS; i++) {
+		u32 prod;
+
+		cons[i] = qed_rd(p_hwfn, p_ptt,
+				 PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 +
+				 i * 0x40);
+		prod = qed_rd(p_hwfn, p_ptt,
+			      PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 +
+			      i * 0x40);
+		distance[i] = prod - cons[i];
+	}
+
+	/* Wait for consumers to pass the producers */
+	i = 0;
+	for (cnt = 0; cnt < 50; cnt++) {
+		for (; i < MAX_NUM_VOQS; i++) {
+			u32 tmp;
+
+			tmp = qed_rd(p_hwfn, p_ptt,
+				     PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 +
+				     i * 0x40);
+			if (distance[i] > tmp - cons[i])
+				break;
+		}
+
+		if (i == MAX_NUM_VOQS)
+			break;
+
+		msleep(20);
+	}
+
+	if (cnt == 50) {
+		DP_ERR(p_hwfn, "VF[%d] - pbf polling failed on VOQ %d\n",
+		       p_vf->abs_vf_id, i);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int qed_iov_vf_flr_poll(struct qed_hwfn *p_hwfn,
+			       struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	int rc;
+
+	rc = qed_iov_vf_flr_poll_dorq(p_hwfn, p_vf, p_ptt);
+	if (rc)
+		return rc;
+
+	rc = qed_iov_vf_flr_poll_pbf(p_hwfn, p_vf, p_ptt);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int
+qed_iov_execute_vf_flr_cleanup(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt,
+			       u16 rel_vf_id, u32 *ack_vfs)
+{
+	struct qed_vf_info *p_vf;
+	int rc = 0;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	if (!p_vf)
+		return 0;
+
+	if (p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &
+	    (1ULL << (rel_vf_id % 64))) {
+		u16 vfid = p_vf->abs_vf_id;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF[%d] - Handling FLR\n", vfid);
+
+		qed_iov_vf_cleanup(p_hwfn, p_vf);
+
+		/* If VF isn't active, no need for anything but SW */
+		if (!p_vf->b_init)
+			goto cleanup;
+
+		rc = qed_iov_vf_flr_poll(p_hwfn, p_vf, p_ptt);
+		if (rc)
+			goto cleanup;
+
+		rc = qed_final_cleanup(p_hwfn, p_ptt, vfid, true);
+		if (rc) {
+			DP_ERR(p_hwfn, "Failed handle FLR of VF[%d]\n", vfid);
+			return rc;
+		}
+
+		/* VF_STOPPED has to be set only after final cleanup
+		 * but prior to re-enabling the VF.
+		 */
+		p_vf->state = VF_STOPPED;
+
+		rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, p_vf);
+		if (rc) {
+			DP_ERR(p_hwfn, "Failed to re-enable VF[%d] acces\n",
+			       vfid);
+			return rc;
+		}
+cleanup:
+		/* Mark VF for ack and clean pending state */
+		if (p_vf->state == VF_RESET)
+			p_vf->state = VF_STOPPED;
+		ack_vfs[vfid / 32] |= (1 << (vfid % 32));
+		p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &=
+		    ~(1ULL << (rel_vf_id % 64));
+		p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &=
+		    ~(1ULL << (rel_vf_id % 64));
+	}
+
+	return rc;
+}
+
+int qed_iov_vf_flr_cleanup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 ack_vfs[VF_MAX_STATIC / 32];
+	int rc = 0;
+	u16 i;
+
+	memset(ack_vfs, 0, sizeof(u32) * (VF_MAX_STATIC / 32));
+
+	/* Since BRB <-> PRS interface can't be tested as part of the flr
+	 * polling due to HW limitations, simply sleep a bit. And since
+	 * there's no need to wait per-vf, do it before looping.
+	 */
+	msleep(100);
+
+	for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++)
+		qed_iov_execute_vf_flr_cleanup(p_hwfn, p_ptt, i, ack_vfs);
+
+	rc = qed_mcp_ack_vf_flr(p_hwfn, p_ptt, ack_vfs);
+	return rc;
+}
+
+int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *p_disabled_vfs)
+{
+	u16 i, found = 0;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV, "Marking FLR-ed VFs\n");
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "[%08x,...,%08x]: %08x\n",
+			   i * 32, (i + 1) * 32 - 1, p_disabled_vfs[i]);
+
+	if (!p_hwfn->cdev->p_iov_info) {
+		DP_NOTICE(p_hwfn, "VF flr but no IOV\n");
+		return 0;
+	}
+
+	/* Mark VFs */
+	for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++) {
+		struct qed_vf_info *p_vf;
+		u8 vfid;
+
+		p_vf = qed_iov_get_vf_info(p_hwfn, i, false);
+		if (!p_vf)
+			continue;
+
+		vfid = p_vf->abs_vf_id;
+		if ((1 << (vfid % 32)) & p_disabled_vfs[vfid / 32]) {
+			u64 *p_flr = p_hwfn->pf_iov_info->pending_flr;
+			u16 rel_vf_id = p_vf->relative_vf_id;
+
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+				   "VF[%d] [rel %d] got FLR-ed\n",
+				   vfid, rel_vf_id);
+
+			p_vf->state = VF_RESET;
+
+			/* No need to lock here, since pending_flr should
+			 * only change here and before ACKing MFw. Since
+			 * MFW will not trigger an additional attention for
+			 * VF flr until ACKs, we're safe.
+			 */
+			p_flr[rel_vf_id / 64] |= 1ULL << (rel_vf_id % 64);
+			found = 1;
+		}
+	}
+
+	return found;
+}
+
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt, int vfid)
 {
@@ -871,6 +1365,15 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 		case CHANNEL_TLV_ACQUIRE:
 			qed_iov_vf_mbx_acquire(p_hwfn, p_ptt, p_vf);
 			break;
+		case CHANNEL_TLV_CLOSE:
+			qed_iov_vf_mbx_close(p_hwfn, p_ptt, p_vf);
+			break;
+		case CHANNEL_TLV_INT_CLEANUP:
+			qed_iov_vf_mbx_int_cleanup(p_hwfn, p_ptt, p_vf);
+			break;
+		case CHANNEL_TLV_RELEASE:
+			qed_iov_vf_mbx_release(p_hwfn, p_ptt, p_vf);
+			break;
 		}
 	} else {
 		/* unknown TLV - this may belong to a VF driver from the future
@@ -992,6 +1495,17 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *p_vf_info;
+
+	p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf_info)
+		return true;
+
+	return p_vf_info->state == VF_STOPPED;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -1015,6 +1529,132 @@ void qed_vf_start_iov_wq(struct qed_dev *cdev)
 			       &cdev->hwfns[i].iov_task, 0);
 }
 
+int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
+{
+	int i, j;
+
+	for_each_hwfn(cdev, i)
+	    if (cdev->hwfns[i].iov_wq)
+		flush_workqueue(cdev->hwfns[i].iov_wq);
+
+	/* Mark VFs for disablement */
+	qed_iov_set_vfs_to_disable(cdev, true);
+
+	if (cdev->p_iov_info && cdev->p_iov_info->num_vfs && pci_enabled)
+		pci_disable_sriov(cdev->pdev);
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+
+		/* Failure to acquire the ptt in 100g creates an odd error
+		 * where the first engine has already relased IOV.
+		 */
+		if (!ptt) {
+			DP_ERR(hwfn, "Failed to acquire ptt\n");
+			return -EBUSY;
+		}
+
+		qed_for_each_vf(hwfn, j) {
+			int k;
+
+			if (!qed_iov_is_valid_vfid(hwfn, j, true))
+				continue;
+
+			/* Wait until VF is disabled before releasing */
+			for (k = 0; k < 100; k++) {
+				if (!qed_iov_is_vf_stopped(hwfn, j))
+					msleep(20);
+				else
+					break;
+			}
+
+			if (k < 100)
+				qed_iov_release_hw_for_vf(&cdev->hwfns[i],
+							  ptt, j);
+			else
+				DP_ERR(hwfn,
+				       "Timeout waiting for VF's FLR to end\n");
+		}
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
+	qed_iov_set_vfs_to_disable(cdev, false);
+
+	return 0;
+}
+
+static int qed_sriov_enable(struct qed_dev *cdev, int num)
+{
+	struct qed_sb_cnt_info sb_cnt_info;
+	int i, j, rc;
+
+	if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) {
+		DP_NOTICE(cdev, "Can start at most %d VFs\n",
+			  RESC_NUM(&cdev->hwfns[0], QED_VPORT) - 1);
+		return -EINVAL;
+	}
+
+	/* Initialize HW for VF access */
+	for_each_hwfn(cdev, j) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[j];
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+		int num_sbs = 0, limit = 16;
+
+		if (!ptt) {
+			DP_ERR(hwfn, "Failed to acquire ptt\n");
+			rc = -EBUSY;
+			goto err;
+		}
+
+		memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+		qed_int_get_num_sbs(hwfn, &sb_cnt_info);
+		num_sbs = min_t(int, sb_cnt_info.sb_free_blk, limit);
+
+		for (i = 0; i < num; i++) {
+			if (!qed_iov_is_valid_vfid(hwfn, i, false))
+				continue;
+
+			rc = qed_iov_init_hw_for_vf(hwfn,
+						    ptt, i, num_sbs / num);
+			if (rc) {
+				DP_ERR(cdev, "Failed to enable VF[%d]\n", i);
+				qed_ptt_release(hwfn, ptt);
+				goto err;
+			}
+		}
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
+	/* Enable SRIOV PCIe functions */
+	rc = pci_enable_sriov(cdev->pdev, num);
+	if (rc) {
+		DP_ERR(cdev, "Failed to enable sriov [%d]\n", rc);
+		goto err;
+	}
+
+	return num;
+
+err:
+	qed_sriov_disable(cdev, false);
+	return rc;
+}
+
+static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
+{
+	if (!IS_QED_SRIOV(cdev)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV, "SR-IOV is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (num_vfs_param)
+		return qed_sriov_enable(cdev, num_vfs_param);
+	else
+		return qed_sriov_disable(cdev, true);
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
@@ -1058,10 +1698,26 @@ void qed_iov_pf_task(struct work_struct *work)
 {
 	struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn,
 					     iov_task.work);
+	int rc;
 
 	if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags))
 		return;
 
+	if (test_and_clear_bit(QED_IOV_WQ_FLR_FLAG, &hwfn->iov_task_flags)) {
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+
+		if (!ptt) {
+			qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG);
+			return;
+		}
+
+		rc = qed_iov_vf_flr_cleanup(hwfn, ptt);
+		if (rc)
+			qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG);
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
 	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
 		qed_handle_vf_msg(hwfn);
 }
@@ -1112,3 +1768,7 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 	return 0;
 }
+
+const struct qed_iov_hv_ops qed_iov_ops_pass = {
+	.configure = &qed_sriov_configure,
+};
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 4f190d25ee14..10794b08fd21 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -24,6 +24,13 @@
 #define QED_MAX_VF_CHAINS_PER_PF 16
 #define QED_ETH_VF_NUM_VLAN_FILTERS 2
 
+struct qed_public_vf_info {
+	/* These copies will later be reflected in the bulletin board,
+	 * but this copy should be newer.
+	 */
+	u8 mac[ETH_ALEN];
+};
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -74,6 +81,7 @@ struct qed_vf_q_info {
 enum vf_state {
 	VF_FREE = 0,		/* VF ready to be acquired holds no resc */
 	VF_ACQUIRED,		/* VF, acquired, but not initalized */
+	VF_RESET,		/* VF, FLR'd, pending cleanup */
 	VF_STOPPED		/* VF, Stopped */
 };
 
@@ -82,6 +90,7 @@ struct qed_vf_info {
 	struct qed_iov_vf_mbx vf_mbx;
 	enum vf_state state;
 	bool b_init;
+	u8 to_disable;
 
 	struct qed_bulletin bulletin;
 	dma_addr_t vf_bulletin;
@@ -105,7 +114,7 @@ struct qed_vf_info {
 	u8 num_vlan_filters;
 	struct qed_vf_q_info vf_queues[QED_MAX_VF_CHAINS_PER_PF];
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
-
+	struct qed_public_vf_info p_vf_info;
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
@@ -219,11 +228,22 @@ void qed_iov_free_hw_info(struct qed_dev *cdev);
 int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
 			u8 opcode, __le16 echo, union event_ring_data *data);
 
+/**
+ * @brief Mark structs of vfs that have been FLR-ed.
+ *
+ * @param p_hwfn
+ * @param disabled_vfs - bitmask of all VFs on path that were FLRed
+ *
+ * @return 1 iff one of the PF's vfs got FLRed. 0 otherwise.
+ */
+int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *disabled_vfs);
+
 void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
 int qed_iov_wq_start(struct qed_dev *cdev);
 
 void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
 void qed_vf_start_iov_wq(struct qed_dev *cdev);
+int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -260,6 +280,12 @@ static inline int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
 	return -EINVAL;
 }
 
+static inline int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn,
+				      u32 *disabled_vfs)
+{
+	return 0;
+}
+
 static inline void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
 {
 }
@@ -277,6 +303,11 @@ static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
 static inline void qed_vf_start_iov_wq(struct qed_dev *cdev)
 {
 }
+
+static inline int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
+{
+	return 0;
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index a3c8f4e1b9c1..2460e39724f1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -311,6 +311,103 @@ free_p_iov:
 	return -ENOMEM;
 }
 
+int qed_vf_pf_reset(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp;
+	struct vfpf_first_tlv *req;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_CLOSE, sizeof(*req));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	resp = &p_iov->pf2vf_reply->default_resp;
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+	if (rc)
+		return rc;
+
+	if (resp->hdr.status != PFVF_STATUS_SUCCESS)
+		return -EAGAIN;
+
+	p_hwfn->b_int_enabled = 0;
+
+	return 0;
+}
+
+int qed_vf_pf_release(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp;
+	struct vfpf_first_tlv *req;
+	u32 size;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_RELEASE, sizeof(*req));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	resp = &p_iov->pf2vf_reply->default_resp;
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+
+	if (!rc && resp->hdr.status != PFVF_STATUS_SUCCESS)
+		rc = -EAGAIN;
+
+	p_hwfn->b_int_enabled = 0;
+
+	if (p_iov->vf2pf_request)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(union vfpf_tlvs),
+				  p_iov->vf2pf_request,
+				  p_iov->vf2pf_request_phys);
+	if (p_iov->pf2vf_reply)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(union pfvf_tlvs),
+				  p_iov->pf2vf_reply, p_iov->pf2vf_reply_phys);
+
+	if (p_iov->bulletin.p_virt) {
+		size = sizeof(struct qed_bulletin_content);
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  size,
+				  p_iov->bulletin.p_virt, p_iov->bulletin.phys);
+	}
+
+	kfree(p_hwfn->vf_iov_info);
+	p_hwfn->vf_iov_info = NULL;
+
+	return rc;
+}
+
+int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp = &p_iov->pf2vf_reply->default_resp;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_INT_CLEANUP,
+		       sizeof(struct vfpf_first_tlv));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+	if (rc)
+		return rc;
+
+	if (resp->hdr.status != PFVF_STATUS_SUCCESS)
+		return -EINVAL;
+
+	return 0;
+}
+
 u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index de9fe8501d21..c872e5e2985e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -206,6 +206,9 @@ struct qed_bulletin {
 enum {
 	CHANNEL_TLV_NONE,	/* ends tlv sequence */
 	CHANNEL_TLV_ACQUIRE,
+	CHANNEL_TLV_INT_CLEANUP,
+	CHANNEL_TLV_CLOSE,
+	CHANNEL_TLV_RELEASE,
 	CHANNEL_TLV_LIST_END,
 	CHANNEL_TLV_MAX
 };
@@ -278,6 +281,24 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
  */
 int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
 
+/**
+ *
+ * @brief VF - send a close message to PF
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_reset(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief VF - free vf`s memories
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_release(struct qed_hwfn *p_hwfn);
 /**
  * @brief qed_vf_get_igu_sb_id - Get the IGU SB ID for a given
  *        sb_id. For VFs igu sbs don't have to be contiguous
@@ -288,6 +309,15 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
  * @return INLINE u16
  */
 u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+
+/**
+ * @brief qed_vf_pf_int_cleanup - clean the SB of the VF
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn);
 #else
 static inline void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
 {
@@ -313,10 +343,25 @@ static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 	return -EINVAL;
 }
 
+static inline int qed_vf_pf_reset(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
+static inline int qed_vf_pf_release(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
 static inline u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
 {
 	return 0;
 }
+
+static inline int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 3a4c806be156..acfafca43aa5 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -13,6 +13,7 @@
 #include <linux/if_link.h>
 #include <linux/qed/eth_common.h>
 #include <linux/qed/qed_if.h>
+#include <linux/qed/qed_iov_if.h>
 
 struct qed_dev_eth_info {
 	struct qed_dev_info common;
@@ -125,6 +126,9 @@ struct qed_eth_cb_ops {
 
 struct qed_eth_ops {
 	const struct qed_common_ops *common;
+#ifdef CONFIG_QED_SRIOV
+	const struct qed_iov_hv_ops *iov;
+#endif
 
 	int (*fill_dev_info)(struct qed_dev *cdev,
 			     struct qed_dev_eth_info *info);
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
new file mode 100644
index 000000000000..c53bfa6374c5
--- /dev/null
+++ b/include/linux/qed/qed_iov_if.h
@@ -0,0 +1,20 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_IOV_IF_H
+#define _QED_IOV_IF_H
+
+#include <linux/qed/qed_if.h>
+
+/* Structs used by PF to control and manipulate child VFs */
+struct qed_iov_hv_ops {
+	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
+
+};
+
+#endif
-- 
cgit v1.2.3


From 08feecd7fc709077ce92d21a979f522a5f57170a Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:20 +0300
Subject: qed*: Support PVID configuration

This adds support for PF control over the VF vlan configuration.
I.e., `ip link ... vf <x> vlan <vid>' should now be supported.

 1. <vid> != 0 => VF receives [unknowingly] only traffic tagged by
    <vid> and tags all outgoing traffic sent by VF with <vid>.
 2. <vid> == 0 ==> Remove the pvid configuration, reverting to previous.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |   9 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  14 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.h     |   6 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 364 ++++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  26 ++
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  18 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  19 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c |  18 ++
 include/linux/qed/qed_iov_if.h               |   1 +
 9 files changed, 468 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 9d01a16bfb1a..e75e73a77b27 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1104,9 +1104,16 @@ static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 	u8 num_funcs = p_hwfn->num_funcs_on_engine;
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	struct qed_sb_cnt_info sb_cnt_info;
-	int i;
+	int i, max_vf_vlan_filters;
 
 	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+
+#ifdef CONFIG_QED_SRIOV
+	max_vf_vlan_filters = QED_ETH_MAX_VF_NUM_VLAN_FILTERS;
+#else
+	max_vf_vlan_filters = 0;
+#endif
+
 	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
 
 	resc_num[QED_SB] = min_t(u32,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 80f0b853a142..7fb6b82f1a97 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -114,7 +114,8 @@ int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 					     p_params->mtu,
 					     p_params->remove_inner_vlan,
 					     p_params->tpa_mode,
-					     p_params->max_buffers_per_cqe);
+					     p_params->max_buffers_per_cqe,
+					     p_params->only_untagged);
 	}
 
 	return qed_sp_eth_vport_start(p_hwfn, p_params);
@@ -367,6 +368,16 @@ int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 	p_cmn->inner_vlan_removal_en = p_params->inner_vlan_removal_flg;
 	val = p_params->update_inner_vlan_removal_flg;
 	p_cmn->update_inner_vlan_removal_en_flg = val;
+
+	p_cmn->default_vlan_en = p_params->default_vlan_enable_flg;
+	val = p_params->update_default_vlan_enable_flg;
+	p_cmn->update_default_vlan_en_flg = val;
+
+	p_cmn->default_vlan = cpu_to_le16(p_params->default_vlan);
+	p_cmn->update_default_vlan_flg = p_params->update_default_vlan_flg;
+
+	p_cmn->silent_vlan_removal_en = p_params->silent_vlan_removal_flg;
+
 	p_ramrod->common.tx_switching_en = p_params->tx_switching_flg;
 	p_cmn->update_tx_switching_en_flg = p_params->update_tx_switching_flg;
 
@@ -1702,6 +1713,7 @@ static int qed_start_vport(struct qed_dev *cdev,
 		start.tpa_mode = params->gro_enable ? QED_TPA_MODE_GRO :
 							QED_TPA_MODE_NONE;
 		start.remove_inner_vlan = params->remove_inner_vlan;
+		start.only_untagged = true;	/* untagged only */
 		start.drop_ttl0 = params->drop_ttl0;
 		start.opaque_fid = p_hwfn->hw_info.opaque_fid;
 		start.concrete_fid = p_hwfn->hw_info.concrete_fid;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index f9e677a29751..fad30ae12f63 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -94,6 +94,7 @@ enum qed_tpa_mode {
 struct qed_sp_vport_start_params {
 	enum qed_tpa_mode tpa_mode;
 	bool remove_inner_vlan;
+	bool only_untagged;
 	bool drop_ttl0;
 	u8 max_buffers_per_cqe;
 	u32 concrete_fid;
@@ -140,6 +141,11 @@ struct qed_sp_vport_update_params {
 	u8				vport_active_tx_flg;
 	u8				update_inner_vlan_removal_flg;
 	u8				inner_vlan_removal_flg;
+	u8				silent_vlan_removal_flg;
+	u8				update_default_vlan_enable_flg;
+	u8				default_vlan_enable_flg;
+	u8				update_default_vlan_flg;
+	u16				default_vlan;
 	u8				update_tx_switching_flg;
 	u8				tx_switching_flg;
 	u8				update_approx_mcast_flg;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 29a53dd0d9fd..77d44baa5df3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1075,6 +1075,7 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 	p_vf->vport_instance = 0;
 	p_vf->num_mac_filters = 0;
 	p_vf->num_vlan_filters = 0;
+	p_vf->configured_features = 0;
 
 	/* If VF previously requested less resources, go back to default */
 	p_vf->num_rxqs = p_vf->num_sbs;
@@ -1085,6 +1086,7 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
 		p_vf->vf_queues[i].rxq_active = 0;
 
+	memset(&p_vf->shadow_config, 0, sizeof(p_vf->shadow_config));
 	qed_iov_clean_vf(p_hwfn, p_vf->relative_vf_id);
 }
 
@@ -1232,6 +1234,149 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static int qed_iov_reconfigure_unicast_vlan(struct qed_hwfn *p_hwfn,
+					    struct qed_vf_info *p_vf)
+{
+	struct qed_filter_ucast filter;
+	int rc = 0;
+	int i;
+
+	memset(&filter, 0, sizeof(filter));
+	filter.is_rx_filter = 1;
+	filter.is_tx_filter = 1;
+	filter.vport_to_add_to = p_vf->vport_id;
+	filter.opcode = QED_FILTER_ADD;
+
+	/* Reconfigure vlans */
+	for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) {
+		if (!p_vf->shadow_config.vlans[i].used)
+			continue;
+
+		filter.type = QED_FILTER_VLAN;
+		filter.vlan = p_vf->shadow_config.vlans[i].vid;
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV,
+			   "Reconfiguring VLAN [0x%04x] for VF [%04x]\n",
+			   filter.vlan, p_vf->relative_vf_id);
+		rc = qed_sp_eth_filter_ucast(p_hwfn,
+					     p_vf->opaque_fid,
+					     &filter,
+					     QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to configure VLAN [%04x] to VF [%04x]\n",
+				  filter.vlan, p_vf->relative_vf_id);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static int
+qed_iov_reconfigure_unicast_shadow(struct qed_hwfn *p_hwfn,
+				   struct qed_vf_info *p_vf, u64 events)
+{
+	int rc = 0;
+
+	if ((events & (1 << VLAN_ADDR_FORCED)) &&
+	    !(p_vf->configured_features & (1 << VLAN_ADDR_FORCED)))
+		rc = qed_iov_reconfigure_unicast_vlan(p_hwfn, p_vf);
+
+	return rc;
+}
+
+static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
+					  struct qed_vf_info *p_vf, u64 events)
+{
+	int rc = 0;
+	struct qed_filter_ucast filter;
+
+	if (!p_vf->vport_instance)
+		return -EINVAL;
+
+	if (events & (1 << VLAN_ADDR_FORCED)) {
+		struct qed_sp_vport_update_params vport_update;
+		u8 removal;
+		int i;
+
+		memset(&filter, 0, sizeof(filter));
+		filter.type = QED_FILTER_VLAN;
+		filter.is_rx_filter = 1;
+		filter.is_tx_filter = 1;
+		filter.vport_to_add_to = p_vf->vport_id;
+		filter.vlan = p_vf->bulletin.p_virt->pvid;
+		filter.opcode = filter.vlan ? QED_FILTER_REPLACE :
+					      QED_FILTER_FLUSH;
+
+		/* Send the ramrod */
+		rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid,
+					     &filter, QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure VLAN for VF\n");
+			return rc;
+		}
+
+		/* Update the default-vlan & silent vlan stripping */
+		memset(&vport_update, 0, sizeof(vport_update));
+		vport_update.opaque_fid = p_vf->opaque_fid;
+		vport_update.vport_id = p_vf->vport_id;
+		vport_update.update_default_vlan_enable_flg = 1;
+		vport_update.default_vlan_enable_flg = filter.vlan ? 1 : 0;
+		vport_update.update_default_vlan_flg = 1;
+		vport_update.default_vlan = filter.vlan;
+
+		vport_update.update_inner_vlan_removal_flg = 1;
+		removal = filter.vlan ? 1
+				      : p_vf->shadow_config.inner_vlan_removal;
+		vport_update.inner_vlan_removal_flg = removal;
+		vport_update.silent_vlan_removal_flg = filter.vlan ? 1 : 0;
+		rc = qed_sp_vport_update(p_hwfn,
+					 &vport_update,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure VF vport for vlan\n");
+			return rc;
+		}
+
+		/* Update all the Rx queues */
+		for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
+			u16 qid;
+
+			if (!p_vf->vf_queues[i].rxq_active)
+				continue;
+
+			qid = p_vf->vf_queues[i].fw_rx_qid;
+
+			rc = qed_sp_eth_rx_queues_update(p_hwfn, qid,
+							 1, 0, 1,
+							 QED_SPQ_MODE_EBLOCK,
+							 NULL);
+			if (rc) {
+				DP_NOTICE(p_hwfn,
+					  "Failed to send Rx update fo queue[0x%04x]\n",
+					  qid);
+				return rc;
+			}
+		}
+
+		if (filter.vlan)
+			p_vf->configured_features |= 1 << VLAN_ADDR_FORCED;
+		else
+			p_vf->configured_features &= ~(1 << VLAN_ADDR_FORCED);
+	}
+
+	/* If forced features are terminated, we need to configure the shadow
+	 * configuration back again.
+	 */
+	if (events)
+		qed_iov_reconfigure_unicast_shadow(p_hwfn, p_vf, events);
+
+	return rc;
+}
+
 static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt,
 				       struct qed_vf_info *vf)
@@ -1241,6 +1386,7 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 	struct vfpf_vport_start_tlv *start;
 	u8 status = PFVF_STATUS_SUCCESS;
 	struct qed_vf_info *vf_info;
+	u64 *p_bitmap;
 	int sb_id;
 	int rc;
 
@@ -1272,10 +1418,24 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 	qed_iov_enable_vf_traffic(p_hwfn, p_ptt, vf);
 
 	vf->mtu = start->mtu;
+	vf->shadow_config.inner_vlan_removal = start->inner_vlan_removal;
+
+	/* Take into consideration configuration forced by hypervisor;
+	 * If none is configured, use the supplied VF values [for old
+	 * vfs that would still be fine, since they passed '0' as padding].
+	 */
+	p_bitmap = &vf_info->bulletin.p_virt->valid_bitmap;
+	if (!(*p_bitmap & (1 << VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED))) {
+		u8 vf_req = start->only_untagged;
+
+		vf_info->bulletin.p_virt->default_only_untagged = vf_req;
+		*p_bitmap |= 1 << VFPF_BULLETIN_UNTAGGED_DEFAULT;
+	}
 
 	params.tpa_mode = start->tpa_mode;
 	params.remove_inner_vlan = start->inner_vlan_removal;
 
+	params.only_untagged = vf_info->bulletin.p_virt->default_only_untagged;
 	params.drop_ttl0 = false;
 	params.concrete_fid = vf->concrete_fid;
 	params.opaque_fid = vf->opaque_fid;
@@ -1290,6 +1450,9 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 	} else {
 		vf->vport_instance++;
+
+		/* Force configuration if needed on the newly opened vport */
+		qed_iov_configure_vport_forced(p_hwfn, vf, *p_bitmap);
 	}
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_START,
 			     sizeof(struct pfvf_def_resp_tlv), status);
@@ -1311,6 +1474,10 @@ static void qed_iov_vf_mbx_stop_vport(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 	}
 
+	/* Forget the configuration on the vport */
+	vf->configured_features = 0;
+	memset(&vf->shadow_config, 0, sizeof(vf->shadow_config));
+
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_TEARDOWN,
 			     sizeof(struct pfvf_def_resp_tlv), status);
 }
@@ -1634,8 +1801,13 @@ qed_iov_vp_update_vlan_param(struct qed_hwfn *p_hwfn,
 	if (!p_vlan_tlv)
 		return;
 
-	p_data->update_inner_vlan_removal_flg = 1;
-	p_data->inner_vlan_removal_flg = p_vlan_tlv->remove_vlan;
+	p_vf->shadow_config.inner_vlan_removal = p_vlan_tlv->remove_vlan;
+
+	/* Ignore the VF request if we're forcing a vlan */
+	if (!(p_vf->configured_features & (1 << VLAN_ADDR_FORCED))) {
+		p_data->update_inner_vlan_removal_flg = 1;
+		p_data->inner_vlan_removal_flg = p_vlan_tlv->remove_vlan;
+	}
 
 	*tlvs_mask |= 1 << QED_IOV_VP_UPDATE_VLAN_STRIP;
 }
@@ -1886,6 +2058,67 @@ out:
 	qed_iov_send_response(p_hwfn, p_ptt, vf, length, status);
 }
 
+static int qed_iov_vf_update_unicast_shadow(struct qed_hwfn *p_hwfn,
+					    struct qed_vf_info *p_vf,
+					    struct qed_filter_ucast *p_params)
+{
+	int i;
+
+	if (p_params->type == QED_FILTER_MAC)
+		return 0;
+
+	/* First remove entries and then add new ones */
+	if (p_params->opcode == QED_FILTER_REMOVE) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++)
+			if (p_vf->shadow_config.vlans[i].used &&
+			    p_vf->shadow_config.vlans[i].vid ==
+			    p_params->vlan) {
+				p_vf->shadow_config.vlans[i].used = false;
+				break;
+			}
+		if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF [%d] - Tries to remove a non-existing vlan\n",
+				   p_vf->relative_vf_id);
+			return -EINVAL;
+		}
+	} else if (p_params->opcode == QED_FILTER_REPLACE ||
+		   p_params->opcode == QED_FILTER_FLUSH) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++)
+			p_vf->shadow_config.vlans[i].used = false;
+	}
+
+	/* In forced mode, we're willing to remove entries - but we don't add
+	 * new ones.
+	 */
+	if (p_vf->bulletin.p_virt->valid_bitmap & (1 << VLAN_ADDR_FORCED))
+		return 0;
+
+	if (p_params->opcode == QED_FILTER_ADD ||
+	    p_params->opcode == QED_FILTER_REPLACE) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) {
+			if (p_vf->shadow_config.vlans[i].used)
+				continue;
+
+			p_vf->shadow_config.vlans[i].used = true;
+			p_vf->shadow_config.vlans[i].vid = p_params->vlan;
+			break;
+		}
+
+		if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF [%d] - Tries to configure more than %d vlan filters\n",
+				   p_vf->relative_vf_id,
+				   QED_ETH_VF_NUM_VLAN_FILTERS + 1);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 int qed_iov_chk_ucast(struct qed_hwfn *hwfn,
 		      int vfid, struct qed_filter_ucast *params)
 {
@@ -1907,6 +2140,7 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 					struct qed_ptt *p_ptt,
 					struct qed_vf_info *vf)
 {
+	struct qed_bulletin_content *p_bulletin = vf->bulletin.p_virt;
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	struct vfpf_ucast_filter_tlv *req;
 	u8 status = PFVF_STATUS_SUCCESS;
@@ -1946,6 +2180,25 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 		goto out;
 	}
 
+	/* Update shadow copy of the VF configuration */
+	if (qed_iov_vf_update_unicast_shadow(p_hwfn, vf, &params)) {
+		status = PFVF_STATUS_FAILURE;
+		goto out;
+	}
+
+	/* Determine if the unicast filtering is acceptible by PF */
+	if ((p_bulletin->valid_bitmap & (1 << VLAN_ADDR_FORCED)) &&
+	    (params.type == QED_FILTER_VLAN ||
+	     params.type == QED_FILTER_MAC_VLAN)) {
+		/* Once VLAN is forced or PVID is set, do not allow
+		 * to add/replace any further VLANs.
+		 */
+		if (params.opcode == QED_FILTER_ADD ||
+		    params.opcode == QED_FILTER_REPLACE)
+			status = PFVF_STATUS_FORCED;
+		goto out;
+	}
+
 	rc = qed_iov_chk_ucast(p_hwfn, vf->relative_vf_id, &params);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
@@ -2449,6 +2702,29 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
+				      u16 pvid, int vfid)
+{
+	struct qed_vf_info *vf_info;
+	u64 feature;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set forced MAC, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	feature = 1 << VLAN_ADDR_FORCED;
+	vf_info->bulletin.p_virt->pvid = pvid;
+	if (pvid)
+		vf_info->bulletin.p_virt->valid_bitmap |= feature;
+	else
+		vf_info->bulletin.p_virt->valid_bitmap &= ~feature;
+
+	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
+}
+
 bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
@@ -2460,6 +2736,20 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
+{
+	struct qed_vf_info *p_vf;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!p_vf || !p_vf->bulletin.p_virt)
+		return 0;
+
+	if (!(p_vf->bulletin.p_virt->valid_bitmap & (1 << VLAN_ADDR_FORCED)))
+		return 0;
+
+	return p_vf->bulletin.p_virt->pvid;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -2609,6 +2899,38 @@ static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
 		return qed_sriov_disable(cdev, true);
 }
 
+static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
+{
+	int i;
+
+	if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set a VF MAC; Sriov is not enabled\n");
+		return -EINVAL;
+	}
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set VF[%d] MAC (VF is not active)\n", vfid);
+		return -EINVAL;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true);
+		if (!vf_info)
+			continue;
+
+		/* Set the forced vlan, and schedule the IOV task */
+		vf_info->forced_vlan = vid;
+		qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG);
+	}
+
+	return 0;
+}
+
 void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 {
 	struct qed_mcp_link_capabilities caps;
@@ -2671,6 +2993,38 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 	qed_ptt_release(hwfn, ptt);
 }
 
+static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
+{
+	int i;
+
+	qed_for_each_vf(hwfn, i) {
+		struct qed_public_vf_info *info;
+		bool update = false;
+
+		info = qed_iov_get_public_vf_info(hwfn, i, true);
+		if (!info)
+			continue;
+
+		/* Update data on bulletin board */
+
+		if (qed_iov_bulletin_get_forced_vlan(hwfn, i) ^
+		    info->forced_vlan) {
+			DP_VERBOSE(hwfn,
+				   QED_MSG_IOV,
+				   "Handling PF setting of pvid [0x%04x] to VF 0x%02x [Abs 0x%02x]\n",
+				   info->forced_vlan,
+				   i,
+				   hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+			qed_iov_bulletin_set_forced_vlan(hwfn,
+							 info->forced_vlan, i);
+			update = true;
+		}
+
+		if (update)
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+	}
+}
+
 static void qed_handle_bulletin_post(struct qed_hwfn *hwfn)
 {
 	struct qed_ptt *ptt;
@@ -2715,6 +3069,11 @@ void qed_iov_pf_task(struct work_struct *work)
 
 	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
 		qed_handle_vf_msg(hwfn);
+
+	if (test_and_clear_bit(QED_IOV_WQ_SET_UNICAST_FILTER_FLAG,
+			       &hwfn->iov_task_flags))
+		qed_handle_pf_set_vf_unicast(hwfn);
+
 	if (test_and_clear_bit(QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
 			       &hwfn->iov_task_flags))
 		qed_handle_bulletin_post(hwfn);
@@ -2774,4 +3133,5 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
+	.set_vlan = &qed_sriov_pf_set_vlan,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 2c94b445d07f..e65f403349c2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -24,6 +24,9 @@
 #define QED_MAX_VF_CHAINS_PER_PF 16
 #define QED_ETH_VF_NUM_VLAN_FILTERS 2
 
+#define QED_ETH_MAX_VF_NUM_VLAN_FILTERS	\
+	(MAX_NUM_VFS * QED_ETH_VF_NUM_VLAN_FILTERS)
+
 enum qed_iov_vport_update_flag {
 	QED_IOV_VP_UPDATE_ACTIVATE,
 	QED_IOV_VP_UPDATE_VLAN_STRIP,
@@ -40,6 +43,7 @@ struct qed_public_vf_info {
 	/* These copies will later be reflected in the bulletin board,
 	 * but this copy should be newer.
 	 */
+	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
 };
 
@@ -98,6 +102,18 @@ enum vf_state {
 	VF_STOPPED		/* VF, Stopped */
 };
 
+struct qed_vf_vlan_shadow {
+	bool used;
+	u16 vid;
+};
+
+struct qed_vf_shadow_config {
+	/* Shadow copy of all guest vlans */
+	struct qed_vf_vlan_shadow vlans[QED_ETH_VF_NUM_VLAN_FILTERS + 1];
+
+	u8 inner_vlan_removal;
+};
+
 /* PFs maintain an array of this structure, per VF */
 struct qed_vf_info {
 	struct qed_iov_vf_mbx vf_mbx;
@@ -131,6 +147,16 @@ struct qed_vf_info {
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
 	u8 num_active_rxqs;
 	struct qed_public_vf_info p_vf_info;
+
+	/* Stores the configuration requested by VF */
+	struct qed_vf_shadow_config shadow_config;
+
+	/* A bitfield using bulletin's valid-map bits, used to indicate
+	 * which of the bulletin board features have been configured.
+	 */
+	u64 configured_features;
+#define QED_IOV_CONFIGURED_FEATURES_MASK        ((1 << MAC_ADDR_FORCED) | \
+						 (1 << VLAN_ADDR_FORCED))
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index e788954568d4..3c8911de3ed4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -474,7 +474,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 			  u16 mtu,
 			  u8 inner_vlan_removal,
 			  enum qed_tpa_mode tpa_mode,
-			  u8 max_buffers_per_cqe)
+			  u8 max_buffers_per_cqe, u8 only_untagged)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_vport_start_tlv *req;
@@ -489,6 +489,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 	req->inner_vlan_removal = inner_vlan_removal;
 	req->tpa_mode = tpa_mode;
 	req->max_buffers_per_cqe = max_buffers_per_cqe;
+	req->only_untagged = only_untagged;
 
 	/* status blocks */
 	for (i = 0; i < p_hwfn->vf_iov_info->acquire_resp.resc.num_sbs; i++)
@@ -547,6 +548,8 @@ qed_vf_handle_vp_update_is_needed(struct qed_hwfn *p_hwfn,
 		return !!p_data->update_tx_switching_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP:
 		return !!p_data->update_inner_vlan_removal_flg;
+	case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN:
+		return !!p_data->update_accept_any_vlan_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_MCAST:
 		return !!p_data->update_approx_mcast_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM:
@@ -696,6 +699,19 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 		       sizeof(rss_params->rss_key));
 	}
 
+	if (p_params->update_accept_any_vlan_flg) {
+		struct vfpf_vport_update_accept_any_vlan_tlv *p_any_vlan_tlv;
+
+		size = sizeof(struct vfpf_vport_update_accept_any_vlan_tlv);
+		tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN;
+		p_any_vlan_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, tlv, size);
+
+		resp_size += sizeof(struct pfvf_def_resp_tlv);
+		p_any_vlan_tlv->accept_any_vlan = p_params->accept_any_vlan;
+		p_any_vlan_tlv->update_accept_any_vlan_flg =
+		    p_params->update_accept_any_vlan_flg;
+	}
+
 	/* add list termination tlv */
 	qed_add_tlv(p_hwfn, &p_iov->offset,
 		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index d9a8aa684ad7..35eced3691ba 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -417,6 +417,16 @@ union pfvf_tlvs {
 	struct pfvf_start_queue_resp_tlv queue_start;
 };
 
+enum qed_bulletin_bit {
+	/* Alert the VF that a forced VLAN was set by the PF */
+	VLAN_ADDR_FORCED = 2,
+
+	/* Indicate that `default_only_untagged' contains actual data */
+	VFPF_BULLETIN_UNTAGGED_DEFAULT = 3,
+	VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED = 4,
+
+};
+
 struct qed_bulletin_content {
 	/* crc of structure to ensure is not in mid-update */
 	u32 crc;
@@ -465,6 +475,10 @@ struct qed_bulletin_content {
 	u32 partner_adv_speed;
 
 	u32 capability_speed;
+
+	/* Forced vlan */
+	u16 pvid;
+	u16 padding5;
 };
 
 struct qed_bulletin {
@@ -737,7 +751,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 			  u16 mtu,
 			  u8 inner_vlan_removal,
 			  enum qed_tpa_mode tpa_mode,
-			  u8 max_buffers_per_cqe);
+			  u8 max_buffers_per_cqe, u8 only_untagged);
 
 /**
  * @brief qed_vf_pf_vport_stop - stop the VF's vport
@@ -898,7 +912,8 @@ static inline int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 					u16 mtu,
 					u8 inner_vlan_removal,
 					enum qed_tpa_mode tpa_mode,
-					u8 max_buffers_per_cqe)
+					u8 max_buffers_per_cqe,
+					u8 only_untagged)
 {
 	return -EINVAL;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index bf54cfcd75c0..4d59d7e00e42 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -103,6 +103,21 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
 #ifdef CONFIG_QED_SRIOV
+static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	if (vlan > 4095) {
+		DP_NOTICE(edev, "Illegal vlan value %d\n", vlan);
+		return -EINVAL;
+	}
+
+	DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n",
+		   vlan, vf);
+
+	return edev->ops->iov->set_vlan(edev->cdev, vlan, vf);
+}
+
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
@@ -2071,6 +2086,9 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_set_mac_address = qede_set_mac_addr,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_change_mtu = qede_change_mtu,
+#ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_vlan = qede_set_vf_vlan,
+#endif
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
 	.ndo_get_stats64 = qede_get_stats64,
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index c53bfa6374c5..825c007d50f1 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -15,6 +15,7 @@
 struct qed_iov_hv_ops {
 	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
 
+	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 };
 
 #endif
-- 
cgit v1.2.3


From eff169608c250193e72089dc4ab15cb79e0bd68c Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:21 +0300
Subject: qed*: Support forced MAC

Allows the PF to enforce the VF's mac.
i.e., by using `ip link ... vf <x> mac <value>'.

While a MAC is forced, PF would prevent the VF from configuring any other
MAC.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |   9 ++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 120 +++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |   1 +
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  47 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  21 +++++
 drivers/net/ethernet/qlogic/qede/qede_main.c |  31 +++++++
 include/linux/qed/qed_eth_if.h               |   3 +
 include/linux/qed/qed_iov_if.h               |   2 +
 8 files changed, 234 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 7fb6b82f1a97..8d83250aa5ba 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1701,6 +1701,14 @@ static void qed_register_eth_ops(struct qed_dev *cdev,
 		qed_vf_start_iov_wq(cdev);
 }
 
+static bool qed_check_mac(struct qed_dev *cdev, u8 *mac)
+{
+	if (IS_PF(cdev))
+		return true;
+
+	return qed_vf_check_mac(&cdev->hwfns[0], mac);
+}
+
 static int qed_start_vport(struct qed_dev *cdev,
 			   struct qed_start_vport_params *params)
 {
@@ -2149,6 +2157,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 #endif
 	.fill_dev_info = &qed_fill_eth_dev_info,
 	.register_ops = &qed_register_eth_ops,
+	.check_mac = &qed_check_mac,
 	.vport_start = &qed_start_vport,
 	.vport_stop = &qed_stop_vport,
 	.vport_update = &qed_update_vport,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 77d44baa5df3..c1b79190ce4d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1295,6 +1295,29 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 	if (!p_vf->vport_instance)
 		return -EINVAL;
 
+	if (events & (1 << MAC_ADDR_FORCED)) {
+		/* Since there's no way [currently] of removing the MAC,
+		 * we can always assume this means we need to force it.
+		 */
+		memset(&filter, 0, sizeof(filter));
+		filter.type = QED_FILTER_MAC;
+		filter.opcode = QED_FILTER_REPLACE;
+		filter.is_rx_filter = 1;
+		filter.is_tx_filter = 1;
+		filter.vport_to_add_to = p_vf->vport_id;
+		ether_addr_copy(filter.mac, p_vf->bulletin.p_virt->mac);
+
+		rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid,
+					     &filter, QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure MAC for VF\n");
+			return rc;
+		}
+
+		p_vf->configured_features |= 1 << MAC_ADDR_FORCED;
+	}
+
 	if (events & (1 << VLAN_ADDR_FORCED)) {
 		struct qed_sp_vport_update_params vport_update;
 		u8 removal;
@@ -2199,6 +2222,16 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 		goto out;
 	}
 
+	if ((p_bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)) &&
+	    (params.type == QED_FILTER_MAC ||
+	     params.type == QED_FILTER_MAC_VLAN)) {
+		if (!ether_addr_equal(p_bulletin->mac, params.mac) ||
+		    (params.opcode != QED_FILTER_ADD &&
+		     params.opcode != QED_FILTER_REPLACE))
+			status = PFVF_STATUS_FORCED;
+		goto out;
+	}
+
 	rc = qed_iov_chk_ucast(p_hwfn, vf->relative_vf_id, &params);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
@@ -2702,6 +2735,30 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+static void qed_iov_bulletin_set_forced_mac(struct qed_hwfn *p_hwfn,
+					    u8 *mac, int vfid)
+{
+	struct qed_vf_info *vf_info;
+	u64 feature;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set forced MAC, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	feature = 1 << MAC_ADDR_FORCED;
+	memcpy(vf_info->bulletin.p_virt->mac, mac, ETH_ALEN);
+
+	vf_info->bulletin.p_virt->valid_bitmap |= feature;
+	/* Forced MAC will disable MAC_ADDR */
+	vf_info->bulletin.p_virt->valid_bitmap &=
+				~(1 << VFPF_BULLETIN_MAC_ADDR);
+
+	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
+}
+
 void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 				      u16 pvid, int vfid)
 {
@@ -2736,6 +2793,21 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn,
+					   u16 rel_vf_id)
+{
+	struct qed_vf_info *p_vf;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!p_vf || !p_vf->bulletin.p_virt)
+		return NULL;
+
+	if (!(p_vf->bulletin.p_virt->valid_bitmap & (1 << MAC_ADDR_FORCED)))
+		return NULL;
+
+	return p_vf->bulletin.p_virt->mac;
+}
+
 u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 {
 	struct qed_vf_info *p_vf;
@@ -2899,6 +2971,38 @@ static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
 		return qed_sriov_disable(cdev, true);
 }
 
+static int qed_sriov_pf_set_mac(struct qed_dev *cdev, u8 *mac, int vfid)
+{
+	int i;
+
+	if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set a VF MAC; Sriov is not enabled\n");
+		return -EINVAL;
+	}
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set VF[%d] MAC (VF is not active)\n", vfid);
+		return -EINVAL;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true);
+		if (!vf_info)
+			continue;
+
+		/* Set the forced MAC, and schedule the IOV task */
+		ether_addr_copy(vf_info->forced_mac, mac);
+		qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG);
+	}
+
+	return 0;
+}
+
 static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
 {
 	int i;
@@ -3000,12 +3104,27 @@ static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
 	qed_for_each_vf(hwfn, i) {
 		struct qed_public_vf_info *info;
 		bool update = false;
+		u8 *mac;
 
 		info = qed_iov_get_public_vf_info(hwfn, i, true);
 		if (!info)
 			continue;
 
 		/* Update data on bulletin board */
+		mac = qed_iov_bulletin_get_forced_mac(hwfn, i);
+		if (is_valid_ether_addr(info->forced_mac) &&
+		    (!mac || !ether_addr_equal(mac, info->forced_mac))) {
+			DP_VERBOSE(hwfn,
+				   QED_MSG_IOV,
+				   "Handling PF setting of VF MAC to VF 0x%02x [Abs 0x%02x]\n",
+				   i,
+				   hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+
+			/* Update bulletin board with forced MAC */
+			qed_iov_bulletin_set_forced_mac(hwfn,
+							info->forced_mac, i);
+			update = true;
+		}
 
 		if (qed_iov_bulletin_get_forced_vlan(hwfn, i) ^
 		    info->forced_vlan) {
@@ -3133,5 +3252,6 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
+	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index e65f403349c2..e38ea985abe1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -43,6 +43,7 @@ struct qed_public_vf_info {
 	/* These copies will later be reflected in the bulletin board,
 	 * but this copy should be newer.
 	 */
+	u8 forced_mac[ETH_ALEN];
 	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 3c8911de3ed4..db14e230c9a4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/etherdevice.h>
 #include "qed.h"
 #include "qed_sriov.h"
 #include "qed_vf.h"
@@ -1004,6 +1005,43 @@ void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, u8 *num_vlan_filters)
 	*num_vlan_filters = p_vf->acquire_resp.resc.num_vlan_filters;
 }
 
+bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac)
+{
+	struct qed_bulletin_content *bulletin;
+
+	bulletin = &p_hwfn->vf_iov_info->bulletin_shadow;
+	if (!(bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)))
+		return true;
+
+	/* Forbid VF from changing a MAC enforced by PF */
+	if (ether_addr_equal(bulletin->mac, mac))
+		return false;
+
+	return false;
+}
+
+bool qed_vf_bulletin_get_forced_mac(struct qed_hwfn *hwfn,
+				    u8 *dst_mac, u8 *p_is_forced)
+{
+	struct qed_bulletin_content *bulletin;
+
+	bulletin = &hwfn->vf_iov_info->bulletin_shadow;
+
+	if (bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)) {
+		if (p_is_forced)
+			*p_is_forced = 1;
+	} else if (bulletin->valid_bitmap & (1 << VFPF_BULLETIN_MAC_ADDR)) {
+		if (p_is_forced)
+			*p_is_forced = 0;
+	} else {
+		return false;
+	}
+
+	ether_addr_copy(dst_mac, bulletin->mac);
+
+	return true;
+}
+
 void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 			   u16 *fw_major, u16 *fw_minor,
 			   u16 *fw_rev, u16 *fw_eng)
@@ -1020,6 +1058,15 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 
 static void qed_handle_bulletin_change(struct qed_hwfn *hwfn)
 {
+	struct qed_eth_cb_ops *ops = hwfn->cdev->protocol_ops.eth;
+	u8 mac[ETH_ALEN], is_mac_exist, is_mac_forced;
+	void *cookie = hwfn->cdev->ops_cookie;
+
+	is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac,
+						      &is_mac_forced);
+	if (is_mac_exist && is_mac_forced && cookie)
+		ops->force_mac(cookie, mac);
+
 	/* Always update link configuration according to bulletin */
 	qed_link_update(hwfn);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 35eced3691ba..b82fda964bbd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -418,6 +418,8 @@ union pfvf_tlvs {
 };
 
 enum qed_bulletin_bit {
+	/* Alert the VF that a forced MAC was set by the PF */
+	MAC_ADDR_FORCED = 0,
 	/* Alert the VF that a forced VLAN was set by the PF */
 	VLAN_ADDR_FORCED = 2,
 
@@ -425,6 +427,10 @@ enum qed_bulletin_bit {
 	VFPF_BULLETIN_UNTAGGED_DEFAULT = 3,
 	VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED = 4,
 
+	/* Alert the VF that suggested mac was sent by the PF.
+	 * MAC_ADDR will be disabled in case MAC_ADDR_FORCED is set.
+	 */
+	VFPF_BULLETIN_MAC_ADDR = 5
 };
 
 struct qed_bulletin_content {
@@ -601,6 +607,16 @@ void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac);
 void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
 				 u8 *num_vlan_filters);
 
+/**
+ * @brief Check if VF can set a MAC address
+ *
+ * @param p_hwfn
+ * @param mac
+ *
+ * @return bool
+ */
+bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac);
+
 /**
  * @brief Set firmware version information in dev_info from VFs acquire response tlv
  *
@@ -841,6 +857,11 @@ static inline void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
 {
 }
 
+static inline bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac)
+{
+	return false;
+}
+
 static inline void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 					 u16 *fw_major, u16 *fw_minor,
 					 u16 *fw_rev, u16 *fw_eng)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 4d59d7e00e42..b326b15d5196 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -118,6 +118,22 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
 	return edev->ops->iov->set_vlan(edev->cdev, vlan, vf);
 }
 
+static int qede_set_vf_mac(struct net_device *ndev, int vfidx, u8 *mac)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	DP_VERBOSE(edev, QED_MSG_IOV,
+		   "Setting MAC %02x:%02x:%02x:%02x:%02x:%02x to VF [%d]\n",
+		   mac[0], mac[1], mac[2], mac[3], mac[4], mac[5], vfidx);
+
+	if (!is_valid_ether_addr(mac)) {
+		DP_VERBOSE(edev, QED_MSG_IOV, "MAC address isn't valid\n");
+		return -EINVAL;
+	}
+
+	return edev->ops->iov->set_mac(edev->cdev, mac, vfidx);
+}
+
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
@@ -138,10 +154,19 @@ static struct pci_driver qede_pci_driver = {
 #endif
 };
 
+static void qede_force_mac(void *dev, u8 *mac)
+{
+	struct qede_dev *edev = dev;
+
+	ether_addr_copy(edev->ndev->dev_addr, mac);
+	ether_addr_copy(edev->primary_mac, mac);
+}
+
 static struct qed_eth_cb_ops qede_ll_ops = {
 	{
 		.link_update = qede_link_update,
 	},
+	.force_mac = qede_force_mac,
 };
 
 static int qede_netdev_event(struct notifier_block *this, unsigned long event,
@@ -2087,6 +2112,7 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_change_mtu = qede_change_mtu,
 #ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_mac = qede_set_vf_mac,
 	.ndo_set_vf_vlan = qede_set_vf_vlan,
 #endif
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
@@ -3512,6 +3538,11 @@ static int qede_set_mac_addr(struct net_device *ndev, void *p)
 		return -EFAULT;
 	}
 
+	if (!edev->ops->check_mac(edev->cdev, addr->sa_data)) {
+		DP_NOTICE(edev, "qed prevents setting MAC\n");
+		return -EINVAL;
+	}
+
 	ether_addr_copy(ndev->dev_addr, addr->sa_data);
 
 	if (!netif_running(ndev))  {
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index acfafca43aa5..e0f6e6482031 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -122,6 +122,7 @@ struct qed_tunn_params {
 
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
+	void (*force_mac) (void *dev, u8 *mac);
 };
 
 struct qed_eth_ops {
@@ -137,6 +138,8 @@ struct qed_eth_ops {
 			     struct qed_eth_cb_ops *ops,
 			     void *cookie);
 
+	 bool(*check_mac) (struct qed_dev *cdev, u8 *mac);
+
 	int (*vport_start)(struct qed_dev *cdev,
 			   struct qed_start_vport_params *params);
 
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 825c007d50f1..7a67fbf4336a 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -15,6 +15,8 @@
 struct qed_iov_hv_ops {
 	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
 
+	int (*set_mac) (struct qed_dev *cdev, u8 *mac, int vfid);
+
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 };
 
-- 
cgit v1.2.3


From 733def6a04bf3d2810dd675e1240f8df94d633c3 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:22 +0300
Subject: qed*: IOV link control

This adds support in 2 ndo that allow PF to tweak the VF's view of the
link - `ndo_set_vf_link_state' to allow it a view independent of the PF's,
and `ndo_set_vf_rate' which would allow the PF to limit the VF speed.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |   2 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |  76 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 164 +++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |   6 +
 drivers/net/ethernet/qlogic/qede/qede_main.c |  26 +++++
 include/linux/qed/qed_iov_if.h               |   6 +
 6 files changed, 280 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index d7da64556e4b..77323fc70927 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -554,8 +554,10 @@ static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev,
 
 #define PURE_LB_TC 8
 
+int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate);
 void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate);
 
+void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 #define QED_LEADING_HWFN(dev)   (&dev->hwfns[0])
 
 /* Other Linux specific common definitions */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index e75e73a77b27..acaa2866dae3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1889,6 +1889,32 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static int __qed_configure_vport_wfq(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, u16 vp_id, u32 rate)
+{
+	struct qed_mcp_link_state *p_link;
+	int rc = 0;
+
+	p_link = &p_hwfn->cdev->hwfns[0].mcp_info->link_output;
+
+	if (!p_link->min_pf_rate) {
+		p_hwfn->qm_info.wfq_data[vp_id].min_speed = rate;
+		p_hwfn->qm_info.wfq_data[vp_id].configured = true;
+		return rc;
+	}
+
+	rc = qed_init_wfq_param(p_hwfn, vp_id, rate, p_link->min_pf_rate);
+
+	if (rc == 0)
+		qed_configure_wfq_for_all_vports(p_hwfn, p_ptt,
+						 p_link->min_pf_rate);
+	else
+		DP_NOTICE(p_hwfn,
+			  "Validation failed while configuring min rate\n");
+
+	return rc;
+}
+
 static int __qed_configure_vp_wfq_on_link_change(struct qed_hwfn *p_hwfn,
 						 struct qed_ptt *p_ptt,
 						 u32 min_pf_rate)
@@ -1923,6 +1949,42 @@ static int __qed_configure_vp_wfq_on_link_change(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+/* Main API for qed clients to configure vport min rate.
+ * vp_id - vport id in PF Range[0 - (total_num_vports_per_pf - 1)]
+ * rate - Speed in Mbps needs to be assigned to a given vport.
+ */
+int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate)
+{
+	int i, rc = -EINVAL;
+
+	/* Currently not supported; Might change in future */
+	if (cdev->num_hwfns > 1) {
+		DP_NOTICE(cdev,
+			  "WFQ configuration is not supported for this device\n");
+		return rc;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+		struct qed_ptt *p_ptt;
+
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt)
+			return -EBUSY;
+
+		rc = __qed_configure_vport_wfq(p_hwfn, p_ptt, vp_id, rate);
+
+		if (!rc) {
+			qed_ptt_release(p_hwfn, p_ptt);
+			return rc;
+		}
+
+		qed_ptt_release(p_hwfn, p_ptt);
+	}
+
+	return rc;
+}
+
 /* API to configure WFQ from mcp link change */
 void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
 {
@@ -2069,3 +2131,17 @@ int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw)
 
 	return rc;
 }
+
+void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_mcp_link_state *p_link;
+
+	p_link = &p_hwfn->mcp_info->link_output;
+
+	if (p_link->min_pf_rate)
+		qed_disable_wfq_for_all_vports(p_hwfn, p_ptt,
+					       p_link->min_pf_rate);
+
+	memset(p_hwfn->qm_info.wfq_data, 0,
+	       sizeof(*p_hwfn->qm_info.wfq_data) * p_hwfn->qm_info.num_vports);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index c1b79190ce4d..c9a3bb63cc87 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -2822,6 +2822,46 @@ u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 	return p_vf->bulletin.p_virt->pvid;
 }
 
+static int qed_iov_configure_tx_rate(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, int vfid, int val)
+{
+	struct qed_vf_info *vf;
+	u8 abs_vp_id = 0;
+	int rc;
+
+	vf = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf)
+		return -EINVAL;
+
+	rc = qed_fw_vport(p_hwfn, vf->vport_id, &abs_vp_id);
+	if (rc)
+		return rc;
+
+	return qed_init_vport_rl(p_hwfn, p_ptt, abs_vp_id, (u32)val);
+}
+
+int qed_iov_configure_min_tx_rate(struct qed_dev *cdev, int vfid, u32 rate)
+{
+	struct qed_vf_info *vf;
+	u8 vport_id;
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+			DP_NOTICE(p_hwfn,
+				  "SR-IOV sanity check failed, can't set min rate\n");
+			return -EINVAL;
+		}
+	}
+
+	vf = qed_iov_get_vf_info(QED_LEADING_HWFN(cdev), (u16)vfid, true);
+	vport_id = vf->vport_id;
+
+	return qed_configure_vport_wfq(cdev, vport_id, rate);
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -2871,6 +2911,9 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
 			return -EBUSY;
 		}
 
+		/* Clean WFQ db and configure equal weight for all vports */
+		qed_clean_wfq_db(hwfn, ptt);
+
 		qed_for_each_vf(hwfn, j) {
 			int k;
 
@@ -3047,17 +3090,136 @@ void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 
 	/* Update bulletin of all future possible VFs with link configuration */
 	for (i = 0; i < hwfn->cdev->p_iov_info->total_vfs; i++) {
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, i, false);
+		if (!vf_info)
+			continue;
+
 		memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
 		memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
 		memcpy(&caps, qed_mcp_get_link_capabilities(hwfn),
 		       sizeof(caps));
 
+		/* Modify link according to the VF's configured link state */
+		switch (vf_info->link_state) {
+		case IFLA_VF_LINK_STATE_DISABLE:
+			link.link_up = false;
+			break;
+		case IFLA_VF_LINK_STATE_ENABLE:
+			link.link_up = true;
+			/* Set speed according to maximum supported by HW.
+			 * that is 40G for regular devices and 100G for CMT
+			 * mode devices.
+			 */
+			link.speed = (hwfn->cdev->num_hwfns > 1) ?
+				     100000 : 40000;
+		default:
+			/* In auto mode pass PF link image to VF */
+			break;
+		}
+
+		if (link.link_up && vf_info->tx_rate) {
+			struct qed_ptt *ptt;
+			int rate;
+
+			rate = min_t(int, vf_info->tx_rate, link.speed);
+
+			ptt = qed_ptt_acquire(hwfn);
+			if (!ptt) {
+				DP_NOTICE(hwfn, "Failed to acquire PTT\n");
+				return;
+			}
+
+			if (!qed_iov_configure_tx_rate(hwfn, ptt, i, rate)) {
+				vf_info->tx_rate = rate;
+				link.speed = rate;
+			}
+
+			qed_ptt_release(hwfn, ptt);
+		}
+
 		qed_iov_set_link(hwfn, i, &params, &link, &caps);
 	}
 
 	qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
 }
 
+static int qed_set_vf_link_state(struct qed_dev *cdev,
+				 int vf_id, int link_state)
+{
+	int i;
+
+	/* Sanitize request */
+	if (IS_VF(cdev))
+		return -EINVAL;
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "VF index [%d] isn't active\n", vf_id);
+		return -EINVAL;
+	}
+
+	/* Handle configuration of link state */
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf;
+
+		vf = qed_iov_get_public_vf_info(hwfn, vf_id, true);
+		if (!vf)
+			continue;
+
+		if (vf->link_state == link_state)
+			continue;
+
+		vf->link_state = link_state;
+		qed_inform_vf_link_state(&cdev->hwfns[i]);
+	}
+
+	return 0;
+}
+
+static int qed_configure_max_vf_rate(struct qed_dev *cdev, int vfid, int rate)
+{
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf;
+
+		if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+			DP_NOTICE(p_hwfn,
+				  "SR-IOV sanity check failed, can't set tx rate\n");
+			return -EINVAL;
+		}
+
+		vf = qed_iov_get_public_vf_info(p_hwfn, vfid, true);
+
+		vf->tx_rate = rate;
+
+		qed_inform_vf_link_state(p_hwfn);
+	}
+
+	return 0;
+}
+
+static int qed_set_vf_rate(struct qed_dev *cdev,
+			   int vfid, u32 min_rate, u32 max_rate)
+{
+	int rc_min = 0, rc_max = 0;
+
+	if (max_rate)
+		rc_max = qed_configure_max_vf_rate(cdev, vfid, max_rate);
+
+	if (min_rate)
+		rc_min = qed_iov_configure_min_tx_rate(cdev, vfid, min_rate);
+
+	if (rc_max | rc_min)
+		return -EINVAL;
+
+	return 0;
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
@@ -3254,4 +3416,6 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
+	.set_link_state = &qed_set_vf_link_state,
+	.set_rate = &qed_set_vf_rate,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index e38ea985abe1..ab3d291cf7f0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -46,6 +46,12 @@ struct qed_public_vf_info {
 	u8 forced_mac[ETH_ALEN];
 	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
+
+	/* IFLA_VF_LINK_STATE_<X> */
+	int link_state;
+
+	/* Currently configured Tx rate in MB/sec. 0 if unconfigured */
+	int tx_rate;
 };
 
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b326b15d5196..3d0f98f81122 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1792,6 +1792,28 @@ static struct rtnl_link_stats64 *qede_get_stats64(
 	return stats;
 }
 
+#ifdef CONFIG_QED_SRIOV
+static int qede_set_vf_rate(struct net_device *dev, int vfidx,
+			    int min_tx_rate, int max_tx_rate)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	return edev->ops->iov->set_rate(edev->cdev, vfidx, max_tx_rate,
+					max_tx_rate);
+}
+
+static int qede_set_vf_link_state(struct net_device *dev, int vfidx,
+				  int link_state)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->set_link_state(edev->cdev, vfidx, link_state);
+}
+#endif
+
 static void qede_config_accept_any_vlan(struct qede_dev *edev, bool action)
 {
 	struct qed_update_vport_params params;
@@ -2118,6 +2140,10 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
 	.ndo_get_stats64 = qede_get_stats64,
+#ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_link_state = qede_set_vf_link_state,
+	.ndo_set_vf_rate = qede_set_vf_rate,
+#endif
 #ifdef CONFIG_QEDE_VXLAN
 	.ndo_add_vxlan_port = qede_add_vxlan_port,
 	.ndo_del_vxlan_port = qede_del_vxlan_port,
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 7a67fbf4336a..f364f2bd7a4d 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -18,6 +18,12 @@ struct qed_iov_hv_ops {
 	int (*set_mac) (struct qed_dev *cdev, u8 *mac, int vfid);
 
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
+
+	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
+			       int link_state);
+
+	int (*set_rate) (struct qed_dev *cdev, int vfid,
+			 u32 min_rate, u32 max_rate);
 };
 
 #endif
-- 
cgit v1.2.3


From 6ddc7608258d57d61e16d55461400bb6eff18d72 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:23 +0300
Subject: qed*: IOV support spoof-checking

Add support in `ndo_set_vf_spoofchk' for allowing PF control over
its VF spoof-checking configuration.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  4 ++
 drivers/net/ethernet/qlogic/qed/qed_l2.h     |  2 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 91 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  2 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 11 ++++
 include/linux/qed/qed_iov_if.h               |  2 +
 6 files changed, 112 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 8d83250aa5ba..e0275a78b121 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -381,6 +381,10 @@ int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 	p_ramrod->common.tx_switching_en = p_params->tx_switching_flg;
 	p_cmn->update_tx_switching_en_flg = p_params->update_tx_switching_flg;
 
+	p_cmn->anti_spoofing_en = p_params->anti_spoofing_en;
+	val = p_params->update_anti_spoofing_en_flg;
+	p_ramrod->common.update_anti_spoofing_en_flg = val;
+
 	rc = qed_sp_vport_update_rss(p_hwfn, p_ramrod, p_rss_params);
 	if (rc) {
 		/* Return spq entry which is taken in qed_sp_init_request()*/
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index fad30ae12f63..a04fb7f061ea 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -149,6 +149,8 @@ struct qed_sp_vport_update_params {
 	u8				update_tx_switching_flg;
 	u8				tx_switching_flg;
 	u8				update_approx_mcast_flg;
+	u8				update_anti_spoofing_en_flg;
+	u8				anti_spoofing_en;
 	u8				update_accept_any_vlan_flg;
 	u8				accept_any_vlan;
 	unsigned long			bins[8];
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index c9a3bb63cc87..804102c257e6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1234,6 +1234,39 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static int __qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn,
+				  struct qed_vf_info *p_vf, bool val)
+{
+	struct qed_sp_vport_update_params params;
+	int rc;
+
+	if (val == p_vf->spoof_chk) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk value[%d] is already configured\n", val);
+		return 0;
+	}
+
+	memset(&params, 0, sizeof(struct qed_sp_vport_update_params));
+	params.opaque_fid = p_vf->opaque_fid;
+	params.vport_id = p_vf->vport_id;
+	params.update_anti_spoofing_en_flg = 1;
+	params.anti_spoofing_en = val;
+
+	rc = qed_sp_vport_update(p_hwfn, &params, QED_SPQ_MODE_EBLOCK, NULL);
+	if (rc) {
+		p_vf->spoof_chk = val;
+		p_vf->req_spoofchk_val = p_vf->spoof_chk;
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk val[%d] configured\n", val);
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk configuration[val:%d] failed for VF[%d]\n",
+			   val, p_vf->relative_vf_id);
+	}
+
+	return rc;
+}
+
 static int qed_iov_reconfigure_unicast_vlan(struct qed_hwfn *p_hwfn,
 					    struct qed_vf_info *p_vf)
 {
@@ -1476,6 +1509,8 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 
 		/* Force configuration if needed on the newly opened vport */
 		qed_iov_configure_vport_forced(p_hwfn, vf, *p_bitmap);
+
+		__qed_iov_spoofchk_set(p_hwfn, vf, vf->req_spoofchk_val);
 	}
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_START,
 			     sizeof(struct pfvf_def_resp_tlv), status);
@@ -1489,6 +1524,7 @@ static void qed_iov_vf_mbx_stop_vport(struct qed_hwfn *p_hwfn,
 	int rc;
 
 	vf->vport_instance--;
+	vf->spoof_chk = false;
 
 	rc = qed_sp_vport_stop(p_hwfn, vf->opaque_fid, vf->vport_id);
 	if (rc != 0) {
@@ -2782,6 +2818,17 @@ void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
 }
 
+static bool qed_iov_vf_has_vport_instance(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *p_vf_info;
+
+	p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf_info)
+		return false;
+
+	return !!p_vf_info->vport_instance;
+}
+
 bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
@@ -2793,6 +2840,34 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+int qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, int vfid, bool val)
+{
+	struct qed_vf_info *vf;
+	int rc = -EINVAL;
+
+	if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+		DP_NOTICE(p_hwfn,
+			  "SR-IOV sanity check failed, can't set spoofchk\n");
+		goto out;
+	}
+
+	vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf)
+		goto out;
+
+	if (!qed_iov_vf_has_vport_instance(p_hwfn, vfid)) {
+		/* After VF VPORT start PF will configure spoof check */
+		vf->req_spoofchk_val = val;
+		rc = 0;
+		goto out;
+	}
+
+	rc = __qed_iov_spoofchk_set(p_hwfn, vf, val);
+
+out:
+	return rc;
+}
+
 static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn,
 					   u16 rel_vf_id)
 {
@@ -3179,6 +3254,21 @@ static int qed_set_vf_link_state(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_spoof_configure(struct qed_dev *cdev, int vfid, bool val)
+{
+	int i, rc = -EINVAL;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		rc = qed_iov_spoofchk_set(p_hwfn, vfid, val);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
 static int qed_configure_max_vf_rate(struct qed_dev *cdev, int vfid, int rate)
 {
 	int i;
@@ -3417,5 +3507,6 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
 	.set_link_state = &qed_set_vf_link_state,
+	.set_spoof = &qed_spoof_configure,
 	.set_rate = &qed_set_vf_rate,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index ab3d291cf7f0..c8667c65e685 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -154,6 +154,8 @@ struct qed_vf_info {
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
 	u8 num_active_rxqs;
 	struct qed_public_vf_info p_vf_info;
+	bool spoof_chk;
+	bool req_spoofchk_val;
 
 	/* Stores the configuration requested by VF */
 	struct qed_vf_shadow_config shadow_config;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3d0f98f81122..a908bd69d252 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1802,6 +1802,16 @@ static int qede_set_vf_rate(struct net_device *dev, int vfidx,
 					max_tx_rate);
 }
 
+static int qede_set_vf_spoofchk(struct net_device *dev, int vfidx, bool val)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->set_spoof(edev->cdev, vfidx, val);
+}
+
 static int qede_set_vf_link_state(struct net_device *dev, int vfidx,
 				  int link_state)
 {
@@ -2142,6 +2152,7 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_get_stats64 = qede_get_stats64,
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_link_state = qede_set_vf_link_state,
+	.ndo_set_vf_spoofchk = qede_set_vf_spoofchk,
 	.ndo_set_vf_rate = qede_set_vf_rate,
 #endif
 #ifdef CONFIG_QEDE_VXLAN
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index f364f2bd7a4d..2596d30d9e63 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -22,6 +22,8 @@ struct qed_iov_hv_ops {
 	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
 			       int link_state);
 
+	int (*set_spoof) (struct qed_dev *cdev, int vfid, bool val);
+
 	int (*set_rate) (struct qed_dev *cdev, int vfid,
 			 u32 min_rate, u32 max_rate);
 };
-- 
cgit v1.2.3


From 73390ac9d82bf9f0c849ff57b06a03145fbf05d6 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:24 +0300
Subject: qed*: support ndo_get_vf_config

Allows the user to view the VF configuration by observing the PF's
device.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 93 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c | 12 ++++
 include/linux/qed/qed_iov_if.h               |  3 +
 3 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 804102c257e6..6af8fd9fd560 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -2588,6 +2588,30 @@ void qed_iov_set_link(struct qed_hwfn *p_hwfn,
 	p_bulletin->capability_speed = p_caps->speed_capabilities;
 }
 
+static void qed_iov_get_link(struct qed_hwfn *p_hwfn,
+			     u16 vfid,
+			     struct qed_mcp_link_params *p_params,
+			     struct qed_mcp_link_state *p_link,
+			     struct qed_mcp_link_capabilities *p_caps)
+{
+	struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn,
+						       vfid,
+						       false);
+	struct qed_bulletin_content *p_bulletin;
+
+	if (!p_vf)
+		return;
+
+	p_bulletin = p_vf->bulletin.p_virt;
+
+	if (p_params)
+		__qed_vf_get_link_params(p_hwfn, p_params, p_bulletin);
+	if (p_link)
+		__qed_vf_get_link_state(p_hwfn, p_link, p_bulletin);
+	if (p_caps)
+		__qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin);
+}
+
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt, int vfid)
 {
@@ -2840,6 +2864,17 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+static bool qed_iov_spoofchk_get(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return false;
+
+	return vf_info->spoof_chk;
+}
+
 int qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, int vfid, bool val)
 {
 	struct qed_vf_info *vf;
@@ -2937,6 +2972,23 @@ int qed_iov_configure_min_tx_rate(struct qed_dev *cdev, int vfid, u32 rate)
 	return qed_configure_vport_wfq(cdev, vport_id, rate);
 }
 
+static int qed_iov_get_vf_min_rate(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_wfq_data *vf_vp_wfq;
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return 0;
+
+	vf_vp_wfq = &p_hwfn->qm_info.wfq_data[vf_info->vport_id];
+
+	if (vf_vp_wfq->configured)
+		return vf_vp_wfq->min_speed;
+	else
+		return 0;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -3153,6 +3205,46 @@ static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
 	return 0;
 }
 
+static int qed_get_vf_config(struct qed_dev *cdev,
+			     int vf_id, struct ifla_vf_info *ivi)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_public_vf_info *vf_info;
+	struct qed_mcp_link_state link;
+	u32 tx_rate;
+
+	/* Sanitize request */
+	if (IS_VF(cdev))
+		return -EINVAL;
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "VF index [%d] isn't active\n", vf_id);
+		return -EINVAL;
+	}
+
+	vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true);
+
+	qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL);
+
+	/* Fill information about VF */
+	ivi->vf = vf_id;
+
+	if (is_valid_ether_addr(vf_info->forced_mac))
+		ether_addr_copy(ivi->mac, vf_info->forced_mac);
+	else
+		ether_addr_copy(ivi->mac, vf_info->mac);
+
+	ivi->vlan = vf_info->forced_vlan;
+	ivi->spoofchk = qed_iov_spoofchk_get(hwfn, vf_id);
+	ivi->linkstate = vf_info->link_state;
+	tx_rate = vf_info->tx_rate;
+	ivi->max_tx_rate = tx_rate ? tx_rate : link.speed;
+	ivi->min_tx_rate = qed_iov_get_vf_min_rate(hwfn, vf_id);
+
+	return 0;
+}
+
 void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 {
 	struct qed_mcp_link_capabilities caps;
@@ -3506,6 +3598,7 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
+	.get_config = &qed_get_vf_config,
 	.set_link_state = &qed_set_vf_link_state,
 	.set_spoof = &qed_spoof_configure,
 	.set_rate = &qed_set_vf_rate,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index a908bd69d252..7130ee7f87da 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1793,6 +1793,17 @@ static struct rtnl_link_stats64 *qede_get_stats64(
 }
 
 #ifdef CONFIG_QED_SRIOV
+static int qede_get_vf_config(struct net_device *dev, int vfidx,
+			      struct ifla_vf_info *ivi)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->get_config(edev->cdev, vfidx, ivi);
+}
+
 static int qede_set_vf_rate(struct net_device *dev, int vfidx,
 			    int min_tx_rate, int max_tx_rate)
 {
@@ -2153,6 +2164,7 @@ static const struct net_device_ops qede_netdev_ops = {
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_link_state = qede_set_vf_link_state,
 	.ndo_set_vf_spoofchk = qede_set_vf_spoofchk,
+	.ndo_get_vf_config = qede_get_vf_config,
 	.ndo_set_vf_rate = qede_set_vf_rate,
 #endif
 #ifdef CONFIG_QEDE_VXLAN
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 2596d30d9e63..5a4f8d0899e9 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -19,6 +19,9 @@ struct qed_iov_hv_ops {
 
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 
+	int (*get_config) (struct qed_dev *cdev, int vf_id,
+			   struct ifla_vf_info *ivi);
+
 	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
 			       int link_state);
 
-- 
cgit v1.2.3


From 831bfb0e88b54726d6e027a1d547066ffeb8b27e Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:25 +0300
Subject: qed*: Tx-switching configuration

Device should be configured by default to VEB once VFs are active.
This changes the configuration of both PFs' and VFs' vports into enabling
tx-switching once sriov is enabled.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  4 ++++
 drivers/net/ethernet/qlogic/qed/qed_l2.h          |  1 +
 drivers/net/ethernet/qlogic/qed/qed_main.c        |  1 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  5 ++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       |  1 +
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 12 ++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 24 ++++++++++++++++++++++-
 include/linux/qed/qed_eth_if.h                    |  2 ++
 include/linux/qed/qed_if.h                        |  1 +
 11 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index acaa2866dae3..6fb6016409c6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -688,7 +688,8 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		qed_int_igu_enable(p_hwfn, p_ptt, int_mode);
 
 		/* send function start command */
-		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode);
+		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode,
+				     allow_npar_tx_switch);
 		if (rc)
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e0275a78b121..8fba87dd48af 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -99,6 +99,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 		break;
 	}
 
+	p_ramrod->tx_switching_en = p_params->tx_switching;
+
 	/* Software Function ID in hwfn (PFs are 0 - 15, VFs are 16 - 135) */
 	p_ramrod->sw_fid = qed_concrete_to_sw_fid(p_hwfn->cdev,
 						  p_params->concrete_fid);
@@ -1792,6 +1794,8 @@ static int qed_update_vport(struct qed_dev *cdev,
 		params->update_vport_active_flg;
 	sp_params.vport_active_rx_flg = params->vport_active_flg;
 	sp_params.vport_active_tx_flg = params->vport_active_flg;
+	sp_params.update_tx_switching_flg = params->update_tx_switching_flg;
+	sp_params.tx_switching_flg = params->tx_switching_flg;
 	sp_params.accept_any_vlan = params->accept_any_vlan;
 	sp_params.update_accept_any_vlan_flg =
 		params->update_accept_any_vlan_flg;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index a04fb7f061ea..002114543451 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -94,6 +94,7 @@ enum qed_tpa_mode {
 struct qed_sp_vport_start_params {
 	enum qed_tpa_mode tpa_mode;
 	bool remove_inner_vlan;
+	bool tx_switching;
 	bool only_untagged;
 	bool drop_ttl0;
 	u8 max_buffers_per_cqe;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index dcb782c14e5c..6ffc21da1415 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -216,6 +216,7 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 		dev_info->fw_rev = FW_REVISION_VERSION;
 		dev_info->fw_eng = FW_ENGINEERING_VERSION;
 		dev_info->mf_mode = cdev->mf_mode;
+		dev_info->tx_switching = true;
 	} else {
 		qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major,
 				      &dev_info->fw_minor, &dev_info->fw_rev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index c2999cb5d1e2..ab5549f4e5ea 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -344,13 +344,14 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  * @param p_hwfn
  * @param p_tunn
  * @param mode
+ * @param allow_npar_tx_switch
  *
  * @return int
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
-		    enum qed_mf_mode mode);
+		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
 
 /**
  * @brief qed_sp_pf_stop - PF Function Stop Ramrod
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index ed90947c451d..8c555ed1f949 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -299,7 +299,7 @@ qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
-		    enum qed_mf_mode mode)
+		    enum qed_mf_mode mode, bool allow_npar_tx_switch)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
 	u16 sb = qed_int_get_sp_sb_id(p_hwfn);
@@ -358,6 +358,9 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 				     &p_ramrod->tunnel_config);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
+	if (IS_MF_SI(p_hwfn))
+		p_ramrod->allow_npar_tx_switching = allow_npar_tx_switch;
+
 	if (p_hwfn->cdev->p_iov_info) {
 		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 6af8fd9fd560..d4df406ac0a4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1490,6 +1490,7 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 
 	params.tpa_mode = start->tpa_mode;
 	params.remove_inner_vlan = start->inner_vlan_removal;
+	params.tx_switching = true;
 
 	params.only_untagged = vf_info->bulletin.p_virt->default_only_untagged;
 	params.drop_ttl0 = false;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index db14e230c9a4..72e69c0ec10d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -633,6 +633,18 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 		}
 	}
 
+	if (p_params->update_tx_switching_flg) {
+		struct vfpf_vport_update_tx_switch_tlv *p_tx_switch_tlv;
+
+		size = sizeof(struct vfpf_vport_update_tx_switch_tlv);
+		tlv = CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH;
+		p_tx_switch_tlv = qed_add_tlv(p_hwfn, &p_iov->offset,
+					      tlv, size);
+		resp_size += sizeof(struct pfvf_def_resp_tlv);
+
+		p_tx_switch_tlv->tx_switching = p_params->tx_switching_flg;
+	}
+
 	if (p_params->update_approx_mcast_flg) {
 		struct vfpf_vport_update_mcast_bin_tlv *p_mcast_tlv;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 7130ee7f87da..8114541f327c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -137,10 +137,26 @@ static int qede_set_vf_mac(struct net_device *ndev, int vfidx, u8 *mac)
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
+	struct qed_dev_info *qed_info = &edev->dev_info.common;
+	int rc;
 
 	DP_VERBOSE(edev, QED_MSG_IOV, "Requested %d VFs\n", num_vfs_param);
 
-	return edev->ops->iov->configure(edev->cdev, num_vfs_param);
+	rc = edev->ops->iov->configure(edev->cdev, num_vfs_param);
+
+	/* Enable/Disable Tx switching for PF */
+	if ((rc == num_vfs_param) && netif_running(edev->ndev) &&
+	    qed_info->mf_mode != QED_MF_NPAR && qed_info->tx_switching) {
+		struct qed_update_vport_params params;
+
+		memset(&params, 0, sizeof(params));
+		params.vport_id = 0;
+		params.update_tx_switching_flg = 1;
+		params.tx_switching_flg = num_vfs_param ? 1 : 0;
+		edev->ops->vport_update(edev->cdev, &params);
+	}
+
+	return rc;
 }
 #endif
 
@@ -3291,6 +3307,12 @@ static int qede_start_queues(struct qede_dev *edev)
 	vport_update_params.update_vport_active_flg = 1;
 	vport_update_params.vport_active_flg = 1;
 
+	if ((qed_info->mf_mode == QED_MF_NPAR || pci_num_vf(edev->pdev)) &&
+	    qed_info->tx_switching) {
+		vport_update_params.update_tx_switching_flg = 1;
+		vport_update_params.tx_switching_flg = 1;
+	}
+
 	/* Fill struct with RSS params */
 	if (QEDE_RSS_CNT(edev) > 1) {
 		vport_update_params.update_rss_flg = 1;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e0f6e6482031..6ae8cb4a61d3 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -35,6 +35,8 @@ struct qed_update_vport_params {
 	u8 vport_id;
 	u8 update_vport_active_flg;
 	u8 vport_active_flg;
+	u8 update_tx_switching_flg;
+	u8 tx_switching_flg;
 	u8 update_accept_any_vlan_flg;
 	u8 accept_any_vlan;
 	u8 update_rss_flg;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 76a6f168a190..0fd8f247e65f 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -93,6 +93,7 @@ struct qed_dev_info {
 
 	u32		flash_size;
 	u8		mf_mode;
+	bool		tx_switching;
 };
 
 enum qed_sb_type {
-- 
cgit v1.2.3


From 9dc0b289c4c09bc1a92bdcc055cb37af9b72eb28 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:39 +0000
Subject: net/mlx5_core: Firmware commands to support flow counters

Getting packet/byte statistics on flows is done through flow counters.
Implement the firmware commands to alloc, free and query flow counters.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c    |  6 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 66 ++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h |  5 ++
 include/linux/mlx5/mlx5_ifc.h                    | 99 +++++++++++++++++++++++-
 4 files changed, 173 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 63cac841cfa9..dcd2df6518de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -294,6 +294,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
 	case MLX5_CMD_OP_DESTROY_FLOW_GROUP:
 	case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -395,6 +396,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -539,6 +542,9 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY);
 	MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY);
 	MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 9797768891ee..ccb63a0bb54a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -323,3 +323,69 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
 	return err;
 }
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(alloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					 sizeof(out));
+	if (err)
+		return err;
+
+	*id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+
+	return 0;
+}
+
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(dealloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
+	MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id,
+		      u64 *packets, u64 *bytes)
+{
+	u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter)];
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)];
+	void *stats;
+	int err = 0;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, id);
+
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics);
+	*packets = MLX5_GET64(traffic_counter, stats, packets);
+	*bytes = MLX5_GET64(traffic_counter, stats, octets);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index c97b4a03eeed..18c111a4691f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -70,4 +70,9 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 			    struct mlx5_flow_table *ft);
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id);
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id);
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id,
+		      u64 *packets, u64 *bytes);
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4ce4ea422a10..614c795eadea 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -202,6 +202,9 @@ enum {
 	MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY          = 0x936,
 	MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY        = 0x937,
 	MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY       = 0x938,
+	MLX5_CMD_OP_ALLOC_FLOW_COUNTER            = 0x939,
+	MLX5_CMD_OP_DEALLOC_FLOW_COUNTER          = 0x93a,
+	MLX5_CMD_OP_QUERY_FLOW_COUNTER            = 0x93b,
 	MLX5_CMD_OP_MODIFY_FLOW_TABLE             = 0x93c
 };
 
@@ -265,7 +268,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 
 struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         ft_support[0x1];
-	u8         reserved_at_1[0x2];
+	u8         reserved_at_1[0x1];
+	u8         flow_counter[0x1];
 	u8	   flow_modify_en[0x1];
 	u8         modify_root[0x1];
 	u8         identified_miss_table_mode[0x1];
@@ -941,6 +945,19 @@ struct mlx5_ifc_dest_format_struct_bits {
 	u8         reserved_at_20[0x20];
 };
 
+struct mlx5_ifc_flow_counter_list_bits {
+	u8         reserved_at_0[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_20[0x20];
+};
+
+union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits {
+	struct mlx5_ifc_dest_format_struct_bits dest_format_struct;
+	struct mlx5_ifc_flow_counter_list_bits flow_counter_list;
+	u8         reserved_at_0[0x40];
+};
+
 struct mlx5_ifc_fte_match_param_bits {
 	struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers;
 
@@ -2006,6 +2023,7 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_ALLOW     = 0x1,
 	MLX5_FLOW_CONTEXT_ACTION_DROP      = 0x2,
 	MLX5_FLOW_CONTEXT_ACTION_FWD_DEST  = 0x4,
+	MLX5_FLOW_CONTEXT_ACTION_COUNT     = 0x8,
 };
 
 struct mlx5_ifc_flow_context_bits {
@@ -2022,13 +2040,16 @@ struct mlx5_ifc_flow_context_bits {
 	u8         reserved_at_80[0x8];
 	u8         destination_list_size[0x18];
 
-	u8         reserved_at_a0[0x160];
+	u8         reserved_at_a0[0x8];
+	u8         flow_counter_list_size[0x18];
+
+	u8         reserved_at_c0[0x140];
 
 	struct mlx5_ifc_fte_match_param_bits match_value;
 
 	u8         reserved_at_1200[0x600];
 
-	struct mlx5_ifc_dest_format_struct_bits destination[0];
+	union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[0];
 };
 
 enum {
@@ -3937,6 +3958,34 @@ struct mlx5_ifc_query_flow_group_in_bits {
 	u8         reserved_at_e0[0x120];
 };
 
+struct mlx5_ifc_query_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	struct mlx5_ifc_traffic_counter_bits flow_statistics[0];
+};
+
+struct mlx5_ifc_query_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x80];
+
+	u8         clear[0x1];
+	u8         reserved_at_c1[0xf];
+	u8         num_of_counters[0x10];
+
+	u8         reserved_at_e0[0x10];
+	u8         flow_counter_id[0x10];
+};
+
 struct mlx5_ifc_query_esw_vport_context_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -5510,6 +5559,28 @@ struct mlx5_ifc_dealloc_pd_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_dealloc_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
 struct mlx5_ifc_create_xrc_srq_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -6237,6 +6308,28 @@ struct mlx5_ifc_alloc_pd_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_alloc_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_add_vxlan_udp_dport_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From bd5251dbf156b6bc0661a9409d46e47160df61dd Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:40 +0000
Subject: net/mlx5_core: Introduce flow steering destination of type counter

When adding a flow steering rule with a counter, need to supply a
destination of type MLX5_FLOW_DESTINATION_TYPE_COUNTER, with a pointer
to a struct mlx5_fc.
Also, MLX5_FLOW_CONTEXT_ACTION_COUNT bit should be set in the action.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  | 36 +++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 52 +++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 23 ++++++++++
 include/linux/mlx5/fs.h                           |  2 +
 include/linux/mlx5/mlx5_ifc.h                     |  2 +
 6 files changed, 106 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index ccb63a0bb54a..a5bb6b695242 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -241,17 +241,20 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
 	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->flow_tag);
 	MLX5_SET(flow_context, in_flow_context, action, fte->action);
-	MLX5_SET(flow_context, in_flow_context, destination_list_size,
-		 fte->dests_size);
 	in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context,
 				      match_value);
 	memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param));
 
+	in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination);
 	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination);
+		int list_size = 0;
+
 		list_for_each_entry(dst, &fte->node.children, node.list) {
 			unsigned int id;
 
+			if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
 			MLX5_SET(dest_format_struct, in_dests, destination_type,
 				 dst->dest_attr.type);
 			if (dst->dest_attr.type ==
@@ -262,8 +265,31 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			}
 			MLX5_SET(dest_format_struct, in_dests, destination_id, id);
 			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
+		}
+
+		MLX5_SET(flow_context, in_flow_context, destination_list_size,
+			 list_size);
+	}
+
+	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		int list_size = 0;
+
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			if (dst->dest_attr.type !=
+			    MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			MLX5_SET(flow_counter_list, in_dests, flow_counter_id,
+				 dst->dest_attr.counter->id);
+			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
 		}
+
+		MLX5_SET(flow_context, in_flow_context, flow_counter_list_size,
+			 list_size);
 	}
+
 	memset(out, 0, sizeof(out));
 	err = mlx5_cmd_exec_check_status(dev, in, inlen, out,
 					 sizeof(out));
@@ -283,18 +309,16 @@ int mlx5_cmd_create_fte(struct mlx5_core_dev *dev,
 int mlx5_cmd_update_fte(struct mlx5_core_dev *dev,
 			struct mlx5_flow_table *ft,
 			unsigned group_id,
+			int modify_mask,
 			struct fs_fte *fte)
 {
 	int opmod;
-	int modify_mask;
 	int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev,
 						flow_table_properties_nic_receive.
 						flow_modify_en);
 	if (!atomic_mod_cap)
 		return -ENOTSUPP;
 	opmod = 1;
-	modify_mask = 1 <<
-		MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST;
 
 	return	mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, group_id, fte);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 18c111a4691f..fc4f7b83fe0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -62,6 +62,7 @@ int mlx5_cmd_create_fte(struct mlx5_core_dev *dev,
 int mlx5_cmd_update_fte(struct mlx5_core_dev *dev,
 			struct mlx5_flow_table *ft,
 			unsigned group_id,
+			int modify_mask,
 			struct fs_fte *fte);
 
 int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 659a6980cda2..9420def3a2fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -344,6 +344,7 @@ static void del_rule(struct fs_node *node)
 	struct mlx5_flow_group *fg;
 	struct fs_fte *fte;
 	u32	*match_value;
+	int modify_mask;
 	struct mlx5_core_dev *dev = get_dev(node);
 	int match_len = MLX5_ST_SZ_BYTES(fte_match_param);
 	int err;
@@ -367,8 +368,11 @@ static void del_rule(struct fs_node *node)
 	}
 	if ((fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
 	    --fte->dests_size) {
+		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST),
 		err = mlx5_cmd_update_fte(dev, ft,
-					  fg->id, fte);
+					  fg->id,
+					  modify_mask,
+					  fte);
 		if (err)
 			pr_warn("%s can't del rule fg id=%d fte_index=%d\n",
 				__func__, fg->id, fte->index);
@@ -615,6 +619,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
 	struct fs_fte *fte;
+	int modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
 	int err = 0;
 
 	fs_get_obj(fte, rule->node.parent);
@@ -626,7 +631,9 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 
 	memcpy(&rule->dest_attr, dest, sizeof(*dest));
 	err = mlx5_cmd_update_fte(get_dev(&ft->node),
-				  ft, fg->id, fte);
+				  ft, fg->id,
+				  modify_mask,
+				  fte);
 	unlock_ref_node(&fte->node);
 
 	return err;
@@ -877,6 +884,7 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_rule *rule;
+	int modify_mask = 0;
 	int err;
 
 	rule = alloc_rule(dest);
@@ -892,14 +900,20 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 		list_add(&rule->node.list, &fte->node.children);
 	else
 		list_add_tail(&rule->node.list, &fte->node.children);
-	if (dest)
+	if (dest) {
 		fte->dests_size++;
+
+		modify_mask |= dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER ?
+			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS) :
+			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	}
+
 	if (fte->dests_size == 1 || !dest)
 		err = mlx5_cmd_create_fte(get_dev(&ft->node),
 					  ft, fg->id, fte);
 	else
 		err = mlx5_cmd_update_fte(get_dev(&ft->node),
-					  ft, fg->id, fte);
+					  ft, fg->id, modify_mask, fte);
 	if (err)
 		goto free_rule;
 
@@ -1092,10 +1106,40 @@ unlock_fg:
 	return rule;
 }
 
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule)
+{
+	struct mlx5_flow_rule *dst;
+	struct fs_fte *fte;
+
+	fs_get_obj(fte, rule->node.parent);
+
+	fs_for_each_dst(dst, fte) {
+		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+			return dst->dest_attr.counter;
+	}
+
+	return NULL;
+}
+
+static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
+{
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
+		return !counter;
+
+	if (!counter)
+		return false;
+
+	/* Hardware support counter for a drop action only */
+	return action == (MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT);
+}
+
 static bool dest_is_valid(struct mlx5_flow_destination *dest,
 			  u32 action,
 			  struct mlx5_flow_table *ft)
 {
+	if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER))
+		return counter_is_valid(dest->counter, action);
+
 	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
 		return true;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 8e76cc505f5a..1989048ebdfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -96,6 +96,28 @@ struct mlx5_flow_table {
 	struct list_head		fwd_rules;
 };
 
+struct mlx5_fc_cache {
+	u64 packets;
+	u64 bytes;
+	u64 lastuse;
+};
+
+struct mlx5_fc {
+	struct list_head list;
+
+	/* last{packets,bytes} members are used when calculating the delta since
+	 * last reading
+	 */
+	u64 lastpackets;
+	u64 lastbytes;
+
+	u16 id;
+	bool deleted;
+	bool aging;
+
+	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+};
+
 /* Type of children is mlx5_flow_rule */
 struct fs_fte {
 	struct fs_node			node;
@@ -105,6 +127,7 @@ struct fs_fte {
 	u32				index;
 	u32				action;
 	enum fs_fte_status		status;
+	struct mlx5_fc			*counter;
 };
 
 /* Type of children is mlx5_flow_table/namespace */
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6467569ad76e..c8b9ede1c20a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -73,6 +73,7 @@ struct mlx5_flow_destination {
 		u32			tir_num;
 		struct mlx5_flow_table	*ft;
 		u32			vport_num;
+		struct mlx5_fc		*counter;
 	};
 };
 
@@ -125,4 +126,5 @@ void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
 int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 				 struct mlx5_flow_destination *dest);
 
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 614c795eadea..9a05cd7e5890 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -936,6 +936,8 @@ enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_VPORT        = 0x0,
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE   = 0x1,
 	MLX5_FLOW_DESTINATION_TYPE_TIR          = 0x2,
+
+	MLX5_FLOW_DESTINATION_TYPE_COUNTER      = 0x100,
 };
 
 struct mlx5_ifc_dest_format_struct_bits {
-- 
cgit v1.2.3


From 43a335e055bb7ebdc8a68ce7362ef26ef5bda92b Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:41 +0000
Subject: net/mlx5_core: Flow counters infrastructure

If a counter has the aging flag set when created, it is added to a list
of counters that will be queried periodically from a workqueue.  query
result and last use timestamp are cached.
add/del counter must be very efficient since thousands of such
operations might be issued in a second.
There is only a single reference to counters without aging, therefore
no need for locks.
But, counters with aging enabled are stored in a list. In order to make
code as lockless as possible, all the list manipulation and access to
hardware is done from a single context - the periodic counters query
thread.

The hardware supports multiple counters per FTE, however currently we
are using one counter for each FTE.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   7 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   3 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 226 +++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  14 ++
 include/linux/mlx5/fs.h                            |   5 +
 6 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index b531d4f3c00b..9ea7b583096a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
 
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
-		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o
+		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o fs_counters.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \
 		en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9420def3a2fe..8b5f0b2c0d5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1771,6 +1771,7 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns);
 	cleanup_single_prio_root_ns(dev, dev->priv.esw_egress_root_ns);
 	cleanup_single_prio_root_ns(dev, dev->priv.esw_ingress_root_ns);
+	mlx5_cleanup_fc_stats(dev);
 }
 
 static int init_fdb_root_ns(struct mlx5_core_dev *dev)
@@ -1827,10 +1828,14 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 {
 	int err = 0;
 
+	err = mlx5_init_fc_stats(dev);
+	if (err)
+		return err;
+
 	if (MLX5_CAP_GEN(dev, nic_flow_table)) {
 		err = init_root_ns(dev);
 		if (err)
-			return err;
+			goto err;
 	}
 	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
 		err = init_fdb_root_ns(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 1989048ebdfd..aa41a7314691 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -169,6 +169,9 @@ struct mlx5_flow_root_namespace {
 	struct mutex			chain_lock;
 };
 
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
+
 int mlx5_init_fs(struct mlx5_core_dev *dev);
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
new file mode 100644
index 000000000000..164dc37fda72
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_core.h"
+#include "fs_core.h"
+#include "fs_cmd.h"
+
+#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
+
+/* locking scheme:
+ *
+ * It is the responsibility of the user to prevent concurrent calls or bad
+ * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference
+ * to struct mlx5_fc.
+ * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a
+ * dump (access to struct mlx5_fc) after a counter is destroyed.
+ *
+ * access to counter list:
+ * - create (user context)
+ *   - mlx5_fc_create() only adds to an addlist to be used by
+ *     mlx5_fc_stats_query_work(). addlist is protected by a spinlock.
+ *   - spawn thread to do the actual destroy
+ *
+ * - destroy (user context)
+ *   - mark a counter as deleted
+ *   - spawn thread to do the actual del
+ *
+ * - dump (user context)
+ *   user should not call dump after destroy
+ *
+ * - query (single thread workqueue context)
+ *   destroy/dump - no conflict (see destroy)
+ *   query/dump - packets and bytes might be inconsistent (since update is not
+ *                atomic)
+ *   query/create - no conflict (see create)
+ *   since every create/destroy spawn the work, only after necessary time has
+ *   elapsed, the thread will actually query the hardware.
+ */
+
+static void mlx5_fc_stats_work(struct work_struct *work)
+{
+	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
+						 priv.fc_stats.work.work);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	unsigned long now = jiffies;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+	int err = 0;
+
+	spin_lock(&fc_stats->addlist_lock);
+
+	list_splice_tail_init(&fc_stats->addlist, &fc_stats->list);
+
+	if (!list_empty(&fc_stats->list))
+		queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD);
+
+	spin_unlock(&fc_stats->addlist_lock);
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->list, list) {
+		struct mlx5_fc_cache *c = &counter->cache;
+		u64 packets;
+		u64 bytes;
+
+		if (counter->deleted) {
+			list_del(&counter->list);
+
+			mlx5_cmd_fc_free(dev, counter->id);
+
+			kfree(counter);
+			continue;
+		}
+
+		if (time_before(now, fc_stats->next_query))
+			continue;
+
+		err = mlx5_cmd_fc_query(dev, counter->id, &packets, &bytes);
+		if (err) {
+			pr_err("Error querying stats for counter id %d\n",
+			       counter->id);
+			continue;
+		}
+
+		if (packets == c->packets)
+			continue;
+
+		c->lastuse = jiffies;
+		c->packets = packets;
+		c->bytes   = bytes;
+	}
+
+	if (time_after_eq(now, fc_stats->next_query))
+		fc_stats->next_query = now + MLX5_FC_STATS_PERIOD;
+}
+
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	int err;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_cmd_fc_alloc(dev, &counter->id);
+	if (err)
+		goto err_out;
+
+	if (aging) {
+		counter->aging = true;
+
+		spin_lock(&fc_stats->addlist_lock);
+		list_add(&counter->list, &fc_stats->addlist);
+		spin_unlock(&fc_stats->addlist_lock);
+
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+	}
+
+	return counter;
+
+err_out:
+	kfree(counter);
+
+	return ERR_PTR(err);
+}
+
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (!counter)
+		return;
+
+	if (counter->aging) {
+		counter->deleted = true;
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+		return;
+	}
+
+	mlx5_cmd_fc_free(dev, counter->id);
+	kfree(counter);
+}
+
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	INIT_LIST_HEAD(&fc_stats->list);
+	INIT_LIST_HEAD(&fc_stats->addlist);
+	spin_lock_init(&fc_stats->addlist_lock);
+
+	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
+	if (!fc_stats->wq)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
+
+	return 0;
+}
+
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+
+	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
+	destroy_workqueue(dev->priv.fc_stats.wq);
+	dev->priv.fc_stats.wq = NULL;
+
+	list_splice_tail_init(&fc_stats->addlist, &fc_stats->list);
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->list, list) {
+		list_del(&counter->list);
+
+		mlx5_cmd_fc_free(dev, counter->id);
+
+		kfree(counter);
+	}
+}
+
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse)
+{
+	struct mlx5_fc_cache c;
+
+	c = counter->cache;
+
+	*bytes = c.bytes - counter->lastbytes;
+	*packets = c.packets - counter->lastpackets;
+	*lastuse = c.lastuse;
+
+	counter->lastbytes = c.bytes;
+	counter->lastpackets = c.packets;
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9613143f0561..07b504f7eb84 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/radix-tree.h>
+#include <linux/workqueue.h>
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
@@ -457,6 +458,17 @@ struct mlx5_irq_info {
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
+struct mlx5_fc_stats {
+	struct list_head list;
+	struct list_head addlist;
+	/* protect addlist add/splice operations */
+	spinlock_t addlist_lock;
+
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	unsigned long next_query;
+};
+
 struct mlx5_eswitch;
 
 struct mlx5_priv {
@@ -520,6 +532,8 @@ struct mlx5_priv {
 	struct mlx5_flow_root_namespace *fdb_root_ns;
 	struct mlx5_flow_root_namespace *esw_egress_root_ns;
 	struct mlx5_flow_root_namespace *esw_ingress_root_ns;
+
+	struct mlx5_fc_stats		fc_stats;
 };
 
 enum mlx5_device_state {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index c8b9ede1c20a..4b7a107d9c19 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -127,4 +127,9 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 				 struct mlx5_flow_destination *dest);
 
 struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse);
+
 #endif
-- 
cgit v1.2.3


From c94987e40ebbae3b7b6c3ece37b6f8338830f6b1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:27 +0200
Subject: bpf: move bpf_jit_enable declaration

Move the bpf_jit_enable declaration to the filter.h file where
most other core code is declared, also since we're going to add
a second knob there.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h    | 2 ++
 include/linux/netdevice.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ec1411c89105..4ff0e647598f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,6 +496,8 @@ void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
 #ifdef CONFIG_BPF_JIT
+extern int bpf_jit_enable;
+
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
 struct bpf_binary_header *
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c2f5112f08f7..c148edfe4965 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3759,7 +3759,6 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
-extern int		bpf_jit_enable;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
-- 
cgit v1.2.3


From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:30 +0200
Subject: bpf: add bpf_patch_insn_single helper

Move the functionality to patch instructions out of the verifier
code and into the core as the new bpf_patch_insn_single() helper
will be needed later on for blinding as well. No changes in
functionality.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c      | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c  | 53 +++++++------------------------------
 3 files changed, 83 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ff0e647598f..c4aae496f376 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -495,6 +495,9 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len);
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5313d09d4b62..49b5538a5301 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_JMP  &&
+	       /* Call and Exit are both special jumps with no
+		* target inside the BPF instruction image.
+		*/
+	       BPF_OP(insn->code) != BPF_CALL &&
+	       BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+	struct bpf_insn *insn = prog->insnsi;
+	u32 i, insn_cnt = prog->len;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_is_jmp_and_has_target(insn))
+			continue;
+
+		/* Adjust offset of jmps if we cross boundaries. */
+		if (i < pos && i + insn->off + 1 > pos)
+			insn->off += delta;
+		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+			insn->off -= delta;
+	}
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len)
+{
+	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	struct bpf_prog *prog_adj;
+
+	/* Since our patchlet doesn't expand the image, we're done. */
+	if (insn_delta == 0) {
+		memcpy(prog->insnsi + off, patch, sizeof(*patch));
+		return prog;
+	}
+
+	insn_adj_cnt = prog->len + insn_delta;
+
+	/* Several new instructions need to be inserted. Make room
+	 * for them. Likely, there's no need for a new allocation as
+	 * last page could have large enough tailroom.
+	 */
+	prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+				    GFP_USER);
+	if (!prog_adj)
+		return NULL;
+
+	prog_adj->len = insn_adj_cnt;
+
+	/* Patching happens in 3 steps:
+	 *
+	 * 1) Move over tail of insnsi from next instruction onwards,
+	 *    so we can patch the single target insn with one or more
+	 *    new ones (patching is always from 1 to n insns, n > 0).
+	 * 2) Inject new instructions at the target location.
+	 * 3) Adjust branch offsets if necessary.
+	 */
+	insn_rest = insn_adj_cnt - off - len;
+
+	memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+		sizeof(*patch) * insn_rest);
+	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+	bpf_adj_branches(prog_adj, off, insn_delta);
+
+	return prog_adj;
+}
+
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 84bff68cf80e..a08d66215245 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 			insn->src_reg = 0;
 }
 
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
-	struct bpf_insn *insn = prog->insnsi;
-	int insn_cnt = prog->len;
-	int i;
-
-	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (BPF_CLASS(insn->code) != BPF_JMP ||
-		    BPF_OP(insn->code) == BPF_CALL ||
-		    BPF_OP(insn->code) == BPF_EXIT)
-			continue;
-
-		/* adjust offset of jmps if necessary */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
-	}
-}
-
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
@@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
-	u32 cnt;
-	int i;
 	enum bpf_access_type type;
+	int i;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		u32 insn_delta, cnt;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
 			type = BPF_READ;
 		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (cnt == 1) {
-			memcpy(insn, insn_buf, sizeof(*insn));
-			continue;
-		}
-
-		/* several new insns need to be inserted. Make room for them */
-		insn_cnt += cnt - 1;
-		new_prog = bpf_prog_realloc(env->prog,
-					    bpf_prog_size(insn_cnt),
-					    GFP_USER);
+		new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
-		new_prog->len = insn_cnt;
-
-		memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
-			sizeof(*insn) * (insn_cnt - i - cnt));
-
-		/* copy substitute insns in place of load instruction */
-		memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
-
-		/* adjust branches in the whole program */
-		adjust_branches(new_prog, i, cnt - 1);
+		insn_delta = cnt - 1;
 
 		/* keep walking new program and skip insns we just inserted */
 		env->prog = new_prog;
-		insn = new_prog->insnsi + i + cnt - 1;
-		i += cnt - 1;
+		insn      = new_prog->insnsi + i + insn_delta;
+
+		insn_cnt += insn_delta;
+		i        += insn_delta;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:31 +0200
Subject: bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis

Since the blinding is strictly only called from inside eBPF JITs,
we need to change signatures for bpf_int_jit_compile() and
bpf_prog_select_runtime() first in order to prepare that the
eBPF program we're dealing with can change underneath. Hence,
for call sites, we need to return the latest prog. No functional
change in this patch.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c |  7 ++++---
 arch/s390/net/bpf_jit_comp.c  |  8 +++++---
 arch/x86/net/bpf_jit_comp.c   |  7 ++++---
 include/linux/filter.h        |  5 +++--
 kernel/bpf/core.c             | 18 ++++++++++++++----
 kernel/bpf/syscall.c          |  2 +-
 lib/test_bpf.c                |  5 ++++-
 net/core/filter.c             |  6 +++++-
 8 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index ef35e866caf7..dd428807cb30 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -762,7 +762,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 	/* Nothing to do here. We support Internal BPF. */
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header;
 	struct jit_ctx ctx;
@@ -770,14 +770,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	u8 *image_ptr;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.prog = prog;
 
 	ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
 	if (ctx.offset == NULL)
-		return;
+		return prog;
 
 	/* 1. Initial fake pass to compute ctx->idx. */
 
@@ -828,6 +828,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	prog->jited = 1;
 out:
 	kfree(ctx.offset);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *prog)
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 3c0bfc1f2694..fcf301a889e7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1262,18 +1262,19 @@ void bpf_jit_compile(struct bpf_prog *fp)
 /*
  * Compile eBPF program "fp"
  */
-void bpf_int_jit_compile(struct bpf_prog *fp)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	struct bpf_binary_header *header;
 	struct bpf_jit jit;
 	int pass;
 
 	if (!bpf_jit_enable)
-		return;
+		return fp;
+
 	memset(&jit, 0, sizeof(jit));
 	jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
 	if (jit.addrs == NULL)
-		return;
+		return fp;
 	/*
 	 * Three initial passes:
 	 *   - 1/2: Determine clobbered registers
@@ -1305,6 +1306,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp)
 	}
 free_addrs:
 	kfree(jit.addrs);
+	return fp;
 }
 
 /*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f5bfd4fd28dd..6b2d23ea3590 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1073,7 +1073,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 {
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
 	int proglen, oldproglen = 0;
@@ -1084,11 +1084,11 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	int i;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
 	if (!addrs)
-		return;
+		return prog;
 
 	/* Before first pass, make a rough estimation of addrs[]
 	 * each bpf instruction is translated to less than 64 bytes
@@ -1140,6 +1140,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 out:
 	kfree(addrs);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *fp)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c4aae496f376..891852cf7716 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-int bpf_prog_select_runtime(struct bpf_prog *fp);
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
@@ -492,7 +492,8 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-void bpf_int_jit_compile(struct bpf_prog *fp);
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_skb_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49b5538a5301..70f0821aca47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 /**
  *	bpf_prog_select_runtime - select exec runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
+ *	@err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
  * The BPF program will be executed via BPF_PROG_RUN() macro.
  */
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
-	bpf_int_jit_compile(fp);
+	/* eBPF JITs can rewrite the program in case constant
+	 * blinding is active. However, in case of error during
+	 * blinding, bpf_int_jit_compile() must always return a
+	 * valid program, which in this case would simply not
+	 * be JITed, but falls back to the interpreter.
+	 */
+	fp = bpf_int_jit_compile(fp);
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
 	 * with JITed or non JITed program concatenations and not
 	 * all eBPF JITs might immediately support all features.
 	 */
-	return bpf_check_tail_call(fp);
+	*err = bpf_check_tail_call(fp);
+
+	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
@@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = {
 };
 
 /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	return prog;
 }
 
 bool __weak bpf_helper_changes_skb_data(void *func)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5e9f7ad13a..46ecce4b79ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	err = bpf_prog_select_runtime(prog);
+	prog = bpf_prog_select_runtime(prog, &err);
 	if (err < 0)
 		goto free_used_maps;
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8f22fbedc3a6..93f45011a59d 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		fp->type = BPF_PROG_TYPE_SOCKET_FILTER;
 		memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
 
-		bpf_prog_select_runtime(fp);
+		/* We cannot error here as we don't need type compatibility
+		 * checks.
+		 */
+		fp = bpf_prog_select_runtime(fp, err);
 		break;
 	}
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ea51b479cf02..68adb5f52110 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 		 */
 		goto out_err_free;
 
-	bpf_prog_select_runtime(fp);
+	/* We are guaranteed to never error here with cBPF to eBPF
+	 * transitions, since there's no issue with type compatibility
+	 * checks on program arrays.
+	 */
+	fp = bpf_prog_select_runtime(fp, &err);
 
 	kfree(old_prog);
 	return fp;
-- 
cgit v1.2.3


From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:32 +0200
Subject: bpf: add generic constant blinding for use in jits

This work adds a generic facility for use from eBPF JIT compilers
that allows for further hardening of JIT generated images through
blinding constants. In response to the original work on BPF JIT
spraying published by Keegan McAllister [1], most BPF JITs were
changed to make images read-only and start at a randomized offset
in the page, where the rest was filled with trap instructions. We
have this nowadays in x86, arm, arm64 and s390 JIT compilers.
Additionally, later work also made eBPF interpreter images read
only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86,
arm, arm64 and s390 archs as well currently. This is done by
default for mentioned JITs when JITing is enabled. Furthermore,
we had a generic and configurable constant blinding facility on our
todo for quite some time now to further make spraying harder, and
first implementation since around netconf 2016.

We found that for systems where untrusted users can load cBPF/eBPF
code where JIT is enabled, start offset randomization helps a bit
to make jumps into crafted payload harder, but in case where larger
programs that cross page boundary are injected, we again have some
part of the program opcodes at a page start offset. With improved
guessing and more reliable payload injection, chances can increase
to jump into such payload. Elena Reshetova recently wrote a test
case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which
can leave some more room for payloads. Note that for all this,
additional bugs in the kernel are still required to make the jump
(and of course to guess right, to not jump into a trap) and naturally
the JIT must be enabled, which is disabled by default.

For helping mitigation, the general idea is to provide an option
bpf_jit_harden that admins can tweak along with bpf_jit_enable, so
that for cases where JIT should be enabled for performance reasons,
the generated image can be further hardened with blinding constants
for unpriviledged users (bpf_jit_harden == 1), with trading off
performance for these, but not for privileged ones. We also added
the option of blinding for all users (bpf_jit_harden == 2), which
is quite helpful for testing f.e. with test_bpf.ko. There are no
further e.g. hardening levels of bpf_jit_harden switch intended,
rationale is to have it dead simple to use as on/off. Since this
functionality would need to be duplicated over and over for JIT
compilers to use, which are already complex enough, we provide a
generic eBPF byte-code level based blinding implementation, which is
then just transparently JITed. JIT compilers need to make only a few
changes to integrate this facility and can be migrated one by one.

This option is for eBPF JITs and will be used in x86, arm64, s390
without too much effort, and soon ppc64 JITs, thus that native eBPF
can be blinded as well as cBPF to eBPF migrations, so that both can
be covered with a single implementation. The rule for JITs is that
bpf_jit_blind_constants() must be called from bpf_int_jit_compile(),
and in case blinding is disabled, we follow normally with JITing the
passed program. In case blinding is enabled and we fail during the
process of blinding itself, we must return with the interpreter.
Similarly, in case the JITing process after the blinding failed, we
return normally to the interpreter with the non-blinded code. Meaning,
interpreter doesn't change in any way and operates on eBPF code as
usual. For doing this pre-JIT blinding step, we need to make use of
a helper/auxiliary register, here BPF_REG_AX. This is strictly internal
to the JIT and not in any way part of the eBPF architecture. Just like
in the same way as JITs internally make use of some helper registers
when emitting code, only that here the helper register is one
abstraction level higher in eBPF bytecode, but nevertheless in JIT
phase. That helper register is needed since f.e. manually written
program can issue loads to all registers of eBPF architecture.

The core concept with the additional register is: blind out all 32
and 64 bit constants by converting BPF_K based instructions into a
small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this
is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND,
and REG <OP> BPF_REG_AX, so actual operation on the target register
is translated from BPF_K into BPF_X one that is operating on
BPF_REG_AX's content. During rewriting phase when blinding, RND is
newly generated via prandom_u32() for each processed instruction.
64 bit loads are split into two 32 bit loads to make translation and
patching not too complex. Only basic thing required by JITs is to
call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair, and to map BPF_REG_AX into an unused register.

Small bpf_jit_disasm extract from [2] when applied to x86 JIT:

echo 0 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f5e9 + <x>:
  [...]
  39:   mov    $0xa8909090,%eax
  3e:   mov    $0xa8909090,%eax
  43:   mov    $0xa8ff3148,%eax
  48:   mov    $0xa89081b4,%eax
  4d:   mov    $0xa8900bb0,%eax
  52:   mov    $0xa810e0c1,%eax
  57:   mov    $0xa8908eb4,%eax
  5c:   mov    $0xa89020b0,%eax
  [...]

echo 1 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f1e5 + <x>:
  [...]
  39:   mov    $0xe1192563,%r10d
  3f:   xor    $0x4989b5f3,%r10d
  46:   mov    %r10d,%eax
  49:   mov    $0xb8296d93,%r10d
  4f:   xor    $0x10b9fd03,%r10d
  56:   mov    %r10d,%eax
  59:   mov    $0x8c381146,%r10d
  5f:   xor    $0x24c7200e,%r10d
  66:   mov    %r10d,%eax
  69:   mov    $0xeb2a830e,%r10d
  6f:   xor    $0x43ba02ba,%r10d
  76:   mov    %r10d,%eax
  79:   mov    $0xd9730af,%r10d
  7f:   xor    $0xa5073b1f,%r10d
  86:   mov    %r10d,%eax
  89:   mov    $0x9a45662b,%r10d
  8f:   xor    $0x325586ea,%r10d
  96:   mov    %r10d,%eax
  [...]

As can be seen, original constants that carry payload are hidden
when enabled, actual operations are transformed from constant-based
to register-based ones, making jumps into constants ineffective.
Above extract/example uses single BPF load instruction over and
over, but of course all instructions with constants are blinded.

Performance wise, JIT with blinding performs a bit slower than just
JIT and faster than interpreter case. This is expected, since we
still get all the performance benefits from JITing and in normal
use-cases not every single instruction needs to be blinded. Summing
up all 296 test cases averaged over multiple runs from test_bpf.ko
suite, interpreter was 55% slower than JIT only and JIT with blinding
was 8% slower than JIT only. Since there are also some extremes in
the test suite, I expect for ordinary workloads that the performance
for the JIT with blinding case is even closer to JIT only case,
f.e. nmap test case from suite has averaged timings in ns 29 (JIT),
35 (+ blinding), and 151 (interpreter).

BPF test suite, seccomp test suite, eBPF sample code and various
bigger networking eBPF programs have been tested with this and were
running fine. For testing purposes, I also adapted interpreter and
redirected blinded eBPF image to interpreter and also here all tests
pass.

  [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html
  [2] https://github.com/01org/jit-spray-poc-for-ksp/
  [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Elena Reshetova <elena.reshetova@intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt |  11 +++
 include/linux/filter.h       |  42 +++++++++
 kernel/bpf/core.c            | 203 +++++++++++++++++++++++++++++++++++++++++++
 net/Kconfig                  |   7 +-
 net/core/sysctl_net_core.c   |   9 ++
 5 files changed, 270 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 809ab6efcc74..f0480f7ea740 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -43,6 +43,17 @@ Values :
 	1 - enable the JIT
 	2 - enable the JIT and ask the compiler to emit traces on kernel log.
 
+bpf_jit_harden
+--------------
+
+This enables hardening for the Berkeley Packet Filter Just in Time compiler.
+Supported are eBPF JIT backends. Enabling hardening trades off performance,
+but can mitigate JIT spraying.
+Values :
+	0 - disable JIT hardening (default value)
+	1 - enable JIT hardening for unprivileged users only
+	2 - enable JIT hardening for all users
+
 dev_weight
 --------------
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 891852cf7716..6fc31ef1da2d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -13,6 +13,8 @@
 #include <linux/printk.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
+
 #include <net/sch_generic.h>
 
 #include <asm/cacheflush.h>
@@ -42,6 +44,15 @@ struct bpf_prog_aux;
 #define BPF_REG_X	BPF_REG_7
 #define BPF_REG_TMP	BPF_REG_8
 
+/* Kernel hidden auxiliary/helper register for hardening step.
+ * Only used by eBPF JITs. It's nothing more than a temporary
+ * register that JITs use internally, only that here it's part
+ * of eBPF instructions that have been rewritten for blinding
+ * constants. See JIT pre-step in bpf_jit_blind_constants().
+ */
+#define BPF_REG_AX		MAX_BPF_REG
+#define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
+
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
@@ -501,6 +512,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
+extern int bpf_jit_harden;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
@@ -513,6 +525,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr);
 void bpf_jit_compile(struct bpf_prog *fp);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);
+
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
 {
@@ -523,6 +538,33 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
 }
+
+static inline bool bpf_jit_is_ebpf(void)
+{
+# ifdef CONFIG_HAVE_EBPF_JIT
+	return true;
+# else
+	return false;
+# endif
+}
+
+static inline bool bpf_jit_blinding_enabled(void)
+{
+	/* These are the prerequisites, should someone ever have the
+	 * idea to call blinding outside of them, we make sure to
+	 * bail out.
+	 */
+	if (!bpf_jit_is_ebpf())
+		return false;
+	if (!bpf_jit_enable)
+		return false;
+	if (!bpf_jit_harden)
+		return false;
+	if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN))
+		return false;
+
+	return true;
+}
 #else
 static inline void bpf_jit_compile(struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 70f0821aca47..f1e8a0def99b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	module_memfree(hdr);
 }
+
+int bpf_jit_harden __read_mostly;
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+			      const struct bpf_insn *aux,
+			      struct bpf_insn *to_buff)
+{
+	struct bpf_insn *to = to_buff;
+	u32 imm_rnd = prandom_u32();
+	s16 off;
+
+	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+	if (from->imm == 0 &&
+	    (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
+	     from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+		*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+		goto out;
+	}
+
+	switch (from->code) {
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU | BPF_OR  | BPF_K:
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_OR  | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_JMP | BPF_JEQ  | BPF_K:
+	case BPF_JMP | BPF_JNE  | BPF_K:
+	case BPF_JMP | BPF_JGT  | BPF_K:
+	case BPF_JMP | BPF_JGE  | BPF_K:
+	case BPF_JMP | BPF_JSGT | BPF_K:
+	case BPF_JMP | BPF_JSGE | BPF_K:
+	case BPF_JMP | BPF_JSET | BPF_K:
+		/* Accommodate for extra offset in case of a backjump. */
+		off = from->off;
+		if (off < 0)
+			off -= 2;
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+		break;
+
+	case BPF_LD | BPF_ABS | BPF_W:
+	case BPF_LD | BPF_ABS | BPF_H:
+	case BPF_LD | BPF_ABS | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IND | BPF_W:
+	case BPF_LD | BPF_IND | BPF_H:
+	case BPF_LD | BPF_IND | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IMM | BPF_DW:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+		*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+		break;
+	case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_DW:
+	case BPF_ST | BPF_MEM | BPF_W:
+	case BPF_ST | BPF_MEM | BPF_H:
+	case BPF_ST | BPF_MEM | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+		break;
+	}
+out:
+	return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+					      gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_prog *fp;
+
+	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
+		/* aux->prog still points to the fp_other one, so
+		 * when promoting the clone to the real program,
+		 * this still needs to be adapted.
+		 */
+		memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+	}
+
+	return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+	/* aux was stolen by the other clone, so we cannot free
+	 * it from this path! It will be freed eventually by the
+	 * other program on release.
+	 *
+	 * At this point, we don't need a deferred release since
+	 * clone is guaranteed to not be locked.
+	 */
+	fp->aux = NULL;
+	__bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+	/* We have to repoint aux->prog to self, as we don't
+	 * know whether fp here is the clone or the original.
+	 */
+	fp->aux->prog = fp;
+	bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+	struct bpf_insn insn_buff[16], aux[2];
+	struct bpf_prog *clone, *tmp;
+	int insn_delta, insn_cnt;
+	struct bpf_insn *insn;
+	int i, rewritten;
+
+	if (!bpf_jit_blinding_enabled())
+		return prog;
+
+	clone = bpf_prog_clone_create(prog, GFP_USER);
+	if (!clone)
+		return ERR_PTR(-ENOMEM);
+
+	insn_cnt = clone->len;
+	insn = clone->insnsi;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		/* We temporarily need to hold the original ld64 insn
+		 * so that we can still access the first part in the
+		 * second blinding run.
+		 */
+		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    insn[1].code == 0)
+			memcpy(aux, insn, sizeof(aux));
+
+		rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+		if (!rewritten)
+			continue;
+
+		tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+		if (!tmp) {
+			/* Patching may have repointed aux->prog during
+			 * realloc from the original one, so we need to
+			 * fix it up here on error.
+			 */
+			bpf_jit_prog_release_other(prog, clone);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		clone = tmp;
+		insn_delta = rewritten - 1;
+
+		/* Walk new program and skip insns we just inserted. */
+		insn = clone->insnsi + i + insn_delta;
+		insn_cnt += insn_delta;
+		i        += insn_delta;
+	}
+
+	return clone;
+}
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
diff --git a/net/Kconfig b/net/Kconfig
index f7148f24f114..ff40562a782c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -295,8 +295,11 @@ config BPF_JIT
 	  Berkeley Packet Filter filtering capabilities are normally handled
 	  by an interpreter. This option allows kernel to generate a native
 	  code when filter is loaded in memory. This should speedup
-	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
-	  this feature changing /proc/sys/net/core/bpf_jit_enable
+	  packet sniffing (libpcap/tcpdump).
+
+	  Note, admin should enable this feature changing:
+	  /proc/sys/net/core/bpf_jit_enable
+	  /proc/sys/net/core/bpf_jit_harden (optional)
 
 config NET_FLOW_LIMIT
 	bool
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a6beb7b6ae55..0df2aa652530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+# ifdef CONFIG_HAVE_EBPF_JIT
+	{
+		.procname	= "bpf_jit_harden",
+		.data		= &bpf_jit_harden,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
+# endif
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-- 
cgit v1.2.3


From 39651abd28146fff2bfac63d68a7a56250a4aead Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Tue, 17 May 2016 06:44:26 -0400
Subject: qed: add support for dcbx.

This patch adds the necessary driver support for Management Firmware to
configure the device/firmware with the dcbx results. Management Firmware
is responsible for communicating the DCBX and driving the negotiation,
but the driver has responsibility of receiving async notification and
configuring the results in hw/fw. This patch also adds the dcbx support for
future protocols (e.g., FCoE) as preparation to their imminent submission.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h             |   2 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |  10 +
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c        | 562 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_dcbx.h        |  80 +++
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 105 +++-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  21 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |  13 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  13 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  25 +
 include/linux/qed/qed_if.h                        |   9 +
 11 files changed, 834 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index a44874562cfd..d1f157e439cf 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -2,5 +2,5 @@ obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
-	 qed_selftest.o
+	 qed_selftest.o qed_dcbx.o
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 77323fc70927..1042f2af854a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -367,6 +367,8 @@ struct qed_hwfn {
 	struct qed_pf_iov		*pf_iov_info;
 	struct qed_mcp_info		*mcp_info;
 
+	struct qed_dcbx_info		*p_dcbx_info;
+
 	struct qed_hw_cid_data		*p_tx_cids;
 	struct qed_hw_cid_data		*p_rx_cids;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index 078ff3fd7920..234c0fa8db2a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -130,6 +130,16 @@ void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn);
 
 void qed_qm_init_pf(struct qed_hwfn *p_hwfn);
 
+/**
+ * @brief Reconfigures QM pf on the fly
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ *
+ * @return int
+ */
+int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
 /**
  * @brief qed_cxt_release - Release a cid
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
new file mode 100644
index 000000000000..cbf58e1f9333
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -0,0 +1,562 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "qed.h"
+#include "qed_cxt.h"
+#include "qed_dcbx.h"
+#include "qed_hsi.h"
+#include "qed_sp.h"
+
+#define QED_DCBX_MAX_MIB_READ_TRY       (100)
+#define QED_ETH_TYPE_DEFAULT            (0)
+#define QED_ETH_TYPE_ROCE               (0x8915)
+#define QED_UDP_PORT_TYPE_ROCE_V2       (0x12B7)
+#define QED_ETH_TYPE_FCOE               (0x8906)
+#define QED_TCP_PORT_ISCSI              (0xCBC)
+
+#define QED_DCBX_INVALID_PRIORITY       0xFF
+
+/* Get Traffic Class from priority traffic class table, 4 bits represent
+ * the traffic class corresponding to the priority.
+ */
+#define QED_DCBX_PRIO2TC(prio_tc_tbl, prio) \
+	((u32)(prio_tc_tbl >> ((7 - prio) * 4)) & 0x7)
+
+static const struct qed_dcbx_app_metadata qed_dcbx_app_update[] = {
+	{DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH}
+};
+
+static bool qed_dcbx_app_ethtype(u32 app_info_bitmap)
+{
+	return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) ==
+		  DCBX_APP_SF_ETHTYPE);
+}
+
+static bool qed_dcbx_app_port(u32 app_info_bitmap)
+{
+	return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) ==
+		  DCBX_APP_SF_PORT);
+}
+
+static bool qed_dcbx_default_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_DEFAULT);
+}
+
+static bool qed_dcbx_iscsi_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_port(app_info_bitmap) &&
+		  proto_id == QED_TCP_PORT_ISCSI);
+}
+
+static bool qed_dcbx_fcoe_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_FCOE);
+}
+
+static bool qed_dcbx_roce_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_ROCE);
+}
+
+static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_port(app_info_bitmap) &&
+		  proto_id == QED_UDP_PORT_TYPE_ROCE_V2);
+}
+
+static void
+qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data)
+{
+	enum dcbx_protocol_type id;
+	int i;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "DCBX negotiated: %d\n",
+		   p_data->dcbx_enabled);
+
+	for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) {
+		id = qed_dcbx_app_update[i].id;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_DCB,
+			   "%s info: update %d, enable %d, prio %d, tc %d, num_tc %d\n",
+			   qed_dcbx_app_update[i].name, p_data->arr[id].update,
+			   p_data->arr[id].enable, p_data->arr[id].priority,
+			   p_data->arr[id].tc, p_hwfn->hw_info.num_tc);
+	}
+}
+
+static void
+qed_dcbx_set_params(struct qed_dcbx_results *p_data,
+		    struct qed_hw_info *p_info,
+		    bool enable,
+		    bool update,
+		    u8 prio,
+		    u8 tc,
+		    enum dcbx_protocol_type type,
+		    enum qed_pci_personality personality)
+{
+	/* PF update ramrod data */
+	p_data->arr[type].update = update;
+	p_data->arr[type].enable = enable;
+	p_data->arr[type].priority = prio;
+	p_data->arr[type].tc = tc;
+
+	/* QM reconf data */
+	if (p_info->personality == personality) {
+		if (personality == QED_PCI_ETH)
+			p_info->non_offload_tc = tc;
+		else
+			p_info->offload_tc = tc;
+	}
+}
+
+/* Update app protocol data and hw_info fields with the TLV info */
+static void
+qed_dcbx_update_app_info(struct qed_dcbx_results *p_data,
+			 struct qed_hwfn *p_hwfn,
+			 bool enable,
+			 bool update,
+			 u8 prio, u8 tc, enum dcbx_protocol_type type)
+{
+	struct qed_hw_info *p_info = &p_hwfn->hw_info;
+	enum qed_pci_personality personality;
+	enum dcbx_protocol_type id;
+	char *name;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) {
+		id = qed_dcbx_app_update[i].id;
+
+		if (type != id)
+			continue;
+
+		personality = qed_dcbx_app_update[i].personality;
+		name = qed_dcbx_app_update[i].name;
+
+		qed_dcbx_set_params(p_data, p_info, enable, update,
+				    prio, tc, type, personality);
+	}
+}
+
+static bool
+qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn,
+			       u32 app_prio_bitmap,
+			       u16 id, enum dcbx_protocol_type *type)
+{
+	if (qed_dcbx_fcoe_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_FCOE;
+	} else if (qed_dcbx_roce_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ROCE;
+	} else if (qed_dcbx_iscsi_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ISCSI;
+	} else if (qed_dcbx_default_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ETH;
+	} else if (qed_dcbx_roce_v2_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ROCE_V2;
+	} else {
+		*type = DCBX_MAX_PROTOCOL_TYPE;
+		DP_ERR(p_hwfn,
+		       "No action required, App TLV id = 0x%x app_prio_bitmap = 0x%x\n",
+		       id, app_prio_bitmap);
+		return false;
+	}
+
+	return true;
+}
+
+/* Parse app TLV's to update TC information in hw_info structure for
+ * reconfiguring QM. Get protocol specific data for PF update ramrod command.
+ */
+static int
+qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
+		     struct qed_dcbx_results *p_data,
+		     struct dcbx_app_priority_entry *p_tbl,
+		     u32 pri_tc_tbl, int count, bool dcbx_enabled)
+{
+	u8 tc, priority, priority_map;
+	enum dcbx_protocol_type type;
+	u16 protocol_id;
+	bool enable;
+	int i;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Num APP entries = %d\n", count);
+
+	/* Parse APP TLV */
+	for (i = 0; i < count; i++) {
+		protocol_id = QED_MFW_GET_FIELD(p_tbl[i].entry,
+						DCBX_APP_PROTOCOL_ID);
+		priority_map = QED_MFW_GET_FIELD(p_tbl[i].entry,
+						 DCBX_APP_PRI_MAP);
+		priority = ffs(priority_map) - 1;
+		if (priority < 0) {
+			DP_ERR(p_hwfn, "Invalid priority\n");
+			return -EINVAL;
+		}
+
+		tc = QED_DCBX_PRIO2TC(pri_tc_tbl, priority);
+		if (qed_dcbx_get_app_protocol_type(p_hwfn, p_tbl[i].entry,
+						   protocol_id, &type)) {
+			/* ETH always have the enable bit reset, as it gets
+			 * vlan information per packet. For other protocols,
+			 * should be set according to the dcbx_enabled
+			 * indication, but we only got here if there was an
+			 * app tlv for the protocol, so dcbx must be enabled.
+			 */
+			enable = !!(type == DCBX_PROTOCOL_ETH);
+
+			qed_dcbx_update_app_info(p_data, p_hwfn, enable, true,
+						 priority, tc, type);
+		}
+	}
+
+	/* If RoCE-V2 TLV is not detected, driver need to use RoCE app
+	 * data for RoCE-v2 not the default app data.
+	 */
+	if (!p_data->arr[DCBX_PROTOCOL_ROCE_V2].update &&
+	    p_data->arr[DCBX_PROTOCOL_ROCE].update) {
+		tc = p_data->arr[DCBX_PROTOCOL_ROCE].tc;
+		priority = p_data->arr[DCBX_PROTOCOL_ROCE].priority;
+		qed_dcbx_update_app_info(p_data, p_hwfn, true, true,
+					 priority, tc, DCBX_PROTOCOL_ROCE_V2);
+	}
+
+	/* Update ramrod protocol data and hw_info fields
+	 * with default info when corresponding APP TLV's are not detected.
+	 * The enabled field has a different logic for ethernet as only for
+	 * ethernet dcb should disabled by default, as the information arrives
+	 * from the OS (unless an explicit app tlv was present).
+	 */
+	tc = p_data->arr[DCBX_PROTOCOL_ETH].tc;
+	priority = p_data->arr[DCBX_PROTOCOL_ETH].priority;
+	for (type = 0; type < DCBX_MAX_PROTOCOL_TYPE; type++) {
+		if (p_data->arr[type].update)
+			continue;
+
+		enable = (type == DCBX_PROTOCOL_ETH) ? false : dcbx_enabled;
+		qed_dcbx_update_app_info(p_data, p_hwfn, enable, true,
+					 priority, tc, type);
+	}
+
+	return 0;
+}
+
+/* Parse app TLV's to update TC information in hw_info structure for
+ * reconfiguring QM. Get protocol specific data for PF update ramrod command.
+ */
+static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn)
+{
+	struct dcbx_app_priority_feature *p_app;
+	struct dcbx_app_priority_entry *p_tbl;
+	struct qed_dcbx_results data = { 0 };
+	struct dcbx_ets_feature *p_ets;
+	struct qed_hw_info *p_info;
+	u32 pri_tc_tbl, flags;
+	bool dcbx_enabled;
+	int num_entries;
+	int rc = 0;
+
+	/* If DCBx version is non zero, then negotiation was
+	 * successfuly performed
+	 */
+	flags = p_hwfn->p_dcbx_info->operational.flags;
+	dcbx_enabled = !!QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION);
+
+	p_app = &p_hwfn->p_dcbx_info->operational.features.app;
+	p_tbl = p_app->app_pri_tbl;
+
+	p_ets = &p_hwfn->p_dcbx_info->operational.features.ets;
+	pri_tc_tbl = p_ets->pri_tc_tbl[0];
+
+	p_info = &p_hwfn->hw_info;
+	num_entries = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_NUM_ENTRIES);
+
+	rc = qed_dcbx_process_tlv(p_hwfn, &data, p_tbl, pri_tc_tbl,
+				  num_entries, dcbx_enabled);
+	if (rc)
+		return rc;
+
+	p_info->num_tc = QED_MFW_GET_FIELD(p_ets->flags, DCBX_ETS_MAX_TCS);
+	data.pf_id = p_hwfn->rel_pf_id;
+	data.dcbx_enabled = dcbx_enabled;
+
+	qed_dcbx_dp_protocol(p_hwfn, &data);
+
+	memcpy(&p_hwfn->p_dcbx_info->results, &data,
+	       sizeof(struct qed_dcbx_results));
+
+	return 0;
+}
+
+static int
+qed_dcbx_copy_mib(struct qed_hwfn *p_hwfn,
+		  struct qed_ptt *p_ptt,
+		  struct qed_dcbx_mib_meta_data *p_data,
+		  enum qed_mib_read_type type)
+{
+	u32 prefix_seq_num, suffix_seq_num;
+	int read_count = 0;
+	int rc = 0;
+
+	/* The data is considered to be valid only if both sequence numbers are
+	 * the same.
+	 */
+	do {
+		if (type == QED_DCBX_REMOTE_LLDP_MIB) {
+			qed_memcpy_from(p_hwfn, p_ptt, p_data->lldp_remote,
+					p_data->addr, p_data->size);
+			prefix_seq_num = p_data->lldp_remote->prefix_seq_num;
+			suffix_seq_num = p_data->lldp_remote->suffix_seq_num;
+		} else {
+			qed_memcpy_from(p_hwfn, p_ptt, p_data->mib,
+					p_data->addr, p_data->size);
+			prefix_seq_num = p_data->mib->prefix_seq_num;
+			suffix_seq_num = p_data->mib->suffix_seq_num;
+		}
+		read_count++;
+
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_DCB,
+			   "mib type = %d, try count = %d prefix seq num  = %d suffix seq num = %d\n",
+			   type, read_count, prefix_seq_num, suffix_seq_num);
+	} while ((prefix_seq_num != suffix_seq_num) &&
+		 (read_count < QED_DCBX_MAX_MIB_READ_TRY));
+
+	if (read_count >= QED_DCBX_MAX_MIB_READ_TRY) {
+		DP_ERR(p_hwfn,
+		       "MIB read err, mib type = %d, try count = %d prefix seq num = %d suffix seq num = %d\n",
+		       type, read_count, prefix_seq_num, suffix_seq_num);
+		rc = -EIO;
+	}
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_local_lldp_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port,
+							   lldp_config_params);
+	data.lldp_local = p_hwfn->p_dcbx_info->lldp_local;
+	data.size = sizeof(struct lldp_config_params_s);
+	qed_memcpy_from(p_hwfn, p_ptt, data.lldp_local, data.addr, data.size);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_remote_lldp_mib(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port,
+							   lldp_status_params);
+	data.lldp_remote = p_hwfn->p_dcbx_info->lldp_remote;
+	data.size = sizeof(struct lldp_status_params_s);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_operational_mib(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, operational_dcbx_mib);
+	data.mib = &p_hwfn->p_dcbx_info->operational;
+	data.size = sizeof(struct dcbx_mib);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_remote_mib(struct qed_hwfn *p_hwfn,
+			 struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, remote_dcbx_mib);
+	data.mib = &p_hwfn->p_dcbx_info->remote;
+	data.size = sizeof(struct dcbx_mib);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_local_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, local_admin_dcbx_mib);
+	data.local_admin = &p_hwfn->p_dcbx_info->local_admin;
+	data.size = sizeof(struct dcbx_local_params);
+	qed_memcpy_from(p_hwfn, p_ptt, data.local_admin, data.addr, data.size);
+
+	return rc;
+}
+
+static int qed_dcbx_read_mib(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	int rc = -EINVAL;
+
+	switch (type) {
+	case QED_DCBX_OPERATIONAL_MIB:
+		rc = qed_dcbx_read_operational_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_REMOTE_MIB:
+		rc = qed_dcbx_read_remote_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_LOCAL_MIB:
+		rc = qed_dcbx_read_local_mib(p_hwfn, p_ptt);
+		break;
+	case QED_DCBX_REMOTE_LLDP_MIB:
+		rc = qed_dcbx_read_remote_lldp_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_LOCAL_LLDP_MIB:
+		rc = qed_dcbx_read_local_lldp_mib(p_hwfn, p_ptt);
+		break;
+	default:
+		DP_ERR(p_hwfn, "MIB read err, unknown mib type %d\n", type);
+	}
+
+	return rc;
+}
+
+/* Read updated MIB.
+ * Reconfigure QM and invoke PF update ramrod command if operational MIB
+ * change is detected.
+ */
+int
+qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	int rc = 0;
+
+	rc = qed_dcbx_read_mib(p_hwfn, p_ptt, type);
+	if (rc)
+		return rc;
+
+	if (type == QED_DCBX_OPERATIONAL_MIB) {
+		rc = qed_dcbx_process_mib_info(p_hwfn);
+		if (!rc) {
+			/* reconfigure tcs of QM queues according
+			 * to negotiation results
+			 */
+			qed_qm_reconf(p_hwfn, p_ptt);
+
+			/* update storm FW with negotiation results */
+			qed_sp_pf_update(p_hwfn);
+		}
+	}
+
+	return rc;
+}
+
+int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn)
+{
+	int rc = 0;
+
+	p_hwfn->p_dcbx_info = kzalloc(sizeof(*p_hwfn->p_dcbx_info), GFP_KERNEL);
+	if (!p_hwfn->p_dcbx_info) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate 'struct qed_dcbx_info'\n");
+		rc = -ENOMEM;
+	}
+
+	return rc;
+}
+
+void qed_dcbx_info_free(struct qed_hwfn *p_hwfn,
+			struct qed_dcbx_info *p_dcbx_info)
+{
+	kfree(p_hwfn->p_dcbx_info);
+}
+
+static void qed_dcbx_update_protocol_data(struct protocol_dcb_data *p_data,
+					  struct qed_dcbx_results *p_src,
+					  enum dcbx_protocol_type type)
+{
+	p_data->dcb_enable_flag = p_src->arr[type].enable;
+	p_data->dcb_priority = p_src->arr[type].priority;
+	p_data->dcb_tc = p_src->arr[type].tc;
+}
+
+/* Set pf update ramrod command params */
+void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src,
+				   struct pf_update_ramrod_data *p_dest)
+{
+	struct protocol_dcb_data *p_dcb_data;
+	bool update_flag = false;
+
+	p_dest->pf_id = p_src->pf_id;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_FCOE].update;
+	p_dest->update_fcoe_dcb_data_flag = update_flag;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_ROCE].update;
+	p_dest->update_roce_dcb_data_flag = update_flag;
+	update_flag = p_src->arr[DCBX_PROTOCOL_ROCE_V2].update;
+	p_dest->update_roce_dcb_data_flag = update_flag;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_ISCSI].update;
+	p_dest->update_iscsi_dcb_data_flag = update_flag;
+	update_flag = p_src->arr[DCBX_PROTOCOL_ETH].update;
+	p_dest->update_eth_dcb_data_flag = update_flag;
+
+	p_dcb_data = &p_dest->fcoe_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_FCOE);
+	p_dcb_data = &p_dest->roce_dcb_data;
+
+	if (p_src->arr[DCBX_PROTOCOL_ROCE].update)
+		qed_dcbx_update_protocol_data(p_dcb_data, p_src,
+					      DCBX_PROTOCOL_ROCE);
+	if (p_src->arr[DCBX_PROTOCOL_ROCE_V2].update)
+		qed_dcbx_update_protocol_data(p_dcb_data, p_src,
+					      DCBX_PROTOCOL_ROCE_V2);
+
+	p_dcb_data = &p_dest->iscsi_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ISCSI);
+	p_dcb_data = &p_dest->eth_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ETH);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
new file mode 100644
index 000000000000..e7f834dbda2d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
@@ -0,0 +1,80 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_DCBX_H
+#define _QED_DCBX_H
+#include <linux/types.h>
+#include <linux/slab.h>
+#include "qed.h"
+#include "qed_hsi.h"
+#include "qed_hw.h"
+#include "qed_mcp.h"
+#include "qed_reg_addr.h"
+
+#define DCBX_CONFIG_MAX_APP_PROTOCOL    4
+
+enum qed_mib_read_type {
+	QED_DCBX_OPERATIONAL_MIB,
+	QED_DCBX_REMOTE_MIB,
+	QED_DCBX_LOCAL_MIB,
+	QED_DCBX_REMOTE_LLDP_MIB,
+	QED_DCBX_LOCAL_LLDP_MIB
+};
+
+struct qed_dcbx_app_data {
+	bool enable;		/* DCB enabled */
+	bool update;		/* Update indication */
+	u8 priority;		/* Priority */
+	u8 tc;			/* Traffic Class */
+};
+
+struct qed_dcbx_results {
+	bool dcbx_enabled;
+	u8 pf_id;
+	struct qed_dcbx_app_data arr[DCBX_MAX_PROTOCOL_TYPE];
+};
+
+struct qed_dcbx_app_metadata {
+	enum dcbx_protocol_type id;
+	char *name;
+	enum qed_pci_personality personality;
+};
+
+#define QED_MFW_GET_FIELD(name, field) \
+	(((name) & (field ## _MASK)) >> (field ## _SHIFT))
+
+struct qed_dcbx_info {
+	struct lldp_status_params_s lldp_remote[LLDP_MAX_LLDP_AGENTS];
+	struct lldp_config_params_s lldp_local[LLDP_MAX_LLDP_AGENTS];
+	struct dcbx_local_params local_admin;
+	struct qed_dcbx_results results;
+	struct dcbx_mib operational;
+	struct dcbx_mib remote;
+	u8 dcbx_cap;
+};
+
+struct qed_dcbx_mib_meta_data {
+	struct lldp_config_params_s *lldp_local;
+	struct lldp_status_params_s *lldp_remote;
+	struct dcbx_local_params *local_admin;
+	struct dcbx_mib *mib;
+	size_t size;
+	u32 addr;
+};
+
+/* QED local interface routines */
+int
+qed_dcbx_mib_update_event(struct qed_hwfn *,
+			  struct qed_ptt *, enum qed_mib_read_type);
+
+int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn);
+void qed_dcbx_info_free(struct qed_hwfn *, struct qed_dcbx_info *);
+void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src,
+				   struct pf_update_ramrod_data *p_dest);
+
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 6fb6016409c6..089016f46f26 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -22,6 +22,7 @@
 #include <linux/qed/qed_if.h>
 #include "qed.h"
 #include "qed_cxt.h"
+#include "qed_dcbx.h"
 #include "qed_dev_api.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -33,6 +34,9 @@
 #include "qed_sriov.h"
 #include "qed_vf.h"
 
+static spinlock_t qm_lock;
+static bool qm_lock_init = false;
+
 /* API common to all protocols */
 enum BAR_ID {
 	BAR_ID_0,       /* used for GRC */
@@ -147,6 +151,7 @@ void qed_resc_free(struct qed_dev *cdev)
 		qed_int_free(p_hwfn);
 		qed_iov_free(p_hwfn);
 		qed_dmae_info_free(p_hwfn);
+		qed_dcbx_info_free(p_hwfn, p_hwfn->p_dcbx_info);
 	}
 }
 
@@ -200,13 +205,19 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
 
 	/* First init per-TC PQs */
-	for (i = 0; i < multi_cos_tcs; i++, curr_queue++) {
+	for (i = 0; i < multi_cos_tcs; i++) {
 		struct init_qm_pq_params *params =
-		    &qm_info->qm_pq_params[curr_queue];
+		    &qm_info->qm_pq_params[curr_queue++];
 
-		params->vport_id = vport_id;
-		params->tc_id = p_hwfn->hw_info.non_offload_tc;
-		params->wrr_group = 1;
+		if (p_hwfn->hw_info.personality == QED_PCI_ETH) {
+			params->vport_id = vport_id;
+			params->tc_id = p_hwfn->hw_info.non_offload_tc;
+			params->wrr_group = 1;
+		} else {
+			params->vport_id = vport_id;
+			params->tc_id = p_hwfn->hw_info.offload_tc;
+			params->wrr_group = 1;
+		}
 	}
 
 	/* Then init pure-LB PQ */
@@ -266,6 +277,63 @@ alloc_err:
 	return -ENOMEM;
 }
 
+/* This function reconfigures the QM pf on the fly.
+ * For this purpose we:
+ * 1. reconfigure the QM database
+ * 2. set new values to runtime arrat
+ * 3. send an sdm_qm_cmd through the rbc interface to stop the QM
+ * 4. activate init tool in QM_PF stage
+ * 5. send an sdm_qm_cmd through rbc interface to release the QM
+ */
+int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
+	bool b_rc;
+	int rc;
+
+	/* qm_info is allocated in qed_init_qm_info() which is already called
+	 * from qed_resc_alloc() or previous call of qed_qm_reconf().
+	 * The allocated size may change each init, so we free it before next
+	 * allocation.
+	 */
+	qed_qm_info_free(p_hwfn);
+
+	/* initialize qed's qm data structure */
+	rc = qed_init_qm_info(p_hwfn);
+	if (rc)
+		return rc;
+
+	/* stop PF's qm queues */
+	spin_lock_bh(&qm_lock);
+	b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, false, true,
+				    qm_info->start_pq, qm_info->num_pqs);
+	spin_unlock_bh(&qm_lock);
+	if (!b_rc)
+		return -EINVAL;
+
+	/* clear the QM_PF runtime phase leftovers from previous init */
+	qed_init_clear_rt_data(p_hwfn);
+
+	/* prepare QM portion of runtime array */
+	qed_qm_init_pf(p_hwfn);
+
+	/* activate init tool on runtime array */
+	rc = qed_init_run(p_hwfn, p_ptt, PHASE_QM_PF, p_hwfn->rel_pf_id,
+			  p_hwfn->hw_info.hw_mode);
+	if (rc)
+		return rc;
+
+	/* start PF's qm queues */
+	spin_lock_bh(&qm_lock);
+	b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, true, true,
+				    qm_info->start_pq, qm_info->num_pqs);
+	spin_unlock_bh(&qm_lock);
+	if (!b_rc)
+		return -EINVAL;
+
+	return 0;
+}
+
 int qed_resc_alloc(struct qed_dev *cdev)
 {
 	struct qed_consq *p_consq;
@@ -375,6 +443,14 @@ int qed_resc_alloc(struct qed_dev *cdev)
 				  "Failed to allocate memory for dmae_info structure\n");
 			goto alloc_err;
 		}
+
+		/* DCBX initialization */
+		rc = qed_dcbx_info_alloc(p_hwfn);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to allocate memory for dcbx structure\n");
+			goto alloc_err;
+		}
 	}
 
 	cdev->reset_stats = kzalloc(sizeof(*cdev->reset_stats), GFP_KERNEL);
@@ -780,6 +856,11 @@ int qed_hw_init(struct qed_dev *cdev,
 		p_hwfn->first_on_engine = (load_code ==
 					   FW_MSG_CODE_DRV_LOAD_ENGINE);
 
+		if (!qm_lock_init) {
+			spin_lock_init(&qm_lock);
+			qm_lock_init = true;
+		}
+
 		switch (load_code) {
 		case FW_MSG_CODE_DRV_LOAD_ENGINE:
 			rc = qed_hw_init_common(p_hwfn, p_hwfn->p_main_ptt,
@@ -821,6 +902,20 @@ int qed_hw_init(struct qed_dev *cdev,
 			return mfw_rc;
 		}
 
+		/* send DCBX attention request command */
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_DCB,
+			   "sending phony dcbx set command to trigger DCBx attention handling\n");
+		mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
+				     DRV_MSG_CODE_SET_DCBX,
+				     1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT,
+				     &load_code, &param);
+		if (mfw_rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to send DCBX attention request\n");
+			return mfw_rc;
+		}
+
 		p_hwfn->hw_init_done = true;
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 82b7727d090b..9afc15fdbb02 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -634,6 +634,14 @@ struct pf_start_ramrod_data {
 	u8				reserved0[4];
 };
 
+/* Data for port update ramrod */
+struct protocol_dcb_data {
+	u8 dcb_enable_flag;
+	u8 dcb_priority;
+	u8 dcb_tc;
+	u8 reserved;
+};
+
 /* tunnel configuration */
 struct pf_update_tunnel_config {
 	u8	update_rx_pf_clss;
@@ -656,8 +664,17 @@ struct pf_update_tunnel_config {
 };
 
 struct pf_update_ramrod_data {
-	u32				reserved[2];
-	u32				reserved_1[6];
+	u8 pf_id;
+	u8 update_eth_dcb_data_flag;
+	u8 update_fcoe_dcb_data_flag;
+	u8 update_iscsi_dcb_data_flag;
+	u8 update_roce_dcb_data_flag;
+	u8 update_mf_vlan_flag;
+	__le16 mf_vlan;
+	struct protocol_dcb_data eth_dcb_data;
+	struct protocol_dcb_data fcoe_dcb_data;
+	struct protocol_dcb_data iscsi_dcb_data;
+	struct protocol_dcb_data roce_dcb_data;
 	struct pf_update_tunnel_config	tunnel_config;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 2be943b91916..1182361798b5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include "qed.h"
+#include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_mcp.h"
@@ -825,6 +826,18 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_VF_DISABLED:
 			qed_mcp_handle_vf_flr(p_hwfn, p_ptt);
 			break;
+		case MFW_DRV_MSG_LLDP_DATA_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_REMOTE_LLDP_MIB);
+			break;
+		case MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_REMOTE_MIB);
+			break;
+		case MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_OPERATIONAL_MIB);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index ab5549f4e5ea..ea4e9ce53e0a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -353,6 +353,19 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
 
+/**
+ * @brief qed_sp_pf_update - PF Function Update Ramrod
+ *
+ * This ramrod updates function-related parameters. Every parameter can be
+ * updated independently, according to configuration flags.
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+
+int qed_sp_pf_update(struct qed_hwfn *p_hwfn);
+
 /**
  * @brief qed_sp_pf_stop - PF Function Stop Ramrod
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 8c555ed1f949..67f6ce3c84c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -15,6 +15,7 @@
 #include "qed.h"
 #include <linux/qed/qed_chain.h>
 #include "qed_cxt.h"
+#include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_int.h"
@@ -384,6 +385,30 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+int qed_sp_pf_update(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_CB;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	qed_dcbx_set_pf_update_params(&p_hwfn->p_dcbx_info->results,
+				      &p_ent->ramrod.pf_update);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 /* Set pf update ramrod command params */
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_tunn_update_params *p_tunn,
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 0fd8f247e65f..4c29439f54bf 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -25,6 +25,15 @@
 #include <linux/qed/common_hsi.h>
 #include <linux/qed/qed_chain.h>
 
+enum dcbx_protocol_type {
+	DCBX_PROTOCOL_ISCSI,
+	DCBX_PROTOCOL_FCOE,
+	DCBX_PROTOCOL_ROCE,
+	DCBX_PROTOCOL_ROCE_V2,
+	DCBX_PROTOCOL_ETH,
+	DCBX_MAX_PROTOCOL_TYPE
+};
+
 enum qed_led_mode {
 	QED_LED_MODE_OFF,
 	QED_LED_MODE_ON,
-- 
cgit v1.2.3