From c8770bcf5cefa8cbfae21c07c4fe3428f5a9d42a Mon Sep 17 00:00:00 2001
From: Miaoqing Pan <miaoqing@codeaurora.org>
Date: Mon, 7 Mar 2016 10:38:18 +0800
Subject: ath9k: Allow platform override BTCoex pin

Add new platform data to allow override BTCoex default pin.

Signed-off-by: Miaoqing Pan <miaoqing@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 include/linux/ath9k_platform.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index 33eb274cd0e6..e66153d60bd5 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -31,6 +31,10 @@ struct ath9k_platform_data {
 	u32 gpio_mask;
 	u32 gpio_val;
 
+	u32 bt_active_pin;
+	u32 bt_priority_pin;
+	u32 wlan_active_pin;
+
 	bool endian_check;
 	bool is_clk_25mhz;
 	bool tx_gain_buffalo;
-- 
cgit v1.2.3


From 08a33805518e7845486f88287e8aace6f8439391 Mon Sep 17 00:00:00 2001
From: Alison Schofield <amsfield22@gmail.com>
Date: Wed, 9 Mar 2016 11:30:12 -0800
Subject: iio: core: implement iio_device_{claim|release}_direct_mode()

It is often the case that the driver wants to be sure a device stays
in direct mode while it is executing a task or series of tasks.  To
accomplish this today, the driver performs this sequence: 1) take the
device state lock, 2) verify it is not in a buffered mode, 3) execute
some tasks, and 4) release that lock.

This patch introduces a pair of helper functions that simplify these
steps and make it more semantically expressive.

iio_device_claim_direct_mode()
        If the device is not in any buffered mode it is guaranteed
        to stay that way until iio_release_direct_mode() is called.

iio_device_release_direct_mode()
        Release the claim. Device is no longer guaranteed to stay
        in direct mode.

Signed-off-by: Alison Schofield <amsfield22@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/industrialio-core.c | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/iio/iio.h         |  2 ++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 70cb7eb0a75c..2e768bc99f05 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
 #include <linux/debugfs.h>
+#include <linux/mutex.h>
 #include <linux/iio/iio.h>
 #include "iio_core.h"
 #include "iio_core_trigger.h"
@@ -1375,6 +1376,44 @@ void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev)
 }
 EXPORT_SYMBOL_GPL(devm_iio_device_unregister);
 
+/**
+ * iio_device_claim_direct_mode - Keep device in direct mode
+ * @indio_dev:	the iio_dev associated with the device
+ *
+ * If the device is in direct mode it is guaranteed to stay
+ * that way until iio_device_release_direct_mode() is called.
+ *
+ * Use with iio_device_release_direct_mode()
+ *
+ * Returns: 0 on success, -EBUSY on failure
+ */
+int iio_device_claim_direct_mode(struct iio_dev *indio_dev)
+{
+	mutex_lock(&indio_dev->mlock);
+
+	if (iio_buffer_enabled(indio_dev)) {
+		mutex_unlock(&indio_dev->mlock);
+		return -EBUSY;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iio_device_claim_direct_mode);
+
+/**
+ * iio_device_release_direct_mode - releases claim on direct mode
+ * @indio_dev:	the iio_dev associated with the device
+ *
+ * Release the claim. Device is no longer guaranteed to stay
+ * in direct mode.
+ *
+ * Use with iio_device_claim_direct_mode()
+ */
+void iio_device_release_direct_mode(struct iio_dev *indio_dev)
+{
+	mutex_unlock(&indio_dev->mlock);
+}
+EXPORT_SYMBOL_GPL(iio_device_release_direct_mode);
+
 subsys_initcall(iio_init);
 module_exit(iio_exit);
 
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index b2b16772c651..0b2773ada0ba 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -527,6 +527,8 @@ void iio_device_unregister(struct iio_dev *indio_dev);
 int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev);
 void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev);
 int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp);
+int iio_device_claim_direct_mode(struct iio_dev *indio_dev);
+void iio_device_release_direct_mode(struct iio_dev *indio_dev);
 
 extern struct bus_type iio_bus_type;
 
-- 
cgit v1.2.3


From 81f4c50607b423a59f8a1b03e1e8fc409a1dcd22 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 14:22:01 -0400
Subject: constify security_path_truncate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 4 ++--
 security/apparmor/lsm.c   | 2 +-
 security/security.c       | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index cdee11cbcdf1..77c3bfdacf16 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1366,7 +1366,7 @@ union security_list_options {
 	int (*path_rmdir)(struct path *dir, struct dentry *dentry);
 	int (*path_mknod)(struct path *dir, struct dentry *dentry,
 				umode_t mode, unsigned int dev);
-	int (*path_truncate)(struct path *path);
+	int (*path_truncate)(const struct path *path);
 	int (*path_symlink)(struct path *dir, struct dentry *dentry,
 				const char *old_name);
 	int (*path_link)(struct dentry *old_dentry, struct path *new_dir,
diff --git a/include/linux/security.h b/include/linux/security.h
index 157f0cb1e4d2..be37ccab2286 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1447,7 +1447,7 @@ int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode);
 int security_path_rmdir(struct path *dir, struct dentry *dentry);
 int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode,
 			unsigned int dev);
-int security_path_truncate(struct path *path);
+int security_path_truncate(const struct path *path);
 int security_path_symlink(struct path *dir, struct dentry *dentry,
 			  const char *old_name);
 int security_path_link(struct dentry *old_dentry, struct path *new_dir,
@@ -1481,7 +1481,7 @@ static inline int security_path_mknod(struct path *dir, struct dentry *dentry,
 	return 0;
 }
 
-static inline int security_path_truncate(struct path *path)
+static inline int security_path_truncate(const struct path *path)
 {
 	return 0;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 9713037e5257..83e9c3c2cfc8 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -269,7 +269,7 @@ static int apparmor_path_mknod(struct path *dir, struct dentry *dentry,
 	return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode);
 }
 
-static int apparmor_path_truncate(struct path *path)
+static int apparmor_path_truncate(const struct path *path)
 {
 	struct path_cond cond = { d_backing_inode(path->dentry)->i_uid,
 				  d_backing_inode(path->dentry)->i_mode
diff --git a/security/security.c b/security/security.c
index 3644b0344d29..23ffb6cc3974 100644
--- a/security/security.c
+++ b/security/security.c
@@ -478,7 +478,7 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 }
 EXPORT_SYMBOL(security_path_rename);
 
-int security_path_truncate(struct path *path)
+int security_path_truncate(const struct path *path)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
 		return 0;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index cbf3df422c87..8573eee2b58e 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -150,7 +150,7 @@ static int tomoyo_inode_getattr(const struct path *path)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_truncate(struct path *path)
+static int tomoyo_path_truncate(const struct path *path)
 {
 	return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL);
 }
-- 
cgit v1.2.3


From 7df818b2370a9aab5fc58a85b70b8af3d835affa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 14:24:09 -0400
Subject: constify vfs_truncate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c          | 2 +-
 include/linux/fs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 17cb6b1dab75..2f49fce5c952 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -65,7 +65,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	return ret;
 }
 
-long vfs_truncate(struct path *path, loff_t length)
+long vfs_truncate(const struct path *path, loff_t length)
 {
 	struct inode *inode;
 	long error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..09a68517e952 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2253,7 +2253,7 @@ struct filename {
 	const char		iname[];
 };
 
-extern long vfs_truncate(struct path *, loff_t);
+extern long vfs_truncate(const struct path *, loff_t);
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
-- 
cgit v1.2.3


From 7fd25dac9ad3970bede16f2834daf9f9d779d1b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 14:44:41 -0400
Subject: constify chown_common/security_path_chown

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c                 | 2 +-
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 4 ++--
 security/apparmor/lsm.c   | 2 +-
 security/security.c       | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 2f49fce5c952..651bf74745a2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -564,7 +564,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 	return sys_fchmodat(AT_FDCWD, filename, mode);
 }
 
-static int chown_common(struct path *path, uid_t user, gid_t group)
+static int chown_common(const struct path *path, uid_t user, gid_t group)
 {
 	struct inode *inode = path->dentry->d_inode;
 	struct inode *delegated_inode = NULL;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 77c3bfdacf16..84f76cbc6d06 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1375,7 +1375,7 @@ union security_list_options {
 				struct path *new_dir,
 				struct dentry *new_dentry);
 	int (*path_chmod)(struct path *path, umode_t mode);
-	int (*path_chown)(struct path *path, kuid_t uid, kgid_t gid);
+	int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid);
 	int (*path_chroot)(struct path *path);
 #endif
 
diff --git a/include/linux/security.h b/include/linux/security.h
index be37ccab2286..f83ca920ed46 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1456,7 +1456,7 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 			 struct path *new_dir, struct dentry *new_dentry,
 			 unsigned int flags);
 int security_path_chmod(struct path *path, umode_t mode);
-int security_path_chown(struct path *path, kuid_t uid, kgid_t gid);
+int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid);
 int security_path_chroot(struct path *path);
 #else	/* CONFIG_SECURITY_PATH */
 static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
@@ -1513,7 +1513,7 @@ static inline int security_path_chmod(struct path *path, umode_t mode)
 	return 0;
 }
 
-static inline int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
+static inline int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
 	return 0;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 21dae6070bb9..3adbff987b77 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -342,7 +342,7 @@ static int apparmor_path_chmod(struct path *path, umode_t mode)
 	return common_perm_mnt_dentry(OP_CHMOD, path->mnt, path->dentry, AA_MAY_CHMOD);
 }
 
-static int apparmor_path_chown(struct path *path, kuid_t uid, kgid_t gid)
+static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
 	struct path_cond cond =  { d_backing_inode(path->dentry)->i_uid,
 				   d_backing_inode(path->dentry)->i_mode
diff --git a/security/security.c b/security/security.c
index 23ffb6cc3974..4a3e7e99abbb 100644
--- a/security/security.c
+++ b/security/security.c
@@ -492,7 +492,7 @@ int security_path_chmod(struct path *path, umode_t mode)
 	return call_int_hook(path_chmod, 0, path, mode);
 }
 
-int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
+int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
 		return 0;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 8573eee2b58e..f0989ec978e1 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -366,7 +366,7 @@ static int tomoyo_path_chmod(struct path *path, umode_t mode)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_chown(struct path *path, kuid_t uid, kgid_t gid)
+static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
 	int error = 0;
 	if (uid_valid(uid))
-- 
cgit v1.2.3


From 8a04c43b8741ebb40508d160cf87ca74b70941af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 14:52:53 -0400
Subject: constify security_sb_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 4 ++--
 security/security.c       | 2 +-
 security/selinux/hooks.c  | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 84f76cbc6d06..47117751f4eb 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1343,7 +1343,7 @@ union security_list_options {
 	int (*sb_kern_mount)(struct super_block *sb, int flags, void *data);
 	int (*sb_show_options)(struct seq_file *m, struct super_block *sb);
 	int (*sb_statfs)(struct dentry *dentry);
-	int (*sb_mount)(const char *dev_name, struct path *path,
+	int (*sb_mount)(const char *dev_name, const struct path *path,
 			const char *type, unsigned long flags, void *data);
 	int (*sb_umount)(struct vfsmount *mnt, int flags);
 	int (*sb_pivotroot)(struct path *old_path, struct path *new_path);
diff --git a/include/linux/security.h b/include/linux/security.h
index f83ca920ed46..415a357efe4c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -222,7 +222,7 @@ int security_sb_remount(struct super_block *sb, void *data);
 int security_sb_kern_mount(struct super_block *sb, int flags, void *data);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
-int security_sb_mount(const char *dev_name, struct path *path,
+int security_sb_mount(const char *dev_name, const struct path *path,
 		      const char *type, unsigned long flags, void *data);
 int security_sb_umount(struct vfsmount *mnt, int flags);
 int security_sb_pivotroot(struct path *old_path, struct path *new_path);
@@ -530,7 +530,7 @@ static inline int security_sb_statfs(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_sb_mount(const char *dev_name, struct path *path,
+static inline int security_sb_mount(const char *dev_name, const struct path *path,
 				    const char *type, unsigned long flags,
 				    void *data)
 {
diff --git a/security/security.c b/security/security.c
index 4a3e7e99abbb..fc567656b16f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -302,7 +302,7 @@ int security_sb_statfs(struct dentry *dentry)
 	return call_int_hook(sb_statfs, 0, dentry);
 }
 
-int security_sb_mount(const char *dev_name, struct path *path,
+int security_sb_mount(const char *dev_name, const struct path *path,
                        const char *type, unsigned long flags, void *data)
 {
 	return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 912deee3f01e..e3aeacc13545 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2760,7 +2760,7 @@ static int selinux_sb_statfs(struct dentry *dentry)
 }
 
 static int selinux_mount(const char *dev_name,
-			 struct path *path,
+			 const struct path *path,
 			 const char *type,
 			 unsigned long flags,
 			 void *data)
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index f0989ec978e1..c1177f885247 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -401,7 +401,7 @@ static int tomoyo_path_chroot(struct path *path)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_sb_mount(const char *dev_name, struct path *path,
+static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
 			   const char *type, unsigned long flags, void *data)
 {
 	return tomoyo_mount_permission(dev_name, path, type, flags, data);
-- 
cgit v1.2.3


From be01f9f28e66fa846f02196eb047c6bc445642db Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 14:56:23 -0400
Subject: constify chmod_common/security_path_chmod

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c                 | 2 +-
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 4 ++--
 security/apparmor/lsm.c   | 2 +-
 security/security.c       | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 651bf74745a2..cfdf71a6704e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -499,7 +499,7 @@ out:
 	return error;
 }
 
-static int chmod_common(struct path *path, umode_t mode)
+static int chmod_common(const struct path *path, umode_t mode)
 {
 	struct inode *inode = path->dentry->d_inode;
 	struct inode *delegated_inode = NULL;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 47117751f4eb..294fdfe902bf 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1374,7 +1374,7 @@ union security_list_options {
 	int (*path_rename)(struct path *old_dir, struct dentry *old_dentry,
 				struct path *new_dir,
 				struct dentry *new_dentry);
-	int (*path_chmod)(struct path *path, umode_t mode);
+	int (*path_chmod)(const struct path *path, umode_t mode);
 	int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid);
 	int (*path_chroot)(struct path *path);
 #endif
diff --git a/include/linux/security.h b/include/linux/security.h
index 415a357efe4c..d6593ee2d0a9 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1455,7 +1455,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 			 struct path *new_dir, struct dentry *new_dentry,
 			 unsigned int flags);
-int security_path_chmod(struct path *path, umode_t mode);
+int security_path_chmod(const struct path *path, umode_t mode);
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid);
 int security_path_chroot(struct path *path);
 #else	/* CONFIG_SECURITY_PATH */
@@ -1508,7 +1508,7 @@ static inline int security_path_rename(struct path *old_dir,
 	return 0;
 }
 
-static inline int security_path_chmod(struct path *path, umode_t mode)
+static inline int security_path_chmod(const struct path *path, umode_t mode)
 {
 	return 0;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 3adbff987b77..8d19615dcb73 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -334,7 +334,7 @@ static int apparmor_path_rename(struct path *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-static int apparmor_path_chmod(struct path *path, umode_t mode)
+static int apparmor_path_chmod(const struct path *path, umode_t mode)
 {
 	if (!mediated_filesystem(path->dentry))
 		return 0;
diff --git a/security/security.c b/security/security.c
index fc567656b16f..b333429fe718 100644
--- a/security/security.c
+++ b/security/security.c
@@ -485,7 +485,7 @@ int security_path_truncate(const struct path *path)
 	return call_int_hook(path_truncate, 0, path);
 }
 
-int security_path_chmod(struct path *path, umode_t mode)
+int security_path_chmod(const struct path *path, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
 		return 0;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index c1177f885247..e48d0a4e4128 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -351,7 +351,7 @@ static int tomoyo_file_ioctl(struct file *file, unsigned int cmd,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_chmod(struct path *path, umode_t mode)
+static int tomoyo_path_chmod(const struct path *path, umode_t mode)
 {
 	return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path,
 				       mode & S_IALLUGO);
-- 
cgit v1.2.3


From 989f74e0500a1e136d369bb619adc22786ea5e68 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 15:13:39 -0400
Subject: constify security_path_{unlink,rmdir}

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 4 ++--
 include/linux/security.h  | 8 ++++----
 security/apparmor/lsm.c   | 4 ++--
 security/security.c       | 4 ++--
 security/tomoyo/tomoyo.c  | 4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 294fdfe902bf..322912cc2da1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1360,10 +1360,10 @@ union security_list_options {
 
 
 #ifdef CONFIG_SECURITY_PATH
-	int (*path_unlink)(struct path *dir, struct dentry *dentry);
+	int (*path_unlink)(const struct path *dir, struct dentry *dentry);
 	int (*path_mkdir)(struct path *dir, struct dentry *dentry,
 				umode_t mode);
-	int (*path_rmdir)(struct path *dir, struct dentry *dentry);
+	int (*path_rmdir)(const struct path *dir, struct dentry *dentry);
 	int (*path_mknod)(struct path *dir, struct dentry *dentry,
 				umode_t mode, unsigned int dev);
 	int (*path_truncate)(const struct path *path);
diff --git a/include/linux/security.h b/include/linux/security.h
index d6593ee2d0a9..e292d8cb21d7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1442,9 +1442,9 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
 #ifdef CONFIG_SECURITY_PATH
-int security_path_unlink(struct path *dir, struct dentry *dentry);
+int security_path_unlink(const struct path *dir, struct dentry *dentry);
 int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode);
-int security_path_rmdir(struct path *dir, struct dentry *dentry);
+int security_path_rmdir(const struct path *dir, struct dentry *dentry);
 int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode,
 			unsigned int dev);
 int security_path_truncate(const struct path *path);
@@ -1459,7 +1459,7 @@ int security_path_chmod(const struct path *path, umode_t mode);
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid);
 int security_path_chroot(struct path *path);
 #else	/* CONFIG_SECURITY_PATH */
-static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
+static inline int security_path_unlink(const struct path *dir, struct dentry *dentry)
 {
 	return 0;
 }
@@ -1470,7 +1470,7 @@ static inline int security_path_mkdir(struct path *dir, struct dentry *dentry,
 	return 0;
 }
 
-static inline int security_path_rmdir(struct path *dir, struct dentry *dentry)
+static inline int security_path_rmdir(const struct path *dir, struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 4d2638f4676d..b760fe026b82 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -245,7 +245,7 @@ static int common_perm_create(int op, const struct path *dir,
 	return common_perm_dir_dentry(op, dir, dentry, mask, &cond);
 }
 
-static int apparmor_path_unlink(struct path *dir, struct dentry *dentry)
+static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry)
 {
 	return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE);
 }
@@ -257,7 +257,7 @@ static int apparmor_path_mkdir(struct path *dir, struct dentry *dentry,
 				  S_IFDIR);
 }
 
-static int apparmor_path_rmdir(struct path *dir, struct dentry *dentry)
+static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry)
 {
 	return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE);
 }
diff --git a/security/security.c b/security/security.c
index b333429fe718..20f2070b3ace 100644
--- a/security/security.c
+++ b/security/security.c
@@ -427,14 +427,14 @@ int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode)
 }
 EXPORT_SYMBOL(security_path_mkdir);
 
-int security_path_rmdir(struct path *dir, struct dentry *dentry)
+int security_path_rmdir(const struct path *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
 		return 0;
 	return call_int_hook(path_rmdir, 0, dir, dentry);
 }
 
-int security_path_unlink(struct path *dir, struct dentry *dentry)
+int security_path_unlink(const struct path *dir, struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
 		return 0;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index e48d0a4e4128..be5b1ae02f02 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -163,7 +163,7 @@ static int tomoyo_path_truncate(const struct path *path)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_unlink(struct path *parent, struct dentry *dentry)
+static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
 {
 	struct path path = { parent->mnt, dentry };
 	return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
@@ -194,7 +194,7 @@ static int tomoyo_path_mkdir(struct path *parent, struct dentry *dentry,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_rmdir(struct path *parent, struct dentry *dentry)
+static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
 {
 	struct path path = { parent->mnt, dentry };
 	return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
-- 
cgit v1.2.3


From d360775217070ff0f4291e47d3f568f0fe0b7374 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 15:21:09 -0400
Subject: constify security_path_{mkdir,mknod,symlink}

... as well as unix_mknod() and may_o_create()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c                |  2 +-
 include/linux/lsm_hooks.h |  6 +++---
 include/linux/security.h  | 12 ++++++------
 net/unix/af_unix.c        |  2 +-
 security/apparmor/lsm.c   |  6 +++---
 security/security.c       |  6 +++---
 security/tomoyo/tomoyo.c  |  6 +++---
 7 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..8c97544d6883 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2783,7 +2783,7 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
+static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
 	int error = security_path_mknod(dir, dentry, mode, 0);
 	if (error)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 322912cc2da1..919fb4f98e4f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1361,13 +1361,13 @@ union security_list_options {
 
 #ifdef CONFIG_SECURITY_PATH
 	int (*path_unlink)(const struct path *dir, struct dentry *dentry);
-	int (*path_mkdir)(struct path *dir, struct dentry *dentry,
+	int (*path_mkdir)(const struct path *dir, struct dentry *dentry,
 				umode_t mode);
 	int (*path_rmdir)(const struct path *dir, struct dentry *dentry);
-	int (*path_mknod)(struct path *dir, struct dentry *dentry,
+	int (*path_mknod)(const struct path *dir, struct dentry *dentry,
 				umode_t mode, unsigned int dev);
 	int (*path_truncate)(const struct path *path);
-	int (*path_symlink)(struct path *dir, struct dentry *dentry,
+	int (*path_symlink)(const struct path *dir, struct dentry *dentry,
 				const char *old_name);
 	int (*path_link)(struct dentry *old_dentry, struct path *new_dir,
 				struct dentry *new_dentry);
diff --git a/include/linux/security.h b/include/linux/security.h
index e292d8cb21d7..ccb8c2a170e3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1443,12 +1443,12 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 
 #ifdef CONFIG_SECURITY_PATH
 int security_path_unlink(const struct path *dir, struct dentry *dentry);
-int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode);
+int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode);
 int security_path_rmdir(const struct path *dir, struct dentry *dentry);
-int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode,
+int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
 			unsigned int dev);
 int security_path_truncate(const struct path *path);
-int security_path_symlink(struct path *dir, struct dentry *dentry,
+int security_path_symlink(const struct path *dir, struct dentry *dentry,
 			  const char *old_name);
 int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 		       struct dentry *new_dentry);
@@ -1464,7 +1464,7 @@ static inline int security_path_unlink(const struct path *dir, struct dentry *de
 	return 0;
 }
 
-static inline int security_path_mkdir(struct path *dir, struct dentry *dentry,
+static inline int security_path_mkdir(const struct path *dir, struct dentry *dentry,
 				      umode_t mode)
 {
 	return 0;
@@ -1475,7 +1475,7 @@ static inline int security_path_rmdir(const struct path *dir, struct dentry *den
 	return 0;
 }
 
-static inline int security_path_mknod(struct path *dir, struct dentry *dentry,
+static inline int security_path_mknod(const struct path *dir, struct dentry *dentry,
 				      umode_t mode, unsigned int dev)
 {
 	return 0;
@@ -1486,7 +1486,7 @@ static inline int security_path_truncate(const struct path *path)
 	return 0;
 }
 
-static inline int security_path_symlink(struct path *dir, struct dentry *dentry,
+static inline int security_path_symlink(const struct path *dir, struct dentry *dentry,
 					const char *old_name)
 {
 	return 0;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8269da73e9e5..80aa6a3e6817 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -953,7 +953,7 @@ fail:
 	return NULL;
 }
 
-static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
+static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode,
 		      struct path *res)
 {
 	int err;
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b760fe026b82..7ae540565097 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -250,7 +250,7 @@ static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry)
 	return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE);
 }
 
-static int apparmor_path_mkdir(struct path *dir, struct dentry *dentry,
+static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry,
 			       umode_t mode)
 {
 	return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE,
@@ -262,7 +262,7 @@ static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry)
 	return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE);
 }
 
-static int apparmor_path_mknod(struct path *dir, struct dentry *dentry,
+static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry,
 			       umode_t mode, unsigned int dev)
 {
 	return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode);
@@ -273,7 +273,7 @@ static int apparmor_path_truncate(const struct path *path)
 	return common_perm_path(OP_TRUNC, path, MAY_WRITE | AA_MAY_META_WRITE);
 }
 
-static int apparmor_path_symlink(struct path *dir, struct dentry *dentry,
+static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry,
 				 const char *old_name)
 {
 	return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE,
diff --git a/security/security.c b/security/security.c
index 20f2070b3ace..7f62e2ed6a28 100644
--- a/security/security.c
+++ b/security/security.c
@@ -410,7 +410,7 @@ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
 EXPORT_SYMBOL(security_old_inode_init_security);
 
 #ifdef CONFIG_SECURITY_PATH
-int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode,
+int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
 			unsigned int dev)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
@@ -419,7 +419,7 @@ int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode,
 }
 EXPORT_SYMBOL(security_path_mknod);
 
-int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode)
+int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
 		return 0;
@@ -442,7 +442,7 @@ int security_path_unlink(const struct path *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL(security_path_unlink);
 
-int security_path_symlink(struct path *dir, struct dentry *dentry,
+int security_path_symlink(const struct path *dir, struct dentry *dentry,
 			  const char *old_name)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index be5b1ae02f02..d44752562b9b 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -178,7 +178,7 @@ static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_mkdir(struct path *parent, struct dentry *dentry,
+static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
 			     umode_t mode)
 {
 	struct path path = { parent->mnt, dentry };
@@ -209,7 +209,7 @@ static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_symlink(struct path *parent, struct dentry *dentry,
+static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
 			       const char *old_name)
 {
 	struct path path = { parent->mnt, dentry };
@@ -226,7 +226,7 @@ static int tomoyo_path_symlink(struct path *parent, struct dentry *dentry,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_mknod(struct path *parent, struct dentry *dentry,
+static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry,
 			     umode_t mode, unsigned int dev)
 {
 	struct path path = { parent->mnt, dentry };
-- 
cgit v1.2.3


From 3ccee46ab487d5b87d0621824efe2500b2857c58 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 15:27:45 -0400
Subject: constify security_path_{link,rename}

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h |  6 +++---
 include/linux/security.h  | 12 ++++++------
 security/apparmor/lsm.c   |  6 +++---
 security/security.c       |  6 +++---
 security/tomoyo/tomoyo.c  |  6 +++---
 5 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 919fb4f98e4f..52c2ac5f4855 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1369,10 +1369,10 @@ union security_list_options {
 	int (*path_truncate)(const struct path *path);
 	int (*path_symlink)(const struct path *dir, struct dentry *dentry,
 				const char *old_name);
-	int (*path_link)(struct dentry *old_dentry, struct path *new_dir,
+	int (*path_link)(struct dentry *old_dentry, const struct path *new_dir,
 				struct dentry *new_dentry);
-	int (*path_rename)(struct path *old_dir, struct dentry *old_dentry,
-				struct path *new_dir,
+	int (*path_rename)(const struct path *old_dir, struct dentry *old_dentry,
+				const struct path *new_dir,
 				struct dentry *new_dentry);
 	int (*path_chmod)(const struct path *path, umode_t mode);
 	int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid);
diff --git a/include/linux/security.h b/include/linux/security.h
index ccb8c2a170e3..82854115e36b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1450,10 +1450,10 @@ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t m
 int security_path_truncate(const struct path *path);
 int security_path_symlink(const struct path *dir, struct dentry *dentry,
 			  const char *old_name);
-int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 		       struct dentry *new_dentry);
-int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
-			 struct path *new_dir, struct dentry *new_dentry,
+int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
+			 const struct path *new_dir, struct dentry *new_dentry,
 			 unsigned int flags);
 int security_path_chmod(const struct path *path, umode_t mode);
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid);
@@ -1493,15 +1493,15 @@ static inline int security_path_symlink(const struct path *dir, struct dentry *d
 }
 
 static inline int security_path_link(struct dentry *old_dentry,
-				     struct path *new_dir,
+				     const struct path *new_dir,
 				     struct dentry *new_dentry)
 {
 	return 0;
 }
 
-static inline int security_path_rename(struct path *old_dir,
+static inline int security_path_rename(const struct path *old_dir,
 				       struct dentry *old_dentry,
-				       struct path *new_dir,
+				       const struct path *new_dir,
 				       struct dentry *new_dentry,
 				       unsigned int flags)
 {
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index eadaa58bd6fd..2660fbcf94d1 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -280,7 +280,7 @@ static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry,
 				  S_IFLNK);
 }
 
-static int apparmor_path_link(struct dentry *old_dentry, struct path *new_dir,
+static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir,
 			      struct dentry *new_dentry)
 {
 	struct aa_profile *profile;
@@ -295,8 +295,8 @@ static int apparmor_path_link(struct dentry *old_dentry, struct path *new_dir,
 	return error;
 }
 
-static int apparmor_path_rename(struct path *old_dir, struct dentry *old_dentry,
-				struct path *new_dir, struct dentry *new_dentry)
+static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry,
+				const struct path *new_dir, struct dentry *new_dentry)
 {
 	struct aa_profile *profile;
 	int error = 0;
diff --git a/security/security.c b/security/security.c
index 7f62e2ed6a28..33b85a960128 100644
--- a/security/security.c
+++ b/security/security.c
@@ -450,7 +450,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry,
 	return call_int_hook(path_symlink, 0, dir, dentry, old_name);
 }
 
-int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 		       struct dentry *new_dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
@@ -458,8 +458,8 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 	return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry);
 }
 
-int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
-			 struct path *new_dir, struct dentry *new_dentry,
+int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
+			 const struct path *new_dir, struct dentry *new_dentry,
 			 unsigned int flags)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) ||
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index d44752562b9b..6a858f2f4063 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -265,7 +265,7 @@ static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_link(struct dentry *old_dentry, struct path *new_dir,
+static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir,
 			    struct dentry *new_dentry)
 {
 	struct path path1 = { new_dir->mnt, old_dentry };
@@ -283,9 +283,9 @@ static int tomoyo_path_link(struct dentry *old_dentry, struct path *new_dir,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_rename(struct path *old_parent,
+static int tomoyo_path_rename(const struct path *old_parent,
 			      struct dentry *old_dentry,
-			      struct path *new_parent,
+			      const struct path *new_parent,
 			      struct dentry *new_dentry)
 {
 	struct path path1 = { old_parent->mnt, old_dentry };
-- 
cgit v1.2.3


From 77b286c0d26a5399912f5affd90ed73e2d8b42a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 15:28:43 -0400
Subject: constify security_path_chroot()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 4 ++--
 security/security.c       | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 52c2ac5f4855..e2baca48e596 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1376,7 +1376,7 @@ union security_list_options {
 				struct dentry *new_dentry);
 	int (*path_chmod)(const struct path *path, umode_t mode);
 	int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid);
-	int (*path_chroot)(struct path *path);
+	int (*path_chroot)(const struct path *path);
 #endif
 
 	int (*inode_alloc_security)(struct inode *inode);
diff --git a/include/linux/security.h b/include/linux/security.h
index 82854115e36b..cb53cffbfae4 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1457,7 +1457,7 @@ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
 			 unsigned int flags);
 int security_path_chmod(const struct path *path, umode_t mode);
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid);
-int security_path_chroot(struct path *path);
+int security_path_chroot(const struct path *path);
 #else	/* CONFIG_SECURITY_PATH */
 static inline int security_path_unlink(const struct path *dir, struct dentry *dentry)
 {
@@ -1518,7 +1518,7 @@ static inline int security_path_chown(const struct path *path, kuid_t uid, kgid_
 	return 0;
 }
 
-static inline int security_path_chroot(struct path *path)
+static inline int security_path_chroot(const struct path *path)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 33b85a960128..cf6f31df524a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -499,7 +499,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
 
-int security_path_chroot(struct path *path)
+int security_path_chroot(const struct path *path)
 {
 	return call_int_hook(path_chroot, 0, path);
 }
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 6a858f2f4063..c7764bb747aa 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -385,7 +385,7 @@ static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_path_chroot(struct path *path)
+static int tomoyo_path_chroot(const struct path *path)
 {
 	return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL);
 }
-- 
cgit v1.2.3


From 3b73b68c05db0b3c9b282c6e8e6eb71acc589a02 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 15:31:19 -0400
Subject: constify security_sb_pivotroot()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 2 +-
 include/linux/security.h  | 6 +++---
 security/security.c       | 2 +-
 security/tomoyo/tomoyo.c  | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index e2baca48e596..41c0aa6d39ea 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1346,7 +1346,7 @@ union security_list_options {
 	int (*sb_mount)(const char *dev_name, const struct path *path,
 			const char *type, unsigned long flags, void *data);
 	int (*sb_umount)(struct vfsmount *mnt, int flags);
-	int (*sb_pivotroot)(struct path *old_path, struct path *new_path);
+	int (*sb_pivotroot)(const struct path *old_path, const struct path *new_path);
 	int (*sb_set_mnt_opts)(struct super_block *sb,
 				struct security_mnt_opts *opts,
 				unsigned long kern_flags,
diff --git a/include/linux/security.h b/include/linux/security.h
index cb53cffbfae4..fcfa211c694f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -225,7 +225,7 @@ int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(const char *dev_name, const struct path *path,
 		      const char *type, unsigned long flags, void *data);
 int security_sb_umount(struct vfsmount *mnt, int flags);
-int security_sb_pivotroot(struct path *old_path, struct path *new_path);
+int security_sb_pivotroot(const struct path *old_path, const struct path *new_path);
 int security_sb_set_mnt_opts(struct super_block *sb,
 				struct security_mnt_opts *opts,
 				unsigned long kern_flags,
@@ -542,8 +542,8 @@ static inline int security_sb_umount(struct vfsmount *mnt, int flags)
 	return 0;
 }
 
-static inline int security_sb_pivotroot(struct path *old_path,
-					struct path *new_path)
+static inline int security_sb_pivotroot(const struct path *old_path,
+					const struct path *new_path)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index cf6f31df524a..f7af0aaa173e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -313,7 +313,7 @@ int security_sb_umount(struct vfsmount *mnt, int flags)
 	return call_int_hook(sb_umount, 0, mnt, flags);
 }
 
-int security_sb_pivotroot(struct path *old_path, struct path *new_path)
+int security_sb_pivotroot(const struct path *old_path, const struct path *new_path)
 {
 	return call_int_hook(sb_pivotroot, 0, old_path, new_path);
 }
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index c7764bb747aa..75c998700190 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -429,7 +429,7 @@ static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_sb_pivotroot(struct path *old_path, struct path *new_path)
+static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path)
 {
 	return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path);
 }
-- 
cgit v1.2.3


From 2da62906b1e298695e1bb725927041cd59942c98 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 14 Mar 2015 21:13:46 -0400
Subject: [net] drop 'size' argument of sock_recvmsg()

all callers have it equal to msg_data_left(msg).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/target/iscsi/iscsi_target_util.c |  5 ++---
 include/linux/net.h                      |  3 +--
 net/socket.c                             | 23 ++++++++++-------------
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 428b0d9e3dba..57720385a751 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1283,9 +1283,8 @@ static int iscsit_do_rx_data(
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC,
 		      count->iov, count->iov_count, data);
 
-	while (total_rx < data) {
-		rx_loop = sock_recvmsg(conn->sock, &msg,
-				      (data - total_rx), MSG_WAITALL);
+	while (msg_data_left(&msg)) {
+		rx_loop = sock_recvmsg(conn->sock, &msg, MSG_WAITALL);
 		if (rx_loop <= 0) {
 			pr_debug("rx_loop: %d total_rx: %d\n",
 				rx_loop, total_rx);
diff --git a/include/linux/net.h b/include/linux/net.h
index 49175e4ced11..72c1e0622ce2 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -218,8 +218,7 @@ int sock_create_lite(int family, int type, int proto, struct socket **res);
 struct socket *sock_alloc(void);
 void sock_release(struct socket *sock);
 int sock_sendmsg(struct socket *sock, struct msghdr *msg);
-int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-		 int flags);
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);
 struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname);
 struct socket *sockfd_lookup(int fd, int *err);
 struct socket *sock_from_file(struct file *file, int *err);
diff --git a/net/socket.c b/net/socket.c
index 5f77a8e93830..956426e347af 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -709,17 +709,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
-				     size_t size, int flags)
+				     int flags)
 {
-	return sock->ops->recvmsg(sock, msg, size, flags);
+	return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
 }
 
-int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-		 int flags)
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
 {
-	int err = security_socket_recvmsg(sock, msg, size, flags);
+	int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
 
-	return err ?: sock_recvmsg_nosec(sock, msg, size, flags);
+	return err ?: sock_recvmsg_nosec(sock, msg, flags);
 }
 EXPORT_SYMBOL(sock_recvmsg);
 
@@ -746,7 +745,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
 	set_fs(KERNEL_DS);
-	result = sock_recvmsg(sock, msg, size, flags);
+	result = sock_recvmsg(sock, msg, flags);
 	set_fs(oldfs);
 	return result;
 }
@@ -796,7 +795,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
 		return 0;
 
-	res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags);
+	res = sock_recvmsg(sock, &msg, msg.msg_flags);
 	*to = msg.msg_iter;
 	return res;
 }
@@ -1696,7 +1695,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 	msg.msg_iocb = NULL;
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
+	err = sock_recvmsg(sock, &msg, flags);
 
 	if (err >= 0 && addr != NULL) {
 		err2 = move_addr_to_user(&address,
@@ -2073,7 +2072,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	unsigned long cmsg_ptr;
-	int total_len, len;
+	int len;
 	ssize_t err;
 
 	/* kernel mode address */
@@ -2091,7 +2090,6 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 		err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
 	if (err < 0)
 		return err;
-	total_len = iov_iter_count(&msg_sys->msg_iter);
 
 	cmsg_ptr = (unsigned long)msg_sys->msg_control;
 	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
@@ -2101,8 +2099,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
-							  total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
-- 
cgit v1.2.3


From 49db08c358873af11ba3c25401de88156fa5d365 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 19 Feb 2016 15:36:07 +0100
Subject: chrdev: emit a warning when we go below dynamic major range

Currently a dynamically allocated character device major is taken
from 254 and downward. This mechanism is used for RTC, IIO and a
few other subsystems.

The kernel currently has no check prevening these dynamic
allocations from eating into the assigned numbers at 233 and
downward.

In a recent test it was reported that so many dynamic device
majors were used on a test server, that the major number for
infiniband (231) was stolen. This occurred when allocating a new
major number for GPIO chips. The error messages from the kernel
were not helpful. (See: https://lkml.org/lkml/2016/2/14/124)

This patch adds a defined lower limit of the dynamic major
allocation region will henceforth emit a warning if we start to
eat into the assigned numbers. It does not do any semantic
changes and will not change the kernels behaviour: numbers will
still continue to be stolen, but we will know from dmesg what
is going on.

This also updates the Documentation/devices.txt to clearly
reflect that we are using this range of major numbers for dynamic
allocation.

Reported-by: Ying Huang <ying.huang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/devices.txt | 6 +++---
 fs/char_dev.c             | 4 ++++
 include/linux/fs.h        | 2 ++
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 87b4c5e82d39..0a3588a9798d 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -3099,9 +3099,9 @@ Your cooperation is appreciated.
 		129 = /dev/ipath_sma    Device used by Subnet Management Agent
 		130 = /dev/ipath_diag   Device used by diagnostics programs
 
-234-239		UNASSIGNED
-
-240-254 char	LOCAL/EXPERIMENTAL USE
+234-254	char	RESERVED FOR DYNAMIC ASSIGNMENT
+		Character devices that request a dynamic allocation of major number will
+		take numbers starting from 254 and downward.
 
 240-254 block	LOCAL/EXPERIMENTAL USE
 		Allocated for local/experimental use.  For devices not
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 24b142569ca9..687471dc04a0 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 				break;
 		}
 
+		if (i < CHRDEV_MAJOR_DYN_END)
+			pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range",
+				name, i);
+
 		if (i == 0) {
 			ret = -EBUSY;
 			goto out;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..60082be96de8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2385,6 +2385,8 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
 
 /* fs/char_dev.c */
 #define CHRDEV_MAJOR_HASH_SIZE	255
+/* Marks the bottom of the first segment of free char majors */
+#define CHRDEV_MAJOR_DYN_END 234
 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
 extern int register_chrdev_region(dev_t, unsigned, const char *);
 extern int __register_chrdev(unsigned int major, unsigned int baseminor,
-- 
cgit v1.2.3


From b3c1be1b789cca6d3e39c950dfed690f0511fe76 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Fri, 22 Jan 2016 11:28:07 -0500
Subject: base: isa: Remove X86_32 dependency

Many motherboards utilize a LPC to ISA bridge in order to decode
ISA-style port-mapped I/O addresses. This is particularly true for
embedded motherboards supporting the PC/104 bus (a bus specification
derived from ISA).

These motherboards are now commonly running 64-bit x86 processors. The
X86_32 dependency should be removed from the ISA bus configuration
option in order to support these newer motherboards.

A new config option, CONFIG_ISA_BUS, is introduced to allow for the
compilation of the ISA bus driver independent of the CONFIG_ISA option.
Devices which communicate via ISA-compatible buses can now be supported
independent of the dependencies of the CONFIG_ISA option.

Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/Kconfig      | 6 ++++++
 drivers/base/Makefile | 2 +-
 include/linux/isa.h   | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2dc18605831f..a5977986f38b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2472,10 +2472,16 @@ config ISA_DMA_API
 	  Enables ISA-style DMA support for devices requiring such controllers.
 	  If unsure, say Y.
 
+config ISA_BUS
+	bool "ISA bus support"
+	help
+	  Enables ISA bus support for devices requiring such controllers.
+
 if X86_32
 
 config ISA
 	bool "ISA support"
+	depends on ISA_BUS
 	---help---
 	  Find out whether you have ISA slots on your motherboard.  ISA is the
 	  name of a bus system, i.e. the way the CPU talks to the other stuff
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 6b2a84e7f2be..4ebfb81cc7e9 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
-obj-$(CONFIG_ISA)	+= isa.o
+obj-$(CONFIG_ISA_BUS)	+= isa.o
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
 obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
diff --git a/include/linux/isa.h b/include/linux/isa.h
index b0270e3814c8..2a02862775eb 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -22,7 +22,7 @@ struct isa_driver {
 
 #define to_isa_driver(x) container_of((x), struct isa_driver, driver)
 
-#ifdef CONFIG_ISA
+#ifdef CONFIG_ISA_BUS
 int isa_register_driver(struct isa_driver *, unsigned int);
 void isa_unregister_driver(struct isa_driver *);
 #else
-- 
cgit v1.2.3


From f235541699bcf14fb8be797c6bc1d7106df0eb64 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 22 Jan 2016 01:32:26 -0500
Subject: export.h: allow for per-symbol configurable EXPORT_SYMBOL()

Similar to include/generated/autoconf.h, include/generated/autoksyms.h
will contain a list of defines for each EXPORT_SYMBOL() that we want
active. The format is:

  #define __KSYM_<symbol_name> 1

This list will be auto-generated with another patch.  For now we only
include the preprocessor magic to automatically create or omit the
corresponding struct kernel_symbol declaration.

Given the content of include/generated/autoksyms.h may not be known in
advance, an empty file is created early on to let the build proceed.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Makefile               |  2 ++
 include/linux/export.h | 22 ++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index 916b26e999d8..451acbebee97 100644
--- a/Makefile
+++ b/Makefile
@@ -998,6 +998,8 @@ prepare2: prepare3 outputmakefile asm-generic
 prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
                    include/config/auto.conf
 	$(cmd_crmodverdir)
+	$(Q)test -e include/generated/autoksyms.h || \
+	    touch   include/generated/autoksyms.h
 
 archprepare: archheaders archscripts prepare1 scripts_basic
 
diff --git a/include/linux/export.h b/include/linux/export.h
index 96e45ea463e7..77afdb2a2506 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -38,7 +38,7 @@ extern struct module __this_module;
 
 #ifdef CONFIG_MODULES
 
-#ifndef __GENKSYMS__
+#if defined(__KERNEL__) && !defined(__GENKSYMS__)
 #ifdef CONFIG_MODVERSIONS
 /* Mark the CRC weak since genksyms apparently decides not to
  * generate a checksums for some symbols */
@@ -53,7 +53,7 @@ extern struct module __this_module;
 #endif
 
 /* For every exported symbol, place a struct in the __ksymtab section */
-#define __EXPORT_SYMBOL(sym, sec)				\
+#define ___EXPORT_SYMBOL(sym, sec)				\
 	extern typeof(sym) sym;					\
 	__CRC_SYMBOL(sym, sec)					\
 	static const char __kstrtab_##sym[]			\
@@ -65,6 +65,24 @@ extern struct module __this_module;
 	__attribute__((section("___ksymtab" sec "+" #sym), unused))	\
 	= { (unsigned long)&sym, __kstrtab_##sym }
 
+#ifdef CONFIG_TRIM_UNUSED_KSYMS
+
+#include <linux/kconfig.h>
+#include <generated/autoksyms.h>
+
+#define __EXPORT_SYMBOL(sym, sec)				\
+	__cond_export_sym(sym, sec, config_enabled(__KSYM_##sym))
+#define __cond_export_sym(sym, sec, conf)			\
+	___cond_export_sym(sym, sec, conf)
+#define ___cond_export_sym(sym, sec, enabled)			\
+	__cond_export_sym_##enabled(sym, sec)
+#define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec)
+#define __cond_export_sym_0(sym, sec) /* nothing */
+
+#else
+#define __EXPORT_SYMBOL ___EXPORT_SYMBOL
+#endif
+
 #define EXPORT_SYMBOL(sym)					\
 	__EXPORT_SYMBOL(sym, "")
 
-- 
cgit v1.2.3


From c1a95fda2a40ae8c7aad3fa44fa7718a3710eb2d Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 22 Jan 2016 13:41:57 -0500
Subject: kbuild: add fine grained build dependencies for exported symbols

Like with kconfig options, we now have the ability to compile in and
out individual EXPORT_SYMBOL() declarations based on the content of
include/generated/autoksyms.h.  However we don't want the entire
world to be rebuilt whenever that file is touched.

Let's apply the same build dependency trick used for CONFIG_* symbols
where the time stamp of empty files whose paths matching those symbols
is used to trigger fine grained rebuilds. In our case the key is the
symbol name passed to EXPORT_SYMBOL().

However, unlike config options, we cannot just use fixdep to parse
the source code for EXPORT_SYMBOL(ksym) because several variants exist
and parsing them all in a separate tool, and keeping it in synch, is
not trivially maintainable.  Furthermore, there are variants such as

	EXPORT_SYMBOL_GPL(pci_user_read_config_##size);

that are instanciated via a macro for which we can't easily determine
the actual exported symbol name(s) short of actually running the
preprocessor on them.

Storing the symbol name string in a special ELF section doesn't work
for targets that output assembly or preprocessed source.

So the best way is really to leverage the preprocessor by having it
output actual symbol names anchored by a special sequence that can be
easily filtered out. Then the list of symbols is simply fed to fixdep
to be merged with the other dependencies.

That implies the preprocessor is executed twice for each source file.
A previous attempt relied on a warning pragma for each EXPORT_SYMBOL()
instance that was filtered apart from stderr by the build system with
a sed script during the actual compilation pass. Unfortunately the
preprocessor/compiler diagnostic output isn't stable between versions
and this solution, although more efficient, was deemed too fragile.

Because of the lowercasing performed by fixdep, there might be name
collisions triggering spurious rebuilds for similar symbols. But this
shouldn't be a big issue in practice. (This is the case for CONFIG_*
symbols and I didn't want to be different here, whatever the original
reason for doing so.)

To avoid needless build overhead, the exported symbol name gathering is
performed only when CONFIG_TRIM_UNUSED_KSYMS is selected.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/export.h | 13 ++++++++++++-
 scripts/Kbuild.include | 27 +++++++++++++++++++++++++++
 scripts/basic/fixdep.c |  1 +
 3 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index 77afdb2a2506..2f9ccbe6a639 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -65,7 +65,18 @@ extern struct module __this_module;
 	__attribute__((section("___ksymtab" sec "+" #sym), unused))	\
 	= { (unsigned long)&sym, __kstrtab_##sym }
 
-#ifdef CONFIG_TRIM_UNUSED_KSYMS
+#if defined(__KSYM_DEPS__)
+
+/*
+ * For fine grained build dependencies, we want to tell the build system
+ * about each possible exported symbol even if they're not actually exported.
+ * We use a string pattern that is unlikely to be valid code that the build
+ * system filters out from the preprocessor output (see ksym_dep_filter
+ * in scripts/Kbuild.include).
+ */
+#define __EXPORT_SYMBOL(sym, sec)	=== __KSYM_##sym ===
+
+#elif defined(CONFIG_TRIM_UNUSED_KSYMS)
 
 #include <linux/kconfig.h>
 #include <generated/autoksyms.h>
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 80ca538bfba9..a09927e02713 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -258,12 +258,39 @@ if_changed_dep = $(if $(strip $(any-prereq) $(arg-check) ),                  \
 	@set -e;                                                             \
 	$(cmd_and_fixdep), @:)
 
+ifndef CONFIG_TRIM_UNUSED_KSYMS
+
 cmd_and_fixdep =                                                             \
 	$(echo-cmd) $(cmd_$(1));                                             \
 	scripts/basic/fixdep $(depfile) $@ '$(make-cmd)' > $(dot-target).tmp;\
 	rm -f $(depfile);                                                    \
 	mv -f $(dot-target).tmp $(dot-target).cmd;
 
+else
+
+# Filter out exported kernel symbol names from the preprocessor output.
+# See also __KSYM_DEPS__ in include/linux/export.h.
+# We disable the depfile generation here, so as not to overwrite the existing
+# depfile while fixdep is parsing it.
+flags_nodeps = $(filter-out -Wp$(comma)-M%, $($(1)))
+ksym_dep_filter =                                                            \
+	case "$(1)" in                                                       \
+	cc_*_c) $(CPP) $(call flags_nodeps,c_flags) -D__KSYM_DEPS__ $< ;;    \
+	as_*_S) $(CPP) $(call flags_nodeps,a_flags) -D__KSYM_DEPS__ $< ;;    \
+	boot*|build*|*cpp_lds_S|dtc|host*|vdso*) : ;;                        \
+	*) echo "Don't know how to preprocess $(1)" >&2; false ;;            \
+	esac | sed -rn 's/^.*=== __KSYM_(.*) ===.*$$/KSYM_\1/p'
+
+cmd_and_fixdep =                                                             \
+	$(echo-cmd) $(cmd_$(1));                                             \
+	$(ksym_dep_filter) |                                                 \
+		scripts/basic/fixdep -e $(depfile) $@ '$(make-cmd)'          \
+			> $(dot-target).tmp;	                             \
+	rm -f $(depfile);                                                    \
+	mv -f $(dot-target).tmp $(dot-target).cmd;
+
+endif
+
 # Usage: $(call if_changed_rule,foo)
 # Will check if $(cmd_foo) or any of the prerequisites changed,
 # and if so will execute $(rule_foo).
diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c
index 7e90a1f7de0f..746ec1ece614 100644
--- a/scripts/basic/fixdep.c
+++ b/scripts/basic/fixdep.c
@@ -358,6 +358,7 @@ static void parse_dep_file(void *map, size_t len)
 
 			/* Ignore certain dependencies */
 			if (strrcmp(s, "include/generated/autoconf.h") &&
+			    strrcmp(s, "include/generated/autoksyms.h") &&
 			    strrcmp(s, "arch/um/include/uml-config.h") &&
 			    strrcmp(s, "include/linux/kconfig.h") &&
 			    strrcmp(s, ".ver")) {
-- 
cgit v1.2.3


From 6c96f05c8bb8bc4177613ef3c23a56b455e75887 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 23 Feb 2016 18:46:24 +0100
Subject: reset: Make [of_]reset_control_get[_foo] functions wrappers

With both the regular, _by_index and _optional variants we already have
quite a few variants of [of_]reset_control_get[_foo], the upcoming
addition of shared reset lines support makes this worse.

This commit changes all the variants into wrappers around common core
functions. For completeness sake this commit also adds a new
devm_get_reset_control_by_index wrapper.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  |  84 +++++++--------------------------
 include/linux/reset.h | 126 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 107 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index f15f150b79da..bdf1763da87a 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -136,18 +136,8 @@ int reset_control_status(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_status);
 
-/**
- * of_reset_control_get_by_index - Lookup and obtain a reference to a reset
- * controller by index.
- * @node: device to be reset by the controller
- * @index: index of the reset controller
- *
- * This is to be used to perform a list of resets for a device or power domain
- * in whatever order. Returns a struct reset_control or IS_ERR() condition
- * containing errno.
- */
-struct reset_control *of_reset_control_get_by_index(struct device_node *node,
-					   int index)
+struct reset_control *__of_reset_control_get(struct device_node *node,
+					     const char *id, int index)
 {
 	struct reset_control *rstc;
 	struct reset_controller_dev *r, *rcdev;
@@ -155,6 +145,16 @@ struct reset_control *of_reset_control_get_by_index(struct device_node *node,
 	int rstc_id;
 	int ret;
 
+	if (!node)
+		return ERR_PTR(-EINVAL);
+
+	if (id) {
+		index = of_property_match_string(node,
+						 "reset-names", id);
+		if (index < 0)
+			return ERR_PTR(-ENOENT);
+	}
+
 	ret = of_parse_phandle_with_args(node, "resets", "#reset-cells",
 					 index, &args);
 	if (ret)
@@ -200,49 +200,7 @@ struct reset_control *of_reset_control_get_by_index(struct device_node *node,
 
 	return rstc;
 }
-EXPORT_SYMBOL_GPL(of_reset_control_get_by_index);
-
-/**
- * of_reset_control_get - Lookup and obtain a reference to a reset controller.
- * @node: device to be reset by the controller
- * @id: reset line name
- *
- * Returns a struct reset_control or IS_ERR() condition containing errno.
- *
- * Use of id names is optional.
- */
-struct reset_control *of_reset_control_get(struct device_node *node,
-					   const char *id)
-{
-	int index = 0;
-
-	if (id) {
-		index = of_property_match_string(node,
-						 "reset-names", id);
-		if (index < 0)
-			return ERR_PTR(-ENOENT);
-	}
-	return of_reset_control_get_by_index(node, index);
-}
-EXPORT_SYMBOL_GPL(of_reset_control_get);
-
-/**
- * reset_control_get - Lookup and obtain a reference to a reset controller.
- * @dev: device to be reset by the controller
- * @id: reset line name
- *
- * Returns a struct reset_control or IS_ERR() condition containing errno.
- *
- * Use of id names is optional.
- */
-struct reset_control *reset_control_get(struct device *dev, const char *id)
-{
-	if (!dev)
-		return ERR_PTR(-EINVAL);
-
-	return of_reset_control_get(dev->of_node, id);
-}
-EXPORT_SYMBOL_GPL(reset_control_get);
+EXPORT_SYMBOL_GPL(__of_reset_control_get);
 
 /**
  * reset_control_put - free the reset controller
@@ -264,16 +222,8 @@ static void devm_reset_control_release(struct device *dev, void *res)
 	reset_control_put(*(struct reset_control **)res);
 }
 
-/**
- * devm_reset_control_get - resource managed reset_control_get()
- * @dev: device to be reset by the controller
- * @id: reset line name
- *
- * Managed reset_control_get(). For reset controllers returned from this
- * function, reset_control_put() is called automatically on driver detach.
- * See reset_control_get() for more information.
- */
-struct reset_control *devm_reset_control_get(struct device *dev, const char *id)
+struct reset_control *__devm_reset_control_get(struct device *dev,
+					       const char *id, int index)
 {
 	struct reset_control **ptr, *rstc;
 
@@ -282,7 +232,7 @@ struct reset_control *devm_reset_control_get(struct device *dev, const char *id)
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	rstc = reset_control_get(dev, id);
+	rstc = __of_reset_control_get(dev ? dev->of_node : NULL, id, index);
 	if (!IS_ERR(rstc)) {
 		*ptr = rstc;
 		devres_add(dev, ptr);
@@ -292,7 +242,7 @@ struct reset_control *devm_reset_control_get(struct device *dev, const char *id)
 
 	return rstc;
 }
-EXPORT_SYMBOL_GPL(devm_reset_control_get);
+EXPORT_SYMBOL_GPL(__devm_reset_control_get);
 
 /**
  * device_reset - find reset controller associated with the device
diff --git a/include/linux/reset.h b/include/linux/reset.h
index c4c097de0ba9..1bb69a29d6db 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -1,8 +1,8 @@
 #ifndef _LINUX_RESET_H_
 #define _LINUX_RESET_H_
 
-struct device;
-struct device_node;
+#include <linux/device.h>
+
 struct reset_control;
 
 #ifdef CONFIG_RESET_CONTROLLER
@@ -12,9 +12,11 @@ int reset_control_assert(struct reset_control *rstc);
 int reset_control_deassert(struct reset_control *rstc);
 int reset_control_status(struct reset_control *rstc);
 
-struct reset_control *reset_control_get(struct device *dev, const char *id);
+struct reset_control *__of_reset_control_get(struct device_node *node,
+					     const char *id, int index);
 void reset_control_put(struct reset_control *rstc);
-struct reset_control *devm_reset_control_get(struct device *dev, const char *id);
+struct reset_control *__devm_reset_control_get(struct device *dev,
+					       const char *id, int index);
 
 int __must_check device_reset(struct device *dev);
 
@@ -23,24 +25,6 @@ static inline int device_reset_optional(struct device *dev)
 	return device_reset(dev);
 }
 
-static inline struct reset_control *reset_control_get_optional(
-					struct device *dev, const char *id)
-{
-	return reset_control_get(dev, id);
-}
-
-static inline struct reset_control *devm_reset_control_get_optional(
-					struct device *dev, const char *id)
-{
-	return devm_reset_control_get(dev, id);
-}
-
-struct reset_control *of_reset_control_get(struct device_node *node,
-					   const char *id);
-
-struct reset_control *of_reset_control_get_by_index(
-					struct device_node *node, int index);
-
 #else
 
 static inline int reset_control_reset(struct reset_control *rstc)
@@ -77,44 +61,114 @@ static inline int device_reset_optional(struct device *dev)
 	return -ENOTSUPP;
 }
 
-static inline struct reset_control *__must_check reset_control_get(
-					struct device *dev, const char *id)
+static inline struct reset_control *__of_reset_control_get(
+					struct device_node *node,
+					const char *id, int index)
 {
-	WARN_ON(1);
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct reset_control *__must_check devm_reset_control_get(
-					struct device *dev, const char *id)
+static inline struct reset_control *__devm_reset_control_get(
+					struct device *dev,
+					const char *id, int index)
 {
-	WARN_ON(1);
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct reset_control *reset_control_get_optional(
+#endif /* CONFIG_RESET_CONTROLLER */
+
+/**
+ * reset_control_get - Lookup and obtain a reference to a reset controller.
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Returns a struct reset_control or IS_ERR() condition containing errno.
+ *
+ * Use of id names is optional.
+ */
+static inline struct reset_control *__must_check reset_control_get(
 					struct device *dev, const char *id)
 {
-	return ERR_PTR(-ENOTSUPP);
+#ifndef CONFIG_RESET_CONTROLLER
+	WARN_ON(1);
+#endif
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0);
 }
 
-static inline struct reset_control *devm_reset_control_get_optional(
+static inline struct reset_control *reset_control_get_optional(
 					struct device *dev, const char *id)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0);
 }
 
+/**
+ * of_reset_control_get - Lookup and obtain a reference to a reset controller.
+ * @node: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Returns a struct reset_control or IS_ERR() condition containing errno.
+ *
+ * Use of id names is optional.
+ */
 static inline struct reset_control *of_reset_control_get(
 				struct device_node *node, const char *id)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return __of_reset_control_get(node, id, 0);
 }
 
+/**
+ * of_reset_control_get_by_index - Lookup and obtain a reference to a reset
+ * controller by index.
+ * @node: device to be reset by the controller
+ * @index: index of the reset controller
+ *
+ * This is to be used to perform a list of resets for a device or power domain
+ * in whatever order. Returns a struct reset_control or IS_ERR() condition
+ * containing errno.
+ */
 static inline struct reset_control *of_reset_control_get_by_index(
-				struct device_node *node, int index)
+					struct device_node *node, int index)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return __of_reset_control_get(node, NULL, index);
 }
 
-#endif /* CONFIG_RESET_CONTROLLER */
+/**
+ * devm_reset_control_get - resource managed reset_control_get()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get(). For reset controllers returned from this
+ * function, reset_control_put() is called automatically on driver detach.
+ * See reset_control_get() for more information.
+ */
+static inline struct reset_control *__must_check devm_reset_control_get(
+					struct device *dev, const char *id)
+{
+#ifndef CONFIG_RESET_CONTROLLER
+	WARN_ON(1);
+#endif
+	return __devm_reset_control_get(dev, id, 0);
+}
+
+static inline struct reset_control *devm_reset_control_get_optional(
+					struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0);
+}
+
+/**
+ * devm_reset_control_get_by_index - resource managed reset_control_get
+ * @dev: device to be reset by the controller
+ * @index: index of the reset controller
+ *
+ * Managed reset_control_get(). For reset controllers returned from this
+ * function, reset_control_put() is called automatically on driver detach.
+ * See reset_control_get() for more information.
+ */
+static inline struct reset_control *devm_reset_control_get_by_index(
+					struct device *dev, int index)
+{
+	return __devm_reset_control_get(dev, NULL, index);
+}
 
 #endif
-- 
cgit v1.2.3


From c15ddec2ca06076a11195313aa1fce47d2a28c5d Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 23 Feb 2016 18:46:25 +0100
Subject: reset: Share struct reset_control between reset_control_get calls

Now that struct reset_control no longer stores the device pointer for
the device calling reset_control_get we can share a single struct
reset_control when multiple calls to reset_control_get are made for
the same reset line (same id / index).

This is a preparation patch for adding support for shared reset lines.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c             | 84 ++++++++++++++++++++++++++++++----------
 include/linux/reset-controller.h |  2 +
 2 files changed, 65 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index bdf1763da87a..957750600ff3 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -18,19 +18,23 @@
 #include <linux/reset-controller.h>
 #include <linux/slab.h>
 
-static DEFINE_MUTEX(reset_controller_list_mutex);
+static DEFINE_MUTEX(reset_list_mutex);
 static LIST_HEAD(reset_controller_list);
 
 /**
  * struct reset_control - a reset control
  * @rcdev: a pointer to the reset controller device
  *         this reset control belongs to
+ * @list: list entry for the rcdev's reset controller list
  * @id: ID of the reset controller in the reset
  *      controller device
+ * @refcnt: Number of gets of this reset_control
  */
 struct reset_control {
 	struct reset_controller_dev *rcdev;
+	struct list_head list;
 	unsigned int id;
+	unsigned int refcnt;
 };
 
 /**
@@ -62,9 +66,11 @@ int reset_controller_register(struct reset_controller_dev *rcdev)
 		rcdev->of_xlate = of_reset_simple_xlate;
 	}
 
-	mutex_lock(&reset_controller_list_mutex);
+	INIT_LIST_HEAD(&rcdev->reset_control_head);
+
+	mutex_lock(&reset_list_mutex);
 	list_add(&rcdev->list, &reset_controller_list);
-	mutex_unlock(&reset_controller_list_mutex);
+	mutex_unlock(&reset_list_mutex);
 
 	return 0;
 }
@@ -76,9 +82,9 @@ EXPORT_SYMBOL_GPL(reset_controller_register);
  */
 void reset_controller_unregister(struct reset_controller_dev *rcdev)
 {
-	mutex_lock(&reset_controller_list_mutex);
+	mutex_lock(&reset_list_mutex);
 	list_del(&rcdev->list);
-	mutex_unlock(&reset_controller_list_mutex);
+	mutex_unlock(&reset_list_mutex);
 }
 EXPORT_SYMBOL_GPL(reset_controller_unregister);
 
@@ -136,6 +142,48 @@ int reset_control_status(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_status);
 
+static struct reset_control *__reset_control_get(
+				struct reset_controller_dev *rcdev,
+				unsigned int index)
+{
+	struct reset_control *rstc;
+
+	lockdep_assert_held(&reset_list_mutex);
+
+	list_for_each_entry(rstc, &rcdev->reset_control_head, list) {
+		if (rstc->id == index) {
+			rstc->refcnt++;
+			return rstc;
+		}
+	}
+
+	rstc = kzalloc(sizeof(*rstc), GFP_KERNEL);
+	if (!rstc)
+		return ERR_PTR(-ENOMEM);
+
+	try_module_get(rcdev->owner);
+
+	rstc->rcdev = rcdev;
+	list_add(&rstc->list, &rcdev->reset_control_head);
+	rstc->id = index;
+	rstc->refcnt = 1;
+
+	return rstc;
+}
+
+static void __reset_control_put(struct reset_control *rstc)
+{
+	lockdep_assert_held(&reset_list_mutex);
+
+	if (--rstc->refcnt)
+		return;
+
+	module_put(rstc->rcdev->owner);
+
+	list_del(&rstc->list);
+	kfree(rstc);
+}
+
 struct reset_control *__of_reset_control_get(struct device_node *node,
 					     const char *id, int index)
 {
@@ -160,7 +208,7 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
 	if (ret)
 		return ERR_PTR(ret);
 
-	mutex_lock(&reset_controller_list_mutex);
+	mutex_lock(&reset_list_mutex);
 	rcdev = NULL;
 	list_for_each_entry(r, &reset_controller_list, list) {
 		if (args.np == r->of_node) {
@@ -171,32 +219,25 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
 	of_node_put(args.np);
 
 	if (!rcdev) {
-		mutex_unlock(&reset_controller_list_mutex);
+		mutex_unlock(&reset_list_mutex);
 		return ERR_PTR(-EPROBE_DEFER);
 	}
 
 	if (WARN_ON(args.args_count != rcdev->of_reset_n_cells)) {
-		mutex_unlock(&reset_controller_list_mutex);
+		mutex_unlock(&reset_list_mutex);
 		return ERR_PTR(-EINVAL);
 	}
 
 	rstc_id = rcdev->of_xlate(rcdev, &args);
 	if (rstc_id < 0) {
-		mutex_unlock(&reset_controller_list_mutex);
+		mutex_unlock(&reset_list_mutex);
 		return ERR_PTR(rstc_id);
 	}
 
-	try_module_get(rcdev->owner);
-	mutex_unlock(&reset_controller_list_mutex);
-
-	rstc = kzalloc(sizeof(*rstc), GFP_KERNEL);
-	if (!rstc) {
-		module_put(rcdev->owner);
-		return ERR_PTR(-ENOMEM);
-	}
+	/* reset_list_mutex also protects the rcdev's reset_control list */
+	rstc = __reset_control_get(rcdev, rstc_id);
 
-	rstc->rcdev = rcdev;
-	rstc->id = rstc_id;
+	mutex_unlock(&reset_list_mutex);
 
 	return rstc;
 }
@@ -212,8 +253,9 @@ void reset_control_put(struct reset_control *rstc)
 	if (IS_ERR(rstc))
 		return;
 
-	module_put(rstc->rcdev->owner);
-	kfree(rstc);
+	mutex_lock(&reset_list_mutex);
+	__reset_control_put(rstc);
+	mutex_unlock(&reset_list_mutex);
 }
 EXPORT_SYMBOL_GPL(reset_control_put);
 
diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h
index a3a5bcdb1d02..b91ba932bbd4 100644
--- a/include/linux/reset-controller.h
+++ b/include/linux/reset-controller.h
@@ -31,6 +31,7 @@ struct of_phandle_args;
  * @ops: a pointer to device specific struct reset_control_ops
  * @owner: kernel module of the reset controller driver
  * @list: internal list of reset controller devices
+ * @reset_control_head: head of internal list of requested reset controls
  * @of_node: corresponding device tree node as phandle target
  * @of_reset_n_cells: number of cells in reset line specifiers
  * @of_xlate: translation function to translate from specifier as found in the
@@ -41,6 +42,7 @@ struct reset_controller_dev {
 	const struct reset_control_ops *ops;
 	struct module *owner;
 	struct list_head list;
+	struct list_head reset_control_head;
 	struct device_node *of_node;
 	int of_reset_n_cells;
 	int (*of_xlate)(struct reset_controller_dev *rcdev,
-- 
cgit v1.2.3


From 0b52297f2288ca239e598afe6c92db83d8d2bfcd Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 23 Feb 2016 18:46:26 +0100
Subject: reset: Add support for shared reset controls

In some SoCs some hw-blocks share a reset control. Add support for this
setup by adding new:

reset_control_get_shared()
devm_reset_control_get_shared()
devm_reset_control_get_shared_by_index()

methods to get a reset_control. Note that this patch omits adding of_
variants, if these are needed later they can be easily added.

This patch also changes the behavior of the existing exclusive
reset_control_get() variants, if these are now called more then once
for the same reset_control they will return -EBUSY. To catch existing
drivers triggering this error (there should not be any) a WARN_ON(1)
is added in this path.

When a reset_control is shared, the behavior of reset_control_assert /
deassert is changed, for shared reset_controls these will work like the
clock-enable/disable and regulator-on/off functions. They will keep a
deassert_count, and only (re-)assert the reset after reset_control_assert
has been called as many times as reset_control_deassert was called.

Calling reset_control_assert without first calling reset_control_deassert
is not allowed on a shared reset control. Calling reset_control_reset is
also not allowed on a shared reset control.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  | 59 +++++++++++++++++++++++++------
 include/linux/reset.h | 96 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 129 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 957750600ff3..72b32bd15549 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -8,6 +8,7 @@
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  */
+#include <linux/atomic.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/export.h>
@@ -29,12 +30,16 @@ static LIST_HEAD(reset_controller_list);
  * @id: ID of the reset controller in the reset
  *      controller device
  * @refcnt: Number of gets of this reset_control
+ * @shared: Is this a shared (1), or an exclusive (0) reset_control?
+ * @deassert_cnt: Number of times this reset line has been deasserted
  */
 struct reset_control {
 	struct reset_controller_dev *rcdev;
 	struct list_head list;
 	unsigned int id;
 	unsigned int refcnt;
+	int shared;
+	atomic_t deassert_count;
 };
 
 /**
@@ -91,9 +96,14 @@ EXPORT_SYMBOL_GPL(reset_controller_unregister);
 /**
  * reset_control_reset - reset the controlled device
  * @rstc: reset controller
+ *
+ * Calling this on a shared reset controller is an error.
  */
 int reset_control_reset(struct reset_control *rstc)
 {
+	if (WARN_ON(rstc->shared))
+		return -EINVAL;
+
 	if (rstc->rcdev->ops->reset)
 		return rstc->rcdev->ops->reset(rstc->rcdev, rstc->id);
 
@@ -104,26 +114,48 @@ EXPORT_SYMBOL_GPL(reset_control_reset);
 /**
  * reset_control_assert - asserts the reset line
  * @rstc: reset controller
+ *
+ * Calling this on an exclusive reset controller guarantees that the reset
+ * will be asserted. When called on a shared reset controller the line may
+ * still be deasserted, as long as other users keep it so.
+ *
+ * For shared reset controls a driver cannot expect the hw's registers and
+ * internal state to be reset, but must be prepared for this to happen.
  */
 int reset_control_assert(struct reset_control *rstc)
 {
-	if (rstc->rcdev->ops->assert)
-		return rstc->rcdev->ops->assert(rstc->rcdev, rstc->id);
+	if (!rstc->rcdev->ops->assert)
+		return -ENOTSUPP;
 
-	return -ENOTSUPP;
+	if (rstc->shared) {
+		if (WARN_ON(atomic_read(&rstc->deassert_count) == 0))
+			return -EINVAL;
+
+		if (atomic_dec_return(&rstc->deassert_count) != 0)
+			return 0;
+	}
+
+	return rstc->rcdev->ops->assert(rstc->rcdev, rstc->id);
 }
 EXPORT_SYMBOL_GPL(reset_control_assert);
 
 /**
  * reset_control_deassert - deasserts the reset line
  * @rstc: reset controller
+ *
+ * After calling this function, the reset is guaranteed to be deasserted.
  */
 int reset_control_deassert(struct reset_control *rstc)
 {
-	if (rstc->rcdev->ops->deassert)
-		return rstc->rcdev->ops->deassert(rstc->rcdev, rstc->id);
+	if (!rstc->rcdev->ops->deassert)
+		return -ENOTSUPP;
 
-	return -ENOTSUPP;
+	if (rstc->shared) {
+		if (atomic_inc_return(&rstc->deassert_count) != 1)
+			return 0;
+	}
+
+	return rstc->rcdev->ops->deassert(rstc->rcdev, rstc->id);
 }
 EXPORT_SYMBOL_GPL(reset_control_deassert);
 
@@ -144,7 +176,7 @@ EXPORT_SYMBOL_GPL(reset_control_status);
 
 static struct reset_control *__reset_control_get(
 				struct reset_controller_dev *rcdev,
-				unsigned int index)
+				unsigned int index, int shared)
 {
 	struct reset_control *rstc;
 
@@ -152,6 +184,9 @@ static struct reset_control *__reset_control_get(
 
 	list_for_each_entry(rstc, &rcdev->reset_control_head, list) {
 		if (rstc->id == index) {
+			if (WARN_ON(!rstc->shared || !shared))
+				return ERR_PTR(-EBUSY);
+
 			rstc->refcnt++;
 			return rstc;
 		}
@@ -167,6 +202,7 @@ static struct reset_control *__reset_control_get(
 	list_add(&rstc->list, &rcdev->reset_control_head);
 	rstc->id = index;
 	rstc->refcnt = 1;
+	rstc->shared = shared;
 
 	return rstc;
 }
@@ -185,7 +221,7 @@ static void __reset_control_put(struct reset_control *rstc)
 }
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-					     const char *id, int index)
+				     const char *id, int index, int shared)
 {
 	struct reset_control *rstc;
 	struct reset_controller_dev *r, *rcdev;
@@ -235,7 +271,7 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
 	}
 
 	/* reset_list_mutex also protects the rcdev's reset_control list */
-	rstc = __reset_control_get(rcdev, rstc_id);
+	rstc = __reset_control_get(rcdev, rstc_id, shared);
 
 	mutex_unlock(&reset_list_mutex);
 
@@ -265,7 +301,7 @@ static void devm_reset_control_release(struct device *dev, void *res)
 }
 
 struct reset_control *__devm_reset_control_get(struct device *dev,
-					       const char *id, int index)
+				     const char *id, int index, int shared)
 {
 	struct reset_control **ptr, *rstc;
 
@@ -274,7 +310,8 @@ struct reset_control *__devm_reset_control_get(struct device *dev,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	rstc = __of_reset_control_get(dev ? dev->of_node : NULL, id, index);
+	rstc = __of_reset_control_get(dev ? dev->of_node : NULL,
+				      id, index, shared);
 	if (!IS_ERR(rstc)) {
 		*ptr = rstc;
 		devres_add(dev, ptr);
diff --git a/include/linux/reset.h b/include/linux/reset.h
index 1bb69a29d6db..a552134a209e 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -13,10 +13,10 @@ int reset_control_deassert(struct reset_control *rstc);
 int reset_control_status(struct reset_control *rstc);
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-					     const char *id, int index);
+				     const char *id, int index, int shared);
 void reset_control_put(struct reset_control *rstc);
 struct reset_control *__devm_reset_control_get(struct device *dev,
-					       const char *id, int index);
+				     const char *id, int index, int shared);
 
 int __must_check device_reset(struct device *dev);
 
@@ -63,14 +63,14 @@ static inline int device_reset_optional(struct device *dev)
 
 static inline struct reset_control *__of_reset_control_get(
 					struct device_node *node,
-					const char *id, int index)
+					const char *id, int index, int shared)
 {
 	return ERR_PTR(-EINVAL);
 }
 
 static inline struct reset_control *__devm_reset_control_get(
 					struct device *dev,
-					const char *id, int index)
+					const char *id, int index, int shared)
 {
 	return ERR_PTR(-EINVAL);
 }
@@ -78,11 +78,17 @@ static inline struct reset_control *__devm_reset_control_get(
 #endif /* CONFIG_RESET_CONTROLLER */
 
 /**
- * reset_control_get - Lookup and obtain a reference to a reset controller.
+ * reset_control_get - Lookup and obtain an exclusive reference to a
+ *                     reset controller.
  * @dev: device to be reset by the controller
  * @id: reset line name
  *
  * Returns a struct reset_control or IS_ERR() condition containing errno.
+ * If this function is called more then once for the same reset_control it will
+ * return -EBUSY.
+ *
+ * See reset_control_get_shared for details on shared references to
+ * reset-controls.
  *
  * Use of id names is optional.
  */
@@ -92,17 +98,46 @@ static inline struct reset_control *__must_check reset_control_get(
 #ifndef CONFIG_RESET_CONTROLLER
 	WARN_ON(1);
 #endif
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 0);
 }
 
 static inline struct reset_control *reset_control_get_optional(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 0);
 }
 
 /**
- * of_reset_control_get - Lookup and obtain a reference to a reset controller.
+ * reset_control_get_shared - Lookup and obtain a shared reference to a
+ *                            reset controller.
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Returns a struct reset_control or IS_ERR() condition containing errno.
+ * This function is intended for use with reset-controls which are shared
+ * between hardware-blocks.
+ *
+ * When a reset-control is shared, the behavior of reset_control_assert /
+ * deassert is changed, the reset-core will keep track of a deassert_count
+ * and only (re-)assert the reset after reset_control_assert has been called
+ * as many times as reset_control_deassert was called. Also see the remark
+ * about shared reset-controls in the reset_control_assert docs.
+ *
+ * Calling reset_control_assert without first calling reset_control_deassert
+ * is not allowed on a shared reset control. Calling reset_control_reset is
+ * also not allowed on a shared reset control.
+ *
+ * Use of id names is optional.
+ */
+static inline struct reset_control *reset_control_get_shared(
+					struct device *dev, const char *id)
+{
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 1);
+}
+
+/**
+ * of_reset_control_get - Lookup and obtain an exclusive reference to a
+ *                        reset controller.
  * @node: device to be reset by the controller
  * @id: reset line name
  *
@@ -113,12 +148,12 @@ static inline struct reset_control *reset_control_get_optional(
 static inline struct reset_control *of_reset_control_get(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0);
+	return __of_reset_control_get(node, id, 0, 0);
 }
 
 /**
- * of_reset_control_get_by_index - Lookup and obtain a reference to a reset
- * controller by index.
+ * of_reset_control_get_by_index - Lookup and obtain an exclusive reference to
+ *                                 a reset controller by index.
  * @node: device to be reset by the controller
  * @index: index of the reset controller
  *
@@ -129,7 +164,7 @@ static inline struct reset_control *of_reset_control_get(
 static inline struct reset_control *of_reset_control_get_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index);
+	return __of_reset_control_get(node, NULL, index, 0);
 }
 
 /**
@@ -147,13 +182,13 @@ static inline struct reset_control *__must_check devm_reset_control_get(
 #ifndef CONFIG_RESET_CONTROLLER
 	WARN_ON(1);
 #endif
-	return __devm_reset_control_get(dev, id, 0);
+	return __devm_reset_control_get(dev, id, 0, 0);
 }
 
 static inline struct reset_control *devm_reset_control_get_optional(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0);
+	return __devm_reset_control_get(dev, id, 0, 0);
 }
 
 /**
@@ -168,7 +203,38 @@ static inline struct reset_control *devm_reset_control_get_optional(
 static inline struct reset_control *devm_reset_control_get_by_index(
 					struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index);
+	return __devm_reset_control_get(dev, NULL, index, 0);
+}
+
+/**
+ * devm_reset_control_get_shared - resource managed reset_control_get_shared()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed reset_control_get_shared(). For reset controllers returned from
+ * this function, reset_control_put() is called automatically on driver detach.
+ * See reset_control_get_shared() for more information.
+ */
+static inline struct reset_control *devm_reset_control_get_shared(
+					struct device *dev, const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, 1);
+}
+
+/**
+ * devm_reset_control_get_shared_by_index - resource managed
+ * reset_control_get_shared
+ * @dev: device to be reset by the controller
+ * @index: index of the reset controller
+ *
+ * Managed reset_control_get_shared(). For reset controllers returned from
+ * this function, reset_control_put() is called automatically on driver detach.
+ * See reset_control_get_shared() for more information.
+ */
+static inline struct reset_control *devm_reset_control_get_shared_by_index(
+					struct device *dev, int index)
+{
+	return __devm_reset_control_get(dev, NULL, index, 1);
 }
 
 #endif
-- 
cgit v1.2.3


From 44debe7a123cc760fc90ccbe253210798c917fa7 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Wed, 30 Mar 2016 11:26:35 +0200
Subject: vgacon: dummy implementation for vgacon_text_force
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows us to ditch a ton of ugly #ifdefs from a bunch of drm modeset
drivers.

v2: Make the dummy function actually return a sane value, spotted by
Ville.

v3: Because the patch is still in limbo there's no more drivers to
convert, noticed by Emil.

v4: Rebase once more, because hooray. I'll just go ahead an apply this
one later on to drm-misc.

Cc: Emil Velikov <emil.l.velikov@gmail.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 --
 drivers/gpu/drm/ast/ast_drv.c           | 2 --
 drivers/gpu/drm/cirrus/cirrus_drv.c     | 2 --
 drivers/gpu/drm/i915/i915_drv.c         | 2 --
 drivers/gpu/drm/mgag200/mgag200_drv.c   | 2 --
 drivers/gpu/drm/nouveau/nouveau_drm.c   | 2 --
 drivers/gpu/drm/qxl/qxl_drv.c           | 2 --
 drivers/gpu/drm/radeon/radeon_drv.c     | 2 --
 drivers/gpu/drm/virtio/virtgpu_drv.c    | 2 --
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c     | 2 --
 include/linux/console.h                 | 2 ++
 11 files changed, 2 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ce79a8b605a0..fba20bd59cfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -539,12 +539,10 @@ static struct pci_driver amdgpu_kms_pci_driver = {
 
 static int __init amdgpu_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force()) {
 		DRM_ERROR("VGACON disables amdgpu kernel modesetting.\n");
 		return -EINVAL;
 	}
-#endif
 	DRM_INFO("amdgpu kernel modesetting enabled.\n");
 	driver = &kms_driver;
 	pdriver = &amdgpu_kms_pci_driver;
diff --git a/drivers/gpu/drm/ast/ast_drv.c b/drivers/gpu/drm/ast/ast_drv.c
index 9a32d9dfdd26..fcd9c0714836 100644
--- a/drivers/gpu/drm/ast/ast_drv.c
+++ b/drivers/gpu/drm/ast/ast_drv.c
@@ -218,10 +218,8 @@ static struct drm_driver driver = {
 
 static int __init ast_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && ast_modeset == -1)
 		return -EINVAL;
-#endif
 
 	if (ast_modeset == 0)
 		return -EINVAL;
diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.c b/drivers/gpu/drm/cirrus/cirrus_drv.c
index b1619e29a564..b394e6d8f01e 100644
--- a/drivers/gpu/drm/cirrus/cirrus_drv.c
+++ b/drivers/gpu/drm/cirrus/cirrus_drv.c
@@ -162,10 +162,8 @@ static struct pci_driver cirrus_pci_driver = {
 
 static int __init cirrus_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && cirrus_modeset == -1)
 		return -EINVAL;
-#endif
 
 	if (cirrus_modeset == 0)
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 44912ecebc1a..8a62690e6513 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1750,10 +1750,8 @@ static int __init i915_init(void)
 	if (i915.modeset == 0)
 		driver.driver_features &= ~DRIVER_MODESET;
 
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && i915.modeset == -1)
 		driver.driver_features &= ~DRIVER_MODESET;
-#endif
 
 	if (!(driver.driver_features & DRIVER_MODESET)) {
 		/* Silently fail loading to not upset userspace. */
diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.c b/drivers/gpu/drm/mgag200/mgag200_drv.c
index b0af77454d52..ebb470ff7200 100644
--- a/drivers/gpu/drm/mgag200/mgag200_drv.c
+++ b/drivers/gpu/drm/mgag200/mgag200_drv.c
@@ -116,10 +116,8 @@ static struct pci_driver mgag200_pci_driver = {
 
 static int __init mgag200_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && mgag200_modeset == -1)
 		return -EINVAL;
-#endif
 
 	if (mgag200_modeset == 0)
 		return -EINVAL;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index bb8498c9b13e..731c5c2a8933 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -1082,10 +1082,8 @@ nouveau_drm_init(void)
 	nouveau_display_options();
 
 	if (nouveau_modeset == -1) {
-#ifdef CONFIG_VGA_CONSOLE
 		if (vgacon_text_force())
 			nouveau_modeset = 0;
-#endif
 	}
 
 	if (!nouveau_modeset)
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index 7307b07fe06b..dc9df5fe50ba 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -272,10 +272,8 @@ static struct drm_driver qxl_driver = {
 
 static int __init qxl_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && qxl_modeset == -1)
 		return -EINVAL;
-#endif
 
 	if (qxl_modeset == 0)
 		return -EINVAL;
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index cad25557650f..ad136fc081c8 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -558,12 +558,10 @@ static struct pci_driver radeon_kms_pci_driver = {
 
 static int __init radeon_init(void)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && radeon_modeset == -1) {
 		DRM_INFO("VGACON disable radeon kernel modesetting.\n");
 		radeon_modeset = 0;
 	}
-#endif
 	/* set to modesetting by default if not nomodeset */
 	if (radeon_modeset == -1)
 		radeon_modeset = 1;
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index 7f898cfdc746..3cc7afa77a35 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -42,10 +42,8 @@ module_param_named(modeset, virtio_gpu_modeset, int, 0400);
 
 static int virtio_gpu_probe(struct virtio_device *vdev)
 {
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force() && virtio_gpu_modeset == -1)
 		return -EINVAL;
-#endif
 
 	if (virtio_gpu_modeset == 0)
 		return -EINVAL;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 0ee76e523a90..fa10395e2a18 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1529,10 +1529,8 @@ static int __init vmwgfx_init(void)
 {
 	int ret;
 
-#ifdef CONFIG_VGA_CONSOLE
 	if (vgacon_text_force())
 		return -EINVAL;
-#endif
 
 	ret = drm_pci_init(&driver, &vmw_pci_driver);
 	if (ret)
diff --git a/include/linux/console.h b/include/linux/console.h
index ea731af2451e..e49cc1ef19be 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -191,6 +191,8 @@ void vcs_remove_sysfs(int index);
 
 #ifdef CONFIG_VGA_CONSOLE
 extern bool vgacon_text_force(void);
+#else
+static inline bool vgacon_text_force(void) { return false; }
 #endif
 
 #endif /* _LINUX_CONSOLE_H */
-- 
cgit v1.2.3


From e8b123e6008480b2b8d80dae060315d84b79f4bb Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn@kryo.se>
Date: Thu, 24 Dec 2015 00:28:38 -0800
Subject: soc: qcom: smem_state: Add stubs for disabled smem_state

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 include/linux/soc/qcom/smem_state.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smem_state.h b/include/linux/soc/qcom/smem_state.h
index f35e1512fcaa..7b88697929e9 100644
--- a/include/linux/soc/qcom/smem_state.h
+++ b/include/linux/soc/qcom/smem_state.h
@@ -1,12 +1,17 @@
 #ifndef __QCOM_SMEM_STATE__
 #define __QCOM_SMEM_STATE__
 
+#include <linux/errno.h>
+
+struct device_node;
 struct qcom_smem_state;
 
 struct qcom_smem_state_ops {
 	int (*update_bits)(void *, u32, u32);
 };
 
+#ifdef CONFIG_QCOM_SMEM_STATE
+
 struct qcom_smem_state *qcom_smem_state_get(struct device *dev, const char *con_id, unsigned *bit);
 void qcom_smem_state_put(struct qcom_smem_state *);
 
@@ -15,4 +20,34 @@ int qcom_smem_state_update_bits(struct qcom_smem_state *state, u32 mask, u32 val
 struct qcom_smem_state *qcom_smem_state_register(struct device_node *of_node, const struct qcom_smem_state_ops *ops, void *data);
 void qcom_smem_state_unregister(struct qcom_smem_state *state);
 
+#else
+
+static inline struct qcom_smem_state *qcom_smem_state_get(struct device *dev,
+	const char *con_id, unsigned *bit)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void qcom_smem_state_put(struct qcom_smem_state *state)
+{
+}
+
+static inline int qcom_smem_state_update_bits(struct qcom_smem_state *state,
+	u32 mask, u32 value)
+{
+	return -EINVAL;
+}
+
+static inline struct qcom_smem_state *qcom_smem_state_register(struct device_node *of_node,
+	const struct qcom_smem_state_ops *ops, void *data)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void qcom_smem_state_unregister(struct qcom_smem_state *state)
+{
+}
+
+#endif
+
 #endif
-- 
cgit v1.2.3


From 39f0db298e7c02a29371fb39cabdd5d76e6b726c Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Wed, 17 Feb 2016 22:39:02 -0800
Subject: soc: qcom: smd: Introduce callback setter

Introduce a setter for the callback function pointer to clarify the
locking around the operation and to reduce some duplication.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd.c       | 25 +++++++++++++++++--------
 include/linux/soc/qcom/smd.h |  4 +++-
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index 498fd0581a45..c357842b92e1 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -186,7 +186,7 @@ struct qcom_smd_channel {
 	int fifo_size;
 
 	void *bounce_buffer;
-	int (*cb)(struct qcom_smd_device *, const void *, size_t);
+	qcom_smd_cb_t cb;
 
 	spinlock_t recv_lock;
 
@@ -377,6 +377,19 @@ static void qcom_smd_channel_reset(struct qcom_smd_channel *channel)
 	channel->pkt_size = 0;
 }
 
+/*
+ * Set the callback for a channel, with appropriate locking
+ */
+static void qcom_smd_channel_set_callback(struct qcom_smd_channel *channel,
+					  qcom_smd_cb_t cb)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&channel->recv_lock, flags);
+	channel->cb = cb;
+	spin_unlock_irqrestore(&channel->recv_lock, flags);
+};
+
 /*
  * Calculate the amount of data available in the rx fifo
  */
@@ -814,8 +827,7 @@ static int qcom_smd_dev_probe(struct device *dev)
 	if (!channel->bounce_buffer)
 		return -ENOMEM;
 
-	channel->cb = qsdrv->callback;
-
+	qcom_smd_channel_set_callback(channel, qsdrv->callback);
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENING);
 
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENED);
@@ -831,7 +843,7 @@ static int qcom_smd_dev_probe(struct device *dev)
 err:
 	dev_err(&qsdev->dev, "probe failed\n");
 
-	channel->cb = NULL;
+	qcom_smd_channel_set_callback(channel, NULL);
 	kfree(channel->bounce_buffer);
 	channel->bounce_buffer = NULL;
 
@@ -850,16 +862,13 @@ static int qcom_smd_dev_remove(struct device *dev)
 	struct qcom_smd_device *qsdev = to_smd_device(dev);
 	struct qcom_smd_driver *qsdrv = to_smd_driver(dev);
 	struct qcom_smd_channel *channel = qsdev->channel;
-	unsigned long flags;
 
 	qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSING);
 
 	/*
 	 * Make sure we don't race with the code receiving data.
 	 */
-	spin_lock_irqsave(&channel->recv_lock, flags);
-	channel->cb = NULL;
-	spin_unlock_irqrestore(&channel->recv_lock, flags);
+	qcom_smd_channel_set_callback(channel, NULL);
 
 	/* Wake up any sleepers in qcom_smd_send() */
 	wake_up_interruptible(&channel->fblockread_event);
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index d0cb6d189a0a..65a64fcdb1aa 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -26,6 +26,8 @@ struct qcom_smd_device {
 	struct qcom_smd_channel *channel;
 };
 
+typedef int (*qcom_smd_cb_t)(struct qcom_smd_device *, const void *, size_t);
+
 /**
  * struct qcom_smd_driver - smd driver struct
  * @driver:	underlying device driver
@@ -42,7 +44,7 @@ struct qcom_smd_driver {
 
 	int (*probe)(struct qcom_smd_device *dev);
 	void (*remove)(struct qcom_smd_device *dev);
-	int (*callback)(struct qcom_smd_device *, const void *, size_t);
+	qcom_smd_cb_t callback;
 };
 
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
-- 
cgit v1.2.3


From 028021d29ea069390e1f60c6aa5b3511d218454b Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Wed, 17 Feb 2016 22:39:06 -0800
Subject: soc: qcom: smd: Support opening additional channels

With the qcom_smd_open_channel() API we allow SMD devices to open
additional SMD channels, to allow implementation of multi-channel SMD
devices - like Bluetooth.

Channels are opened from the same edge as the calling SMD device is tied
to.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd.c       | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smd.h |  4 +++
 2 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index c3fa0fd724f7..b6434c4be86a 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -129,6 +129,8 @@ struct qcom_smd_edge {
 
 	unsigned smem_available;
 
+	wait_queue_head_t new_channel_event;
+
 	struct work_struct scan_work;
 	struct work_struct state_work;
 };
@@ -1042,6 +1044,77 @@ void qcom_smd_driver_unregister(struct qcom_smd_driver *qsdrv)
 }
 EXPORT_SYMBOL(qcom_smd_driver_unregister);
 
+static struct qcom_smd_channel *
+qcom_smd_find_channel(struct qcom_smd_edge *edge, const char *name)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_channel *ret = NULL;
+	unsigned long flags;
+	unsigned state;
+
+	spin_lock_irqsave(&edge->channels_lock, flags);
+	list_for_each_entry(channel, &edge->channels, list) {
+		if (strcmp(channel->name, name))
+			continue;
+
+		state = GET_RX_CHANNEL_INFO(channel, state);
+		if (state != SMD_CHANNEL_OPENING &&
+		    state != SMD_CHANNEL_OPENED)
+			continue;
+
+		ret = channel;
+		break;
+	}
+	spin_unlock_irqrestore(&edge->channels_lock, flags);
+
+	return ret;
+}
+
+/**
+ * qcom_smd_open_channel() - claim additional channels on the same edge
+ * @sdev:	smd_device handle
+ * @name:	channel name
+ * @cb:		callback method to use for incoming data
+ *
+ * Returns a channel handle on success, or -EPROBE_DEFER if the channel isn't
+ * ready.
+ */
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+					       const char *name,
+					       qcom_smd_cb_t cb)
+{
+	struct qcom_smd_channel *channel;
+	struct qcom_smd_edge *edge = sdev->channel->edge;
+	int ret;
+
+	/* Wait up to HZ for the channel to appear */
+	ret = wait_event_interruptible_timeout(edge->new_channel_event,
+			(channel = qcom_smd_find_channel(edge, name)) != NULL,
+			HZ);
+	if (!ret)
+		return ERR_PTR(-ETIMEDOUT);
+
+	if (channel->state != SMD_CHANNEL_CLOSED) {
+		dev_err(&sdev->dev, "channel %s is busy\n", channel->name);
+		return ERR_PTR(-EBUSY);
+	}
+
+	channel->qsdev = sdev;
+	ret = qcom_smd_channel_open(channel, cb);
+	if (ret) {
+		channel->qsdev = NULL;
+		return ERR_PTR(ret);
+	}
+
+	/*
+	 * Append the list of channel to the channels associated with the sdev
+	 */
+	list_add_tail(&channel->dev_list, &sdev->channel->dev_list);
+
+	return channel;
+}
+EXPORT_SYMBOL(qcom_smd_open_channel);
+
 /*
  * Allocate the qcom_smd_channel object for a newly found smd channel,
  * retrieving and validating the smem items involved.
@@ -1178,6 +1251,8 @@ static void qcom_channel_scan_worker(struct work_struct *work)
 
 			dev_dbg(smd->dev, "new channel found: '%s'\n", channel->name);
 			set_bit(i, edge->allocated[tbl]);
+
+			wake_up_interruptible(&edge->new_channel_event);
 		}
 	}
 
@@ -1341,6 +1416,7 @@ static int qcom_smd_probe(struct platform_device *pdev)
 	for_each_available_child_of_node(pdev->dev.of_node, node) {
 		edge = &smd->edges[i++];
 		edge->smd = smd;
+		init_waitqueue_head(&edge->new_channel_event);
 
 		ret = qcom_smd_parse_edge(&pdev->dev, node, edge);
 		if (ret)
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index 65a64fcdb1aa..bd51c8a9d807 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -56,4 +56,8 @@ void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
 int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+					       const char *name,
+					       qcom_smd_cb_t cb);
+
 #endif
-- 
cgit v1.2.3


From b8a7a3a6674725d7ca0ff6e322f6c1cab6e6a11d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 24 Mar 2016 14:38:37 +0100
Subject: posix_acl: Inode acl caching fixes

When get_acl() is called for an inode whose ACL is not cached yet, the
get_acl inode operation is called to fetch the ACL from the filesystem.
The inode operation is responsible for updating the cached acl with
set_cached_acl().  This is done without locking at the VFS level, so
another task can call set_cached_acl() or forget_cached_acl() before the
get_acl inode operation gets to calling set_cached_acl(), and then
get_acl's call to set_cached_acl() results in caching an outdate ACL.

Prevent this from happening by setting the cached ACL pointer to a
task-specific sentinel value before calling the get_acl inode operation.
Move the responsibility for updating the cached ACL from the get_acl
inode operations to get_acl().  There, only set the cached ACL if the
sentinel value hasn't changed.

The sentinel values are chosen to have odd values.  Likewise, the value
of ACL_NOT_CACHED is odd.  In contrast, ACL object pointers always have
an even value (ACLs are aligned in memory).  This allows to distinguish
uncached ACLs values from ACL objects.

In addition, switch from guarding inode->i_acl and inode->i_default_acl
upates by the inode->i_lock spinlock to using xchg() and cmpxchg().

Filesystems that do not want ACLs returned from their get_acl inode
operations to be cached must call forget_cached_acl() to prevent the VFS
from doing so.

(Patch written by Al Viro and Andreas Gruenbacher.)

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/acl.c             |   2 +-
 fs/btrfs/acl.c          |   3 --
 fs/ceph/acl.c           |   2 +
 fs/ext2/acl.c           |   3 --
 fs/ext4/acl.c           |   3 --
 fs/f2fs/acl.c           |   3 --
 fs/hfsplus/posix_acl.c  |   3 --
 fs/inode.c              |   4 +-
 fs/jffs2/acl.c          |   2 -
 fs/jfs/acl.c            |   2 -
 fs/namei.c              |   2 +-
 fs/nfs/nfs3acl.c        |  43 +++++++++++++++++++-
 fs/ocfs2/dlmglue.c      |   3 ++
 fs/posix_acl.c          | 103 ++++++++++++++++++++++++++++++++----------------
 fs/reiserfs/xattr_acl.c |   6 +--
 fs/xfs/xfs_acl.c        |  20 +++-------
 include/linux/fs.h      |  12 ++++++
 17 files changed, 138 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 9da967f38387..2d94e94b6b59 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -93,7 +93,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
 	 * instantiating the inode (v9fs_inode_from_fid)
 	 */
 	acl = get_cached_acl(inode, type);
-	BUG_ON(acl == ACL_NOT_CACHED);
+	BUG_ON(is_uncached_acl(acl));
 	return acl;
 }
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6d263bb1621c..67a607709d4f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -63,9 +63,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	}
 	kfree(value);
 
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
 	return acl;
 }
 
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index f19708487e2f..5457f216e2e5 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -37,6 +37,8 @@ static inline void ceph_set_cached_acl(struct inode *inode,
 	spin_lock(&ci->i_ceph_lock);
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
 		set_cached_acl(inode, type, acl);
+	else
+		forget_cached_acl(inode, type);
 	spin_unlock(&ci->i_ceph_lock);
 }
 
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 27695e6f4e46..42f1d1814083 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -172,9 +172,6 @@ ext2_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
 	return acl;
 }
 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 69b1e73026a5..c6601a476c02 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -172,9 +172,6 @@ ext4_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
 	return acl;
 }
 
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index c8f25f7241f0..6f1fdda977b3 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -190,9 +190,6 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
 	return acl;
 }
 
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index afb33eda6d7d..ab7ea2506b4d 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -48,9 +48,6 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
 
 	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
 
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
-
 	return acl;
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 69b8b526c194..4202aac99464 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -238,9 +238,9 @@ void __destroy_inode(struct inode *inode)
 	}
 
 #ifdef CONFIG_FS_POSIX_ACL
-	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
 		posix_acl_release(inode->i_acl);
-	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
 		posix_acl_release(inode->i_default_acl);
 #endif
 	this_cpu_dec(nr_inodes);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 2f7a3c090489..bc2693d56298 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -203,8 +203,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(rc);
 	}
 	kfree(value);
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index ab4882801b24..21fa92ba2c19 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -63,8 +63,6 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 		acl = posix_acl_from_xattr(&init_user_ns, value, size);
 	}
 	kfree(value);
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..3498d53de26f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -265,7 +265,7 @@ static int check_acl(struct inode *inode, int mask)
 	        if (!acl)
 	                return -EAGAIN;
 		/* no ->get_acl() calls in RCU mode... */
-		if (acl == ACL_NOT_CACHED)
+		if (is_uncached_acl(acl))
 			return -ECHILD;
 	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 	}
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 17c0fa1eccfa..720d92f5abfb 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -11,6 +11,38 @@
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
+/*
+ * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for
+ * caching get_acl results in a race-free way.  See fs/posix_acl.c:get_acl()
+ * for explanations.
+ */
+static void nfs3_prepare_get_acl(struct posix_acl **p)
+{
+	struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+	if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) {
+		/* Not the first reader or sentinel already in place. */
+	}
+}
+
+static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
+{
+	struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+	/* Only cache the ACL if our sentinel is still in place. */
+	posix_acl_dup(acl);
+	if (cmpxchg(p, sentinel, acl) != sentinel)
+		posix_acl_release(acl);
+}
+
+static void nfs3_abort_get_acl(struct posix_acl **p)
+{
+	struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+	/* Remove our sentinel upon failure. */
+	cmpxchg(p, sentinel, ACL_NOT_CACHED);
+}
+
 struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -55,6 +87,11 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
 	if (res.fattr == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	if (args.mask & NFS_ACL)
+		nfs3_prepare_get_acl(&inode->i_acl);
+	if (args.mask & NFS_DFACL)
+		nfs3_prepare_get_acl(&inode->i_default_acl);
+
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	dprintk("NFS reply getacl: %d\n", status);
 
@@ -89,12 +126,12 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
 	}
 
 	if (res.mask & NFS_ACL)
-		set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
+		nfs3_complete_get_acl(&inode->i_acl, res.acl_access);
 	else
 		forget_cached_acl(inode, ACL_TYPE_ACCESS);
 
 	if (res.mask & NFS_DFACL)
-		set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
+		nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default);
 	else
 		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
 
@@ -108,6 +145,8 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
 	}
 
 getout:
+	nfs3_abort_get_acl(&inode->i_acl);
+	nfs3_abort_get_acl(&inode->i_default_acl);
 	posix_acl_release(res.acl_access);
 	posix_acl_release(res.acl_default);
 	nfs_free_fattr(res.fattr);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 474e57f834e6..1eaa9100c889 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -54,6 +54,7 @@
 #include "uptodate.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "acl.h"
 
 #include "buffer_head_io.h"
 
@@ -3623,6 +3624,8 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 		filemap_fdatawait(mapping);
 	}
 
+	forget_all_cached_acls(inode);
+
 out:
 	return UNBLOCK_CONTINUE;
 }
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 711dd5170376..bc6736d60786 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -37,14 +37,18 @@ EXPORT_SYMBOL(acl_by_type);
 struct posix_acl *get_cached_acl(struct inode *inode, int type)
 {
 	struct posix_acl **p = acl_by_type(inode, type);
-	struct posix_acl *acl = ACCESS_ONCE(*p);
-	if (acl) {
-		spin_lock(&inode->i_lock);
-		acl = *p;
-		if (acl != ACL_NOT_CACHED)
-			acl = posix_acl_dup(acl);
-		spin_unlock(&inode->i_lock);
+	struct posix_acl *acl;
+
+	for (;;) {
+		rcu_read_lock();
+		acl = rcu_dereference(*p);
+		if (!acl || is_uncached_acl(acl) ||
+		    atomic_inc_not_zero(&acl->a_refcount))
+			break;
+		rcu_read_unlock();
+		cpu_relax();
 	}
+	rcu_read_unlock();
 	return acl;
 }
 EXPORT_SYMBOL(get_cached_acl);
@@ -59,58 +63,72 @@ void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
 	struct posix_acl **p = acl_by_type(inode, type);
 	struct posix_acl *old;
-	spin_lock(&inode->i_lock);
-	old = *p;
-	rcu_assign_pointer(*p, posix_acl_dup(acl));
-	spin_unlock(&inode->i_lock);
-	if (old != ACL_NOT_CACHED)
+
+	old = xchg(p, posix_acl_dup(acl));
+	if (!is_uncached_acl(old))
 		posix_acl_release(old);
 }
 EXPORT_SYMBOL(set_cached_acl);
 
-void forget_cached_acl(struct inode *inode, int type)
+static void __forget_cached_acl(struct posix_acl **p)
 {
-	struct posix_acl **p = acl_by_type(inode, type);
 	struct posix_acl *old;
-	spin_lock(&inode->i_lock);
-	old = *p;
-	*p = ACL_NOT_CACHED;
-	spin_unlock(&inode->i_lock);
-	if (old != ACL_NOT_CACHED)
+
+	old = xchg(p, ACL_NOT_CACHED);
+	if (!is_uncached_acl(old))
 		posix_acl_release(old);
 }
+
+void forget_cached_acl(struct inode *inode, int type)
+{
+	__forget_cached_acl(acl_by_type(inode, type));
+}
 EXPORT_SYMBOL(forget_cached_acl);
 
 void forget_all_cached_acls(struct inode *inode)
 {
-	struct posix_acl *old_access, *old_default;
-	spin_lock(&inode->i_lock);
-	old_access = inode->i_acl;
-	old_default = inode->i_default_acl;
-	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
-	spin_unlock(&inode->i_lock);
-	if (old_access != ACL_NOT_CACHED)
-		posix_acl_release(old_access);
-	if (old_default != ACL_NOT_CACHED)
-		posix_acl_release(old_default);
+	__forget_cached_acl(&inode->i_acl);
+	__forget_cached_acl(&inode->i_default_acl);
 }
 EXPORT_SYMBOL(forget_all_cached_acls);
 
 struct posix_acl *get_acl(struct inode *inode, int type)
 {
+	void *sentinel;
+	struct posix_acl **p;
 	struct posix_acl *acl;
 
+	/*
+	 * The sentinel is used to detect when another operation like
+	 * set_cached_acl() or forget_cached_acl() races with get_acl().
+	 * It is guaranteed that is_uncached_acl(sentinel) is true.
+	 */
+
 	acl = get_cached_acl(inode, type);
-	if (acl != ACL_NOT_CACHED)
+	if (!is_uncached_acl(acl))
 		return acl;
 
 	if (!IS_POSIXACL(inode))
 		return NULL;
 
+	sentinel = uncached_acl_sentinel(current);
+	p = acl_by_type(inode, type);
+
 	/*
-	 * A filesystem can force a ACL callback by just never filling the
-	 * ACL cache. But normally you'd fill the cache either at inode
-	 * instantiation time, or on the first ->get_acl call.
+	 * If the ACL isn't being read yet, set our sentinel.  Otherwise, the
+	 * current value of the ACL will not be ACL_NOT_CACHED and so our own
+	 * sentinel will not be set; another task will update the cache.  We
+	 * could wait for that other task to complete its job, but it's easier
+	 * to just call ->get_acl to fetch the ACL ourself.  (This is going to
+	 * be an unlikely race.)
+	 */
+	if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
+		/* fall through */ ;
+
+	/*
+	 * Normally, the ACL returned by ->get_acl will be cached.
+	 * A filesystem can prevent that by calling
+	 * forget_cached_acl(inode, type) in ->get_acl.
 	 *
 	 * If the filesystem doesn't have a get_acl() function at all, we'll
 	 * just create the negative cache entry.
@@ -119,7 +137,24 @@ struct posix_acl *get_acl(struct inode *inode, int type)
 		set_cached_acl(inode, type, NULL);
 		return NULL;
 	}
-	return inode->i_op->get_acl(inode, type);
+	acl = inode->i_op->get_acl(inode, type);
+
+	if (IS_ERR(acl)) {
+		/*
+		 * Remove our sentinel so that we don't block future attempts
+		 * to cache the ACL.
+		 */
+		cmpxchg(p, sentinel, ACL_NOT_CACHED);
+		return acl;
+	}
+
+	/*
+	 * Cache the result, but only if our sentinel is still in place.
+	 */
+	posix_acl_dup(acl);
+	if (unlikely(cmpxchg(p, sentinel, acl) != sentinel))
+		posix_acl_release(acl);
+	return acl;
 }
 EXPORT_SYMBOL(get_acl);
 
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index ec74bbedc873..dbed42f755e0 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -197,10 +197,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 
 	size = reiserfs_xattr_get(inode, name, NULL, 0);
 	if (size < 0) {
-		if (size == -ENODATA || size == -ENOSYS) {
-			set_cached_acl(inode, type, NULL);
+		if (size == -ENODATA || size == -ENOSYS)
 			return NULL;
-		}
 		return ERR_PTR(size);
 	}
 
@@ -220,8 +218,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 	} else {
 		acl = reiserfs_posix_acl_from_disk(value, retval);
 	}
-	if (!IS_ERR(acl))
-		set_cached_acl(inode, type, acl);
 
 	kfree(value);
 	return acl;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 2d5df1f23bbc..b6e527b8eccb 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -158,22 +158,14 @@ xfs_get_acl(struct inode *inode, int type)
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
-		 * cache entry, for any other error assume it is transient and
-		 * leave the cache entry as ACL_NOT_CACHED.
+		 * cache entry, for any other error assume it is transient.
 		 */
-		if (error == -ENOATTR)
-			goto out_update_cache;
-		acl = ERR_PTR(error);
-		goto out;
+		if (error != -ENOATTR)
+			acl = ERR_PTR(error);
+	} else  {
+		acl = xfs_acl_from_disk(xfs_acl, len,
+					XFS_ACL_MAX_ENTRIES(ip->i_mount));
 	}
-
-	acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
-	if (IS_ERR(acl))
-		goto out;
-
-out_update_cache:
-	set_cached_acl(inode, type, acl);
-out:
 	kmem_free(xfs_acl);
 	return acl;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..329ed372d708 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -577,6 +577,18 @@ static inline void mapping_allow_writable(struct address_space *mapping)
 struct posix_acl;
 #define ACL_NOT_CACHED ((void *)(-1))
 
+static inline struct posix_acl *
+uncached_acl_sentinel(struct task_struct *task)
+{
+	return (void *)task + 1;
+}
+
+static inline bool
+is_uncached_acl(struct posix_acl *acl)
+{
+	return (long)acl & 1;
+}
+
 #define IOP_FASTPERM	0x0001
 #define IOP_LOOKUP	0x0002
 #define IOP_NOFOLLOW	0x0004
-- 
cgit v1.2.3


From 04c57f4501909b60353031cfe5b991751d745658 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 24 Mar 2016 14:38:38 +0100
Subject: posix_acl: Unexport acl_by_type and make it static

acl_by_type(inode, type) returns a pointer to either inode->i_acl or
inode->i_default_acl depending on type.  This is useful in
fs/posix_acl.c, but should never have been visible outside that file.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/posix_acl.c            | 3 +--
 include/linux/posix_acl.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index bc6736d60786..db1fb0f5d9ff 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -21,7 +21,7 @@
 #include <linux/export.h>
 #include <linux/user_namespace.h>
 
-struct posix_acl **acl_by_type(struct inode *inode, int type)
+static struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
 	switch (type) {
 	case ACL_TYPE_ACCESS:
@@ -32,7 +32,6 @@ struct posix_acl **acl_by_type(struct inode *inode, int type)
 		BUG();
 	}
 }
-EXPORT_SYMBOL(acl_by_type);
 
 struct posix_acl *get_cached_acl(struct inode *inode, int type)
 {
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 3e96a6a76103..5b5a80cc5926 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -99,7 +99,6 @@ extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
 extern int simple_set_acl(struct inode *, struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
 
-struct posix_acl **acl_by_type(struct inode *inode, int type);
 struct posix_acl *get_cached_acl(struct inode *inode, int type);
 struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
 void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
-- 
cgit v1.2.3


From 1ce15ef4f60529cf1313f80f4338c88bd65cc572 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Tue, 22 Mar 2016 20:03:16 -0400
Subject: module: preserve Elf information for livepatch modules

For livepatch modules, copy Elf section, symbol, and string information
from the load_info struct in the module loader. Persist copies of the
original symbol table and string table.

Livepatch manages its own relocation sections in order to reuse module
loader code to write relocations. Livepatch modules must preserve Elf
information such as section indices in order to apply livepatch relocation
sections using the module loader's apply_relocate_add() function.

In order to apply livepatch relocation sections, livepatch modules must
keep a complete copy of their original symbol table in memory. Normally, a
stripped down copy of a module's symbol table (containing only "core"
symbols) is made available through module->core_symtab. But for livepatch
modules, the symbol table copied into memory on module load must be exactly
the same as the symbol table produced when the patch module was compiled.
This is because the relocations in each livepatch relocation section refer
to their respective symbols with their symbol indices, and the original
symbol indices (and thus the symtab ordering) must be preserved in order
for apply_relocate_add() to find the right symbol.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/module.h |  25 ++++++++++
 kernel/module.c        | 125 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 147 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 2bb0c3085706..3daf2b3a09d2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -330,6 +330,15 @@ struct mod_kallsyms {
 	char *strtab;
 };
 
+#ifdef CONFIG_LIVEPATCH
+struct klp_modinfo {
+	Elf_Ehdr hdr;
+	Elf_Shdr *sechdrs;
+	char *secstrings;
+	unsigned int symndx;
+};
+#endif
+
 struct module {
 	enum module_state state;
 
@@ -456,7 +465,11 @@ struct module {
 #endif
 
 #ifdef CONFIG_LIVEPATCH
+	bool klp; /* Is this a livepatch module? */
 	bool klp_alive;
+
+	/* Elf information */
+	struct klp_modinfo *klp_info;
 #endif
 
 #ifdef CONFIG_MODULE_UNLOAD
@@ -630,6 +643,18 @@ static inline bool module_requested_async_probing(struct module *module)
 	return module && module->async_probe_requested;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static inline bool is_livepatch_module(struct module *mod)
+{
+	return mod->klp;
+}
+#else /* !CONFIG_LIVEPATCH */
+static inline bool is_livepatch_module(struct module *mod)
+{
+	return false;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 #else /* !CONFIG_MODULES... */
 
 /* Given an address, look for it in the exception tables. */
diff --git a/kernel/module.c b/kernel/module.c
index 041200ca4a2d..5f71aa63ed2a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1973,6 +1973,83 @@ static void module_enable_nx(const struct module *mod) { }
 static void module_disable_nx(const struct module *mod) { }
 #endif
 
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	unsigned int size, symndx;
+	int ret;
+
+	size = sizeof(*mod->klp_info);
+	mod->klp_info = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info == NULL)
+		return -ENOMEM;
+
+	/* Elf header */
+	size = sizeof(mod->klp_info->hdr);
+	memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+	/* Elf section header table */
+	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+	mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info->sechdrs == NULL) {
+		ret = -ENOMEM;
+		goto free_info;
+	}
+	memcpy(mod->klp_info->sechdrs, info->sechdrs, size);
+
+	/* Elf section name string table */
+	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+	mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info->secstrings == NULL) {
+		ret = -ENOMEM;
+		goto free_sechdrs;
+	}
+	memcpy(mod->klp_info->secstrings, info->secstrings, size);
+
+	/* Elf symbol section index */
+	symndx = info->index.sym;
+	mod->klp_info->symndx = symndx;
+
+	/*
+	 * For livepatch modules, core_kallsyms.symtab is a complete
+	 * copy of the original symbol table. Adjust sh_addr to point
+	 * to core_kallsyms.symtab since the copy of the symtab in module
+	 * init memory is freed at the end of do_init_module().
+	 */
+	mod->klp_info->sechdrs[symndx].sh_addr = \
+		(unsigned long) mod->core_kallsyms.symtab;
+
+	return 0;
+
+free_sechdrs:
+	kfree(mod->klp_info->sechdrs);
+free_info:
+	kfree(mod->klp_info);
+	return ret;
+}
+
+static void free_module_elf(struct module *mod)
+{
+	kfree(mod->klp_info->sechdrs);
+	kfree(mod->klp_info->secstrings);
+	kfree(mod->klp_info);
+}
+#else /* !CONFIG_LIVEPATCH */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	return 0;
+}
+
+static void free_module_elf(struct module *mod)
+{
+}
+#endif /* CONFIG_LIVEPATCH */
+
 void __weak module_memfree(void *module_region)
 {
 	vfree(module_region);
@@ -2011,6 +2088,9 @@ static void free_module(struct module *mod)
 	/* Free any allocated parameters. */
 	destroy_params(mod->kp, mod->num_kp);
 
+	if (is_livepatch_module(mod))
+		free_module_elf(mod);
+
 	/* Now we can delete it from the lists */
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
@@ -2126,6 +2206,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 			       (long)sym[i].st_value);
 			break;
 
+		case SHN_LIVEPATCH:
+			/* Livepatch symbols are resolved by livepatch */
+			break;
+
 		case SHN_UNDEF:
 			ksym = resolve_symbol_wait(mod, info, name);
 			/* Ok if resolved.  */
@@ -2174,6 +2258,10 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
 		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
 			continue;
 
+		/* Livepatch relocation sections are applied by livepatch */
+		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+			continue;
+
 		if (info->sechdrs[i].sh_type == SHT_REL)
 			err = apply_relocate(info->sechdrs, info->strtab,
 					     info->index.sym, i, mod);
@@ -2469,7 +2557,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 
 	/* Compute total space required for the core symbols' strtab. */
 	for (ndst = i = 0; i < nsrc; i++) {
-		if (i == 0 ||
+		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
@@ -2528,7 +2616,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
 	src = mod->kallsyms->symtab;
 	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
-		if (i == 0 ||
+		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			dst[ndst] = src[i];
@@ -2667,6 +2755,26 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
 	return 0;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+{
+	mod->klp = get_modinfo(info, "livepatch") ? true : false;
+
+	return 0;
+}
+#else /* !CONFIG_LIVEPATCH */
+static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+{
+	if (get_modinfo(info, "livepatch")) {
+		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+		       mod->name);
+		return -ENOEXEC;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 /* Sets info->hdr and info->len. */
 static int copy_module_from_user(const void __user *umod, unsigned long len,
 				  struct load_info *info)
@@ -2821,6 +2929,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 			"is unknown, you have been warned.\n", mod->name);
 	}
 
+	err = find_livepatch_modinfo(mod, info);
+	if (err)
+		return err;
+
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(info, "license"));
 
@@ -3494,6 +3606,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	if (err < 0)
 		goto coming_cleanup;
 
+	if (is_livepatch_module(mod)) {
+		err = copy_module_elf(mod, info);
+		if (err < 0)
+			goto sysfs_cleanup;
+	}
+
 	/* Get rid of temporary copy. */
 	free_copy(info);
 
@@ -3502,11 +3620,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	return do_init_module(mod);
 
+ sysfs_cleanup:
+	mod_sysfs_teardown(mod);
  coming_cleanup:
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
 	klp_module_going(mod);
-
  bug_cleanup:
 	/* module_bug_cleanup needs module_mutex protection */
 	mutex_lock(&module_mutex);
-- 
cgit v1.2.3


From 425595a7fc2096ab46c741b5ed5372c5ab5bbeac Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Tue, 22 Mar 2016 20:03:18 -0400
Subject: livepatch: reuse module loader code to write relocations

Reuse module loader code to write relocations, thereby eliminating the need
for architecture specific relocation code in livepatch. Specifically, reuse
the apply_relocate_add() function in the module loader to write relocations
instead of duplicating functionality in livepatch's arch-dependent
klp_write_module_reloc() function.

In order to accomplish this, livepatch modules manage their own relocation
sections (marked with the SHF_RELA_LIVEPATCH section flag) and
livepatch-specific symbols (marked with SHN_LIVEPATCH symbol section
index). To apply livepatch relocation sections, livepatch symbols
referenced by relocs are resolved and then apply_relocate_add() is called
to apply those relocations.

In addition, remove x86 livepatch relocation code and the s390
klp_write_module_reloc() function stub. They are no longer needed since
relocation work has been offloaded to module loader.

Lastly, mark the module as a livepatch module so that the module loader
canappropriately identify and initialize it.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>   # for s390 changes
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/s390/include/asm/livepatch.h    |   7 --
 arch/x86/include/asm/livepatch.h     |   2 -
 arch/x86/kernel/Makefile             |   1 -
 arch/x86/kernel/livepatch.c          |  70 -----------------
 include/linux/livepatch.h            |  20 -----
 kernel/livepatch/core.c              | 148 ++++++++++++++++++++++-------------
 samples/livepatch/livepatch-sample.c |   1 +
 7 files changed, 95 insertions(+), 154 deletions(-)
 delete mode 100644 arch/x86/kernel/livepatch.c

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h
index d5427c78b1b3..2c1213785892 100644
--- a/arch/s390/include/asm/livepatch.h
+++ b/arch/s390/include/asm/livepatch.h
@@ -24,13 +24,6 @@ static inline int klp_check_compiler_support(void)
 	return 0;
 }
 
-static inline int klp_write_module_reloc(struct module *mod, unsigned long
-		type, unsigned long loc, unsigned long value)
-{
-	/* not supported yet */
-	return -ENOSYS;
-}
-
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
 	regs->psw.addr = ip;
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 7e68f9558552..a7f9181f63f3 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,6 @@ static inline int klp_check_compiler_support(void)
 #endif
 	return 0;
 }
-int klp_write_module_reloc(struct module *mod, unsigned long type,
-			   unsigned long loc, unsigned long value);
 
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ffe01d0..c5e9a5cf976b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -67,7 +67,6 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
deleted file mode 100644
index 92fc1a51f994..000000000000
--- a/arch/x86/kernel/livepatch.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * livepatch.c - x86-specific Kernel Live Patching Core
- *
- * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
- * Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/elf.h>
-#include <asm/livepatch.h>
-
-/**
- * klp_write_module_reloc() - write a relocation in a module
- * @mod:	module in which the section to be modified is found
- * @type:	ELF relocation type (see asm/elf.h)
- * @loc:	address that the relocation should be written to
- * @value:	relocation value (sym address + addend)
- *
- * This function writes a relocation to the specified location for
- * a particular module.
- */
-int klp_write_module_reloc(struct module *mod, unsigned long type,
-			   unsigned long loc, unsigned long value)
-{
-	size_t size = 4;
-	unsigned long val;
-	unsigned long core = (unsigned long)mod->core_layout.base;
-	unsigned long core_size = mod->core_layout.size;
-
-	switch (type) {
-	case R_X86_64_NONE:
-		return 0;
-	case R_X86_64_64:
-		val = value;
-		size = 8;
-		break;
-	case R_X86_64_32:
-		val = (u32)value;
-		break;
-	case R_X86_64_32S:
-		val = (s32)value;
-		break;
-	case R_X86_64_PC32:
-		val = (u32)(value - loc);
-		break;
-	default:
-		/* unsupported relocation type */
-		return -EINVAL;
-	}
-
-	if (loc < core || loc >= core + core_size)
-		/* loc does not point to any symbol inside the module */
-		return -EINVAL;
-
-	return probe_kernel_write((void *)loc, &val, size);
-}
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index bd830d590465..0933ca47791c 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -64,28 +64,9 @@ struct klp_func {
 	struct list_head stack_node;
 };
 
-/**
- * struct klp_reloc - relocation structure for live patching
- * @loc:	address where the relocation will be written
- * @sympos:	position in kallsyms to disambiguate symbols (optional)
- * @type:	ELF relocation type
- * @name:	name of the referenced symbol (for lookup/verification)
- * @addend:	offset from the referenced symbol
- * @external:	symbol is either exported or within the live patch module itself
- */
-struct klp_reloc {
-	unsigned long loc;
-	unsigned long sympos;
-	unsigned long type;
-	const char *name;
-	int addend;
-	int external;
-};
-
 /**
  * struct klp_object - kernel object structure for live patching
  * @name:	module name (or NULL for vmlinux)
- * @relocs:	relocation entries to be applied at load time
  * @funcs:	function entries for functions to be patched in the object
  * @kobj:	kobject for sysfs resources
  * @mod:	kernel module associated with the patched object
@@ -95,7 +76,6 @@ struct klp_reloc {
 struct klp_object {
 	/* external */
 	const char *name;
-	struct klp_reloc *relocs;
 	struct klp_func *funcs;
 
 	/* internal */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index d68fbf63b083..eb5db6e837aa 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,8 @@
 #include <linux/list.h>
 #include <linux/kallsyms.h>
 #include <linux/livepatch.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
 #include <asm/cacheflush.h>
 
 /**
@@ -204,75 +206,109 @@ static int klp_find_object_symbol(const char *objname, const char *name,
 	return -EINVAL;
 }
 
-/*
- * external symbols are located outside the parent object (where the parent
- * object is either vmlinux or the kmod being patched).
- */
-static int klp_find_external_symbol(struct module *pmod, const char *name,
-				    unsigned long *addr)
+static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
 {
-	const struct kernel_symbol *sym;
-
-	/* first, check if it's an exported symbol */
-	preempt_disable();
-	sym = find_symbol(name, NULL, NULL, true, true);
-	if (sym) {
-		*addr = sym->value;
-		preempt_enable();
-		return 0;
-	}
-	preempt_enable();
+	int i, cnt, vmlinux, ret;
+	char objname[MODULE_NAME_LEN];
+	char symname[KSYM_NAME_LEN];
+	char *strtab = pmod->core_kallsyms.strtab;
+	Elf_Rela *relas;
+	Elf_Sym *sym;
+	unsigned long sympos, addr;
 
 	/*
-	 * Check if it's in another .o within the patch module. This also
-	 * checks that the external symbol is unique.
+	 * Since the field widths for objname and symname in the sscanf()
+	 * call are hard-coded and correspond to MODULE_NAME_LEN and
+	 * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN
+	 * and KSYM_NAME_LEN have the values we expect them to have.
+	 *
+	 * Because the value of MODULE_NAME_LEN can differ among architectures,
+	 * we use the smallest/strictest upper bound possible (56, based on
+	 * the current definition of MODULE_NAME_LEN) to prevent overflows.
 	 */
-	return klp_find_object_symbol(pmod->name, name, 0, addr);
+	BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+	relas = (Elf_Rela *) relasec->sh_addr;
+	/* For each rela in this klp relocation section */
+	for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
+		sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
+		if (sym->st_shndx != SHN_LIVEPATCH) {
+			pr_err("symbol %s is not marked as a livepatch symbol",
+			       strtab + sym->st_name);
+			return -EINVAL;
+		}
+
+		/* Format: .klp.sym.objname.symname,sympos */
+		cnt = sscanf(strtab + sym->st_name,
+			     ".klp.sym.%55[^.].%127[^,],%lu",
+			     objname, symname, &sympos);
+		if (cnt != 3) {
+			pr_err("symbol %s has an incorrectly formatted name",
+			       strtab + sym->st_name);
+			return -EINVAL;
+		}
+
+		/* klp_find_object_symbol() treats a NULL objname as vmlinux */
+		vmlinux = !strcmp(objname, "vmlinux");
+		ret = klp_find_object_symbol(vmlinux ? NULL : objname,
+					     symname, sympos, &addr);
+		if (ret)
+			return ret;
+
+		sym->st_value = addr;
+	}
+
+	return 0;
 }
 
 static int klp_write_object_relocations(struct module *pmod,
 					struct klp_object *obj)
 {
-	int ret = 0;
-	unsigned long val;
-	struct klp_reloc *reloc;
+	int i, cnt, ret = 0;
+	const char *objname, *secname;
+	char sec_objname[MODULE_NAME_LEN];
+	Elf_Shdr *sec;
 
 	if (WARN_ON(!klp_is_object_loaded(obj)))
 		return -EINVAL;
 
-	if (WARN_ON(!obj->relocs))
-		return -EINVAL;
+	objname = klp_is_module(obj) ? obj->name : "vmlinux";
 
 	module_disable_ro(pmod);
+	/* For each klp relocation section */
+	for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
+		sec = pmod->klp_info->sechdrs + i;
+		secname = pmod->klp_info->secstrings + sec->sh_name;
+		if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
+			continue;
 
-	for (reloc = obj->relocs; reloc->name; reloc++) {
-		/* discover the address of the referenced symbol */
-		if (reloc->external) {
-			if (reloc->sympos > 0) {
-				pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
-				       reloc->name);
-				ret = -EINVAL;
-				goto out;
-			}
-			ret = klp_find_external_symbol(pmod, reloc->name, &val);
-		} else
-			ret = klp_find_object_symbol(obj->name,
-						     reloc->name,
-						     reloc->sympos,
-						     &val);
+		/*
+		 * Format: .klp.rela.sec_objname.section_name
+		 * See comment in klp_resolve_symbols() for an explanation
+		 * of the selected field width value.
+		 */
+		cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
+		if (cnt != 1) {
+			pr_err("section %s has an incorrectly formatted name",
+			       secname);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (strcmp(objname, sec_objname))
+			continue;
+
+		ret = klp_resolve_symbols(sec, pmod);
 		if (ret)
-			goto out;
+			break;
 
-		ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
-					     val + reloc->addend);
-		if (ret) {
-			pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
-			       reloc->name, val, ret);
-			goto out;
-		}
+		ret = apply_relocate_add(pmod->klp_info->sechdrs,
+					 pmod->core_kallsyms.strtab,
+					 pmod->klp_info->symndx, i, pmod);
+		if (ret)
+			break;
 	}
 
-out:
 	module_enable_ro(pmod);
 	return ret;
 }
@@ -703,11 +739,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 	struct klp_func *func;
 	int ret;
 
-	if (obj->relocs) {
-		ret = klp_write_object_relocations(patch->mod, obj);
-		if (ret)
-			return ret;
-	}
+	ret = klp_write_object_relocations(patch->mod, obj);
+	if (ret)
+		return ret;
 
 	klp_for_each_func(obj, func) {
 		ret = klp_find_object_symbol(obj->name, func->old_name,
@@ -842,6 +876,12 @@ int klp_register_patch(struct klp_patch *patch)
 {
 	int ret;
 
+	if (!is_livepatch_module(patch->mod)) {
+		pr_err("module %s is not marked as a livepatch module",
+		       patch->mod->name);
+		return -EINVAL;
+	}
+
 	if (!klp_initialized())
 		return -ENODEV;
 
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
index fb8c8614e728..e34f871e69b1 100644
--- a/samples/livepatch/livepatch-sample.c
+++ b/samples/livepatch/livepatch-sample.c
@@ -89,3 +89,4 @@ static void livepatch_exit(void)
 module_init(livepatch_init);
 module_exit(livepatch_exit);
 MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
-- 
cgit v1.2.3


From ee2ae1ed46251dcbdcc2c59b5e30f664ddfbacb1 Mon Sep 17 00:00:00 2001
From: Alexandre TORGUE <alexandre.torgue@st.com>
Date: Fri, 1 Apr 2016 11:37:33 +0200
Subject: stmmac: add new DT platform entries for GMAC4

This is to support the snps,dwmac-4.00 and snps,dwmac-4.10a
and related features on the platform driver.
See binding doc for further details.

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 7 +++++++
 include/linux/stmmac.h                                | 2 ++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 6605d19601c2..4d302db657c0 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -59,6 +59,8 @@ Optional properties:
 	- snps,fb: fixed-burst
 	- snps,mb: mixed-burst
 	- snps,rb: rebuild INCRx Burst
+	- snps,tso: this enables the TSO feature otherwise it will be managed by
+	    MAC HW capability register.
 - mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
 
 Examples:
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index cf37ea558ecc..effaa4ff5ab7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -284,6 +284,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		plat->pmt = 1;
 	}
 
+	if (of_device_is_compatible(np, "snps,dwmac-4.00") ||
+	    of_device_is_compatible(np, "snps,dwmac-4.10a")) {
+		plat->has_gmac4 = 1;
+		plat->pmt = 1;
+		plat->tso_en = of_property_read_bool(np, "snps,tso");
+	}
+
 	if (of_device_is_compatible(np, "snps,dwmac-3.610") ||
 		of_device_is_compatible(np, "snps,dwmac-3.710")) {
 		plat->enh_desc = 1;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e6bc30a42a74..ffdaca9c01af 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -137,5 +137,7 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 	struct stmmac_axi *axi;
+	int has_gmac4;
+	bool tso_en;
 };
 #endif
-- 
cgit v1.2.3


From 8cb359e3a1f6318f971bec281623613f48b711be Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Wed, 23 Mar 2016 12:34:41 +0000
Subject: iio: buffer: add missing descriptions in iio_buffer_access_funcs

The members buffer_group and attrs of iio_buffer_access_funcs have no
descriptions for the documentation. Adding them.

Fixes: 08e7e0adaa17 ("iio: buffer: Allocate standard attributes in the core")
Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/buffer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 2ec3ad58e8a0..70a5164f4728 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -83,10 +83,12 @@ struct iio_buffer_access_funcs {
  * @access:		[DRIVER] buffer access functions associated with the
  *			implementation.
  * @scan_el_dev_attr_list:[INTERN] list of scan element related attributes.
+ * @buffer_group:	[INTERN] attributes of the buffer group
  * @scan_el_group:	[DRIVER] attribute group for those attributes not
  *			created from the iio_chan_info array.
  * @pollq:		[INTERN] wait queue to allow for polling on the buffer.
  * @stufftoread:	[INTERN] flag to indicate new data.
+ * @attrs:		[INTERN] standard attributes of the buffer
  * @demux_list:		[INTERN] list of operations required to demux the scan.
  * @demux_bounce:	[INTERN] buffer for doing gather from incoming scan.
  * @buffer_list:	[INTERN] entry in the devices list of current buffers.
-- 
cgit v1.2.3


From fddcca5107051adf9e4481d2a79ae0616577fd2c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 29 Feb 2016 13:20:28 +0100
Subject: mtd: avoid stack overflow in MTD CFI code

When map_word gets too large, we use a lot of kernel stack, and for
MTD_MAP_BANK_WIDTH_32, this means we use more than the recommended
1024 bytes in a number of functions:

drivers/mtd/chips/cfi_cmdset_0020.c: In function 'cfi_staa_write_buffers':
drivers/mtd/chips/cfi_cmdset_0020.c:651:1: warning: the frame size of 1336 bytes is larger than 1024 bytes [-Wframe-larger-than=]
drivers/mtd/chips/cfi_cmdset_0020.c: In function 'cfi_staa_erase_varsize':
drivers/mtd/chips/cfi_cmdset_0020.c:972:1: warning: the frame size of 1208 bytes is larger than 1024 bytes [-Wframe-larger-than=]
drivers/mtd/chips/cfi_cmdset_0001.c: In function 'do_write_buffer':
drivers/mtd/chips/cfi_cmdset_0001.c:1835:1: warning: the frame size of 1240 bytes is larger than 1024 bytes [-Wframe-larger-than=]

This can be avoided if all operations on the map word are done
indirectly and the stack gets reused between the calls. We can
mostly achieve this by selecting MTD_COMPLEX_MAPPINGS whenever
MTD_MAP_BANK_WIDTH_32 is set, but for the case that no other
bank width is enabled, we also need to use a non-constant
map_bankwidth() to convince the compiler to use less stack.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[Brian: this patch mostly achieves its goal by forcing
    MTD_COMPLEX_MAPPINGS (and the accompanying indirection) for 256-bit
    mappings; the rest of the change is mostly a wash, though it helps
    reduce stack size slightly. If we really care about supporting
    256-bit mappings though, we should consider rewriting some of this
    code to avoid keeping and assigning so many 256-bit objects on the
    stack.]
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/chips/Kconfig |  1 +
 include/linux/mtd/map.h   | 19 +++++++------------
 2 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig
index 3b3dabce58de..bbfa1f129266 100644
--- a/drivers/mtd/chips/Kconfig
+++ b/drivers/mtd/chips/Kconfig
@@ -115,6 +115,7 @@ config MTD_MAP_BANK_WIDTH_16
 
 config MTD_MAP_BANK_WIDTH_32
 	bool "Support 256-bit buswidth" if MTD_CFI_GEOMETRY
+	select MTD_COMPLEX_MAPPINGS if HAS_IOMEM
 	default n
 	help
 	  If you wish to support CFI devices on a physical bus which is
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 5e0eb7ccabd4..3aa56e3104bb 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -122,18 +122,13 @@
 #endif
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_32
-# ifdef map_bankwidth
-#  undef map_bankwidth
-#  define map_bankwidth(map) ((map)->bankwidth)
-#  undef map_bankwidth_is_large
-#  define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
-#  undef map_words
-#  define map_words(map) map_calc_words(map)
-# else
-#  define map_bankwidth(map) 32
-#  define map_bankwidth_is_large(map) (1)
-#  define map_words(map) map_calc_words(map)
-# endif
+/* always use indirect access for 256-bit to preserve kernel stack */
+# undef map_bankwidth
+# define map_bankwidth(map) ((map)->bankwidth)
+# undef map_bankwidth_is_large
+# define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
+# undef map_words
+# define map_words(map) map_calc_words(map)
 #define map_bankwidth_is_32(map) (map_bankwidth(map) == 32)
 #undef MAX_MAP_BANKWIDTH
 #define MAX_MAP_BANKWIDTH 32
-- 
cgit v1.2.3


From 5651d6aaf489d1db48c253cf884b40214e91c2c5 Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 26 Feb 2016 11:50:28 +0100
Subject: mtd: bcm47xxsflash: use ioremap_cache() instead of KSEG0ADDR()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using KSEG0ADDR makes code highly MIPS dependent and not portable.
Thanks to the fix a68f376 ("MIPS: io.h: Define `ioremap_cache'") we can
use ioremap_cache which is generic and supported on MIPS as well now.

KSEG0ADDR was translating 0x1c000000 into 0x9c000000. With ioremap_cache
we use MIPS's __ioremap (and then remap_area_pages). This results in
different address (e.g. 0xc0080000) but it still should be cached as
expected and it was successfully tested with BCM47186B0.

Other than that drivers/bcma/driver_chipcommon_sflash.c nicely setups a
struct resource for access window, but we wren't using it. Use it now
and drop duplicated info.

Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
---
 drivers/bcma/driver_chipcommon_sflash.c     |  1 -
 drivers/mtd/devices/bcm47xxsflash.c         | 29 ++++++++++++++++++++++++-----
 drivers/mtd/devices/bcm47xxsflash.h         |  3 ++-
 include/linux/bcma/bcma_driver_chipcommon.h |  1 -
 4 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/driver_chipcommon_sflash.c b/drivers/bcma/driver_chipcommon_sflash.c
index 04d706ca5f43..35b13a08ca3e 100644
--- a/drivers/bcma/driver_chipcommon_sflash.c
+++ b/drivers/bcma/driver_chipcommon_sflash.c
@@ -146,7 +146,6 @@ int bcma_sflash_init(struct bcma_drv_cc *cc)
 		return -ENOTSUPP;
 	}
 
-	sflash->window = BCMA_SOC_FLASH2;
 	sflash->blocksize = e->blocksize;
 	sflash->numblocks = e->numblocks;
 	sflash->size = sflash->blocksize * sflash->numblocks;
diff --git a/drivers/mtd/devices/bcm47xxsflash.c b/drivers/mtd/devices/bcm47xxsflash.c
index 347bb83db864..1c65c15b31a1 100644
--- a/drivers/mtd/devices/bcm47xxsflash.c
+++ b/drivers/mtd/devices/bcm47xxsflash.c
@@ -2,6 +2,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/ioport.h>
 #include <linux/mtd/mtd.h>
 #include <linux/platform_device.h>
 #include <linux/bcma/bcma.h>
@@ -109,8 +110,7 @@ static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len,
 	if ((from + len) > mtd->size)
 		return -EINVAL;
 
-	memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(b47s->window + from),
-		      len);
+	memcpy_fromio(buf, b47s->window + from, len);
 	*retlen = len;
 
 	return len;
@@ -275,15 +275,33 @@ static void bcm47xxsflash_bcma_cc_write(struct bcm47xxsflash *b47s, u16 offset,
 
 static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 {
-	struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
+	struct device *dev = &pdev->dev;
+	struct bcma_sflash *sflash = dev_get_platdata(dev);
 	struct bcm47xxsflash *b47s;
+	struct resource *res;
 	int err;
 
-	b47s = devm_kzalloc(&pdev->dev, sizeof(*b47s), GFP_KERNEL);
+	b47s = devm_kzalloc(dev, sizeof(*b47s), GFP_KERNEL);
 	if (!b47s)
 		return -ENOMEM;
 	sflash->priv = b47s;
 
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "invalid resource\n");
+		return -EINVAL;
+	}
+	if (!devm_request_mem_region(dev, res->start, resource_size(res),
+				     res->name)) {
+		dev_err(dev, "can't request region for resource %pR\n", res);
+		return -EBUSY;
+	}
+	b47s->window = ioremap_cache(res->start, resource_size(res));
+	if (!b47s->window) {
+		dev_err(dev, "ioremap failed for resource %pR\n", res);
+		return -ENOMEM;
+	}
+
 	b47s->bcma_cc = container_of(sflash, struct bcma_drv_cc, sflash);
 	b47s->cc_read = bcm47xxsflash_bcma_cc_read;
 	b47s->cc_write = bcm47xxsflash_bcma_cc_write;
@@ -297,7 +315,6 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 		break;
 	}
 
-	b47s->window = sflash->window;
 	b47s->blocksize = sflash->blocksize;
 	b47s->numblocks = sflash->numblocks;
 	b47s->size = sflash->size;
@@ -306,6 +323,7 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 	err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0);
 	if (err) {
 		pr_err("Failed to register MTD device: %d\n", err);
+		iounmap(b47s->window);
 		return err;
 	}
 
@@ -321,6 +339,7 @@ static int bcm47xxsflash_bcma_remove(struct platform_device *pdev)
 	struct bcm47xxsflash *b47s = sflash->priv;
 
 	mtd_device_unregister(&b47s->mtd);
+	iounmap(b47s->window);
 
 	return 0;
 }
diff --git a/drivers/mtd/devices/bcm47xxsflash.h b/drivers/mtd/devices/bcm47xxsflash.h
index fe93daf4f489..1564b62b412e 100644
--- a/drivers/mtd/devices/bcm47xxsflash.h
+++ b/drivers/mtd/devices/bcm47xxsflash.h
@@ -65,7 +65,8 @@ struct bcm47xxsflash {
 
 	enum bcm47xxsflash_type type;
 
-	u32 window;
+	void __iomem *window;
+
 	u32 blocksize;
 	u16 numblocks;
 	u32 size;
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 846513c73606..a5ac2cad5cb7 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -587,7 +587,6 @@ struct mtd_info;
 
 struct bcma_sflash {
 	bool present;
-	u32 window;
 	u32 blocksize;
 	u16 numblocks;
 	u32 size;
-- 
cgit v1.2.3


From 5b01e4b9efa0b78672cbbea830c9fbcc7f239e29 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 4 Apr 2016 11:43:54 +0200
Subject: libata: Implement NCQ autosense

Some newer devices support NCQ autosense (cf ACS-4), so we should
be using it to retrieve the sense code and speed up recovery.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-eh.c   | 12 ++++++++++++
 drivers/ata/libata-scsi.c |  7 ++++++-
 drivers/ata/libata.h      |  1 +
 include/linux/ata.h       |  2 ++
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 961acc788f44..8c8355f0792e 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1600,6 +1600,8 @@ static int ata_eh_read_log_10h(struct ata_device *dev,
 	tf->hob_lbah = buf[10];
 	tf->nsect = buf[12];
 	tf->hob_nsect = buf[13];
+	if (ata_id_has_ncq_autosense(dev->id))
+		tf->auxiliary = buf[14] << 16 | buf[15] << 8 | buf[16];
 
 	return 0;
 }
@@ -1797,6 +1799,16 @@ void ata_eh_analyze_ncq_error(struct ata_link *link)
 	memcpy(&qc->result_tf, &tf, sizeof(tf));
 	qc->result_tf.flags = ATA_TFLAG_ISADDR | ATA_TFLAG_LBA | ATA_TFLAG_LBA48;
 	qc->err_mask |= AC_ERR_DEV | AC_ERR_NCQ;
+	if (qc->result_tf.auxiliary) {
+		char sense_key, asc, ascq;
+
+		sense_key = (qc->result_tf.auxiliary >> 16) & 0xff;
+		asc = (qc->result_tf.auxiliary >> 8) & 0xff;
+		ascq = qc->result_tf.auxiliary & 0xff;
+		ata_scsi_set_sense(qc->scsicmd, sense_key, asc, ascq);
+		qc->flags |= ATA_QCFLAG_SENSE_VALID;
+	}
+
 	ehc->i.err_mask &= ~AC_ERR_DEV;
 }
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 567859ce0512..6dc2fadfd7c5 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -270,8 +270,11 @@ DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR,
 	    ata_scsi_park_show, ata_scsi_park_store);
 EXPORT_SYMBOL_GPL(dev_attr_unload_heads);
 
-static void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq)
+void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq)
 {
+	if (!cmd)
+		return;
+
 	cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
 
 	scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq);
@@ -1784,6 +1787,8 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 	if (((cdb[0] == ATA_16) || (cdb[0] == ATA_12)) &&
 	    ((cdb[2] & 0x20) || need_sense))
 		ata_gen_passthru_sense(qc);
+	else if (qc->flags & ATA_QCFLAG_SENSE_VALID)
+		cmd->result = SAM_STAT_CHECK_CONDITION;
 	else if (need_sense)
 		ata_gen_ata_sense(qc);
 	else
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index f840ca18a7c0..8cfdd9616d16 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -137,6 +137,7 @@ extern int ata_scsi_add_hosts(struct ata_host *host,
 			      struct scsi_host_template *sht);
 extern void ata_scsi_scan_host(struct ata_port *ap, int sync);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
+extern void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq);
 extern void ata_scsi_media_change_notify(struct ata_device *dev);
 extern void ata_scsi_hotplug(struct work_struct *work);
 extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index c1a2f345cbe6..e797e1b53006 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -528,6 +528,8 @@ struct ata_bmdma_prd {
 #define ata_id_cdb_intr(id)	(((id)[ATA_ID_CONFIG] & 0x60) == 0x20)
 #define ata_id_has_da(id)	((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4))
 #define ata_id_has_devslp(id)	((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))
+#define ata_id_has_ncq_autosense(id) \
+				((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))
 
 static inline bool ata_id_has_hipm(const u16 *id)
 {
-- 
cgit v1.2.3


From e87fd28cf9a2d9018ac4b6dd92f0b417714bc18d Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 4 Apr 2016 11:43:55 +0200
Subject: libata: Implement support for sense data reporting

ACS-4 defines a sense data reporting feature set.
This patch implements support for it.

tj: Cosmetic formatting updates.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 20 +++++++++++++-
 drivers/ata/libata-eh.c   | 68 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/ata.h       | 16 +++++++++++
 3 files changed, 99 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 55e257c268dd..f991f786227e 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2148,6 +2148,24 @@ static int ata_dev_config_ncq(struct ata_device *dev,
 	return 0;
 }
 
+static void ata_dev_config_sense_reporting(struct ata_device *dev)
+{
+	unsigned int err_mask;
+
+	if (!ata_id_has_sense_reporting(dev->id))
+		return;
+
+	if (ata_id_sense_reporting_enabled(dev->id))
+		return;
+
+	err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1);
+	if (err_mask) {
+		ata_dev_dbg(dev,
+			    "failed to enable Sense Data Reporting, Emask 0x%x\n",
+			    err_mask);
+	}
+}
+
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
  *	@dev: Target device to configure
@@ -2370,7 +2388,7 @@ int ata_dev_configure(struct ata_device *dev)
 					dev->devslp_timing[i] = sata_setting[j];
 				}
 		}
-
+		ata_dev_config_sense_reporting(dev);
 		dev->cdb_len = 16;
 	}
 
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 8c8355f0792e..170e891e79af 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1637,6 +1637,56 @@ unsigned int atapi_eh_tur(struct ata_device *dev, u8 *r_sense_key)
 	return err_mask;
 }
 
+/**
+ *	ata_eh_request_sense - perform REQUEST_SENSE_DATA_EXT
+ *	@dev: device to perform REQUEST_SENSE_SENSE_DATA_EXT to
+ *	@cmd: scsi command for which the sense code should be set
+ *
+ *	Perform REQUEST_SENSE_DATA_EXT after the device reported CHECK
+ *	SENSE.  This function is an EH helper.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+static void ata_eh_request_sense(struct ata_queued_cmd *qc,
+				 struct scsi_cmnd *cmd)
+{
+	struct ata_device *dev = qc->dev;
+	struct ata_taskfile tf;
+	unsigned int err_mask;
+
+	if (qc->ap->pflags & ATA_PFLAG_FROZEN) {
+		ata_dev_warn(dev, "sense data available but port frozen\n");
+		return;
+	}
+
+	if (!cmd)
+		return;
+
+	if (!ata_id_sense_reporting_enabled(dev->id)) {
+		ata_dev_warn(qc->dev, "sense data reporting disabled\n");
+		return;
+	}
+
+	DPRINTK("ATA request sense\n");
+
+	ata_tf_init(dev, &tf);
+	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
+	tf.flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48;
+	tf.command = ATA_CMD_REQ_SENSE_DATA;
+	tf.protocol = ATA_PROT_NODATA;
+
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	/* Ignore err_mask; ATA_ERR might be set */
+	if (tf.command & ATA_SENSE) {
+		ata_scsi_set_sense(cmd, tf.lbah, tf.lbam, tf.lbal);
+		qc->flags |= ATA_QCFLAG_SENSE_VALID;
+	} else {
+		ata_dev_warn(dev, "request sense failed stat %02x emask %x\n",
+			     tf.command, err_mask);
+	}
+}
+
 /**
  *	atapi_eh_request_sense - perform ATAPI REQUEST_SENSE
  *	@dev: device to perform REQUEST_SENSE to
@@ -1838,14 +1888,23 @@ static unsigned int ata_eh_analyze_tf(struct ata_queued_cmd *qc,
 		return ATA_EH_RESET;
 	}
 
-	if (stat & (ATA_ERR | ATA_DF))
+	if (stat & (ATA_ERR | ATA_DF)) {
 		qc->err_mask |= AC_ERR_DEV;
-	else
+		/*
+		 * Sense data reporting does not work if the
+		 * device fault bit is set.
+		 */
+		if (stat & ATA_DF)
+			stat &= ~ATA_SENSE;
+	} else {
 		return 0;
+	}
 
 	switch (qc->dev->class) {
 	case ATA_DEV_ATA:
 	case ATA_DEV_ZAC:
+		if (stat & ATA_SENSE)
+			ata_eh_request_sense(qc, qc->scsicmd);
 		if (err & ATA_ICRC)
 			qc->err_mask |= AC_ERR_ATA_BUS;
 		if (err & (ATA_UNC | ATA_AMNF))
@@ -2581,14 +2640,15 @@ static void ata_eh_link_report(struct ata_link *link)
 
 #ifdef CONFIG_ATA_VERBOSE_ERROR
 		if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ |
-				    ATA_ERR)) {
+				    ATA_SENSE | ATA_ERR)) {
 			if (res->command & ATA_BUSY)
 				ata_dev_err(qc->dev, "status: { Busy }\n");
 			else
-				ata_dev_err(qc->dev, "status: { %s%s%s%s}\n",
+				ata_dev_err(qc->dev, "status: { %s%s%s%s%s}\n",
 				  res->command & ATA_DRDY ? "DRDY " : "",
 				  res->command & ATA_DF ? "DF " : "",
 				  res->command & ATA_DRQ ? "DRQ " : "",
+				  res->command & ATA_SENSE ? "SENSE " : "",
 				  res->command & ATA_ERR ? "ERR " : "");
 		}
 
diff --git a/include/linux/ata.h b/include/linux/ata.h
index e797e1b53006..00aebc4c83ad 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -385,6 +385,8 @@ enum {
 	SATA_SSP		= 0x06,	/* Software Settings Preservation */
 	SATA_DEVSLP		= 0x09,	/* Device Sleep */
 
+	SETFEATURE_SENSE_DATA	= 0xC3, /* Sense Data Reporting feature */
+
 	/* feature values for SET_MAX */
 	ATA_SET_MAX_ADDR	= 0x00,
 	ATA_SET_MAX_PASSWD	= 0x01,
@@ -718,6 +720,20 @@ static inline bool ata_id_has_read_log_dma_ext(const u16 *id)
 	return false;
 }
 
+static inline bool ata_id_has_sense_reporting(const u16 *id)
+{
+	if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15)))
+		return false;
+	return id[ATA_ID_COMMAND_SET_3] & (1 << 6);
+}
+
+static inline bool ata_id_sense_reporting_enabled(const u16 *id)
+{
+	if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15)))
+		return false;
+	return id[ATA_ID_COMMAND_SET_4] & (1 << 6);
+}
+
 /**
  *	ata_id_major_version	-	get ATA level of drive
  *	@id: Identify data
-- 
cgit v1.2.3


From 06dbde5f3a44248fc02e24d662ac4849202abb48 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 4 Apr 2016 11:44:03 +0200
Subject: libata: Implement control mode page to select sense format

Implement MODE SELECT for the control mode page to allow the OS
to switch to descriptor sense.

tj: Dropped s/sb/cmd->sense_buffer/ in ata_gen_ata_sense().  Added
    @dev description to ata_msense_ctl_mode().

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-eh.c   |   4 +-
 drivers/ata/libata-scsi.c | 116 ++++++++++++++++++++++++++++++++++------------
 drivers/ata/libata.h      |   3 +-
 include/linux/libata.h    |   1 +
 4 files changed, 91 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index e37258b78e01..5b340ce4eeac 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1679,7 +1679,7 @@ static void ata_eh_request_sense(struct ata_queued_cmd *qc,
 	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
 	/* Ignore err_mask; ATA_ERR might be set */
 	if (tf.command & ATA_SENSE) {
-		ata_scsi_set_sense(cmd, tf.lbah, tf.lbam, tf.lbal);
+		ata_scsi_set_sense(dev, cmd, tf.lbah, tf.lbam, tf.lbal);
 		qc->flags |= ATA_QCFLAG_SENSE_VALID;
 	} else {
 		ata_dev_warn(dev, "request sense failed stat %02x emask %x\n",
@@ -1855,7 +1855,7 @@ void ata_eh_analyze_ncq_error(struct ata_link *link)
 		sense_key = (qc->result_tf.auxiliary >> 16) & 0xff;
 		asc = (qc->result_tf.auxiliary >> 8) & 0xff;
 		ascq = qc->result_tf.auxiliary & 0xff;
-		ata_scsi_set_sense(qc->scsicmd, sense_key, asc, ascq);
+		ata_scsi_set_sense(dev, qc->scsicmd, sense_key, asc, ascq);
 		ata_scsi_set_sense_information(dev, qc->scsicmd,
 					       &qc->result_tf);
 		qc->flags |= ATA_QCFLAG_SENSE_VALID;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 0da03c019f27..2389247bdf6f 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -270,14 +270,17 @@ DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR,
 	    ata_scsi_park_show, ata_scsi_park_store);
 EXPORT_SYMBOL_GPL(dev_attr_unload_heads);
 
-void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq)
+void ata_scsi_set_sense(struct ata_device *dev, struct scsi_cmnd *cmd,
+			u8 sk, u8 asc, u8 ascq)
 {
+	bool d_sense = (dev->flags & ATA_DFLAG_D_SENSE);
+
 	if (!cmd)
 		return;
 
 	cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
 
-	scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq);
+	scsi_build_sense_buffer(d_sense, cmd->sense_buffer, sk, asc, ascq);
 }
 
 void ata_scsi_set_sense_information(struct ata_device *dev,
@@ -384,9 +387,10 @@ struct device_attribute *ata_common_sdev_attrs[] = {
 };
 EXPORT_SYMBOL_GPL(ata_common_sdev_attrs);
 
-static void ata_scsi_invalid_field(struct scsi_cmnd *cmd)
+static void ata_scsi_invalid_field(struct ata_device *dev,
+				   struct scsi_cmnd *cmd)
 {
-	ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	/* "Invalid field in cbd" */
 	cmd->scsi_done(cmd);
 }
@@ -1014,7 +1018,7 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc)
 	    tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
 		ata_to_sense_error(qc->ap->print_id, tf->command, tf->feature,
 				   &sense_key, &asc, &ascq, verbose);
-		ata_scsi_set_sense(cmd, sense_key, asc, ascq);
+		ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq);
 	} else {
 		/*
 		 * ATA PASS-THROUGH INFORMATION AVAILABLE
@@ -1112,12 +1116,12 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
 	    tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
 		ata_to_sense_error(qc->ap->print_id, tf->command, tf->feature,
 				   &sense_key, &asc, &ascq, verbose);
-		ata_scsi_set_sense(cmd, sense_key, asc, ascq);
+		ata_scsi_set_sense(dev, cmd, sense_key, asc, ascq);
 	} else {
 		/* Could not decode error */
 		ata_dev_warn(dev, "could not decode error status 0x%x err_mask 0x%x\n",
 			     tf->command, qc->err_mask);
-		ata_scsi_set_sense(cmd, ABORTED_COMMAND, 0, 0);
+		ata_scsi_set_sense(dev, cmd, ABORTED_COMMAND, 0, 0);
 		return;
 	}
 
@@ -1440,7 +1444,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
 	return 0;
 
  invalid_fld:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	/* "Invalid field in cbd" */
 	return 1;
  skip:
@@ -1679,12 +1683,12 @@ static unsigned int ata_scsi_verify_xlat(struct ata_queued_cmd *qc)
 	return 0;
 
 invalid_fld:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	/* "Invalid field in cbd" */
 	return 1;
 
 out_of_range:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x21, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0);
 	/* "Logical Block Address out of range" */
 	return 1;
 
@@ -1781,12 +1785,12 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc)
 		goto out_of_range;
 	/* treat all other errors as -EINVAL, fall through */
 invalid_fld:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	/* "Invalid field in cbd" */
 	return 1;
 
 out_of_range:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x21, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0);
 	/* "Logical Block Address out of range" */
 	return 1;
 
@@ -2358,6 +2362,7 @@ static unsigned int ata_msense_caching(u16 *id, u8 *buf, bool changeable)
 
 /**
  *	ata_msense_ctl_mode - Simulate MODE SENSE control mode page
+ *	@dev: ATA device of interest
  *	@buf: output buffer
  *	@changeable: whether changeable parameters are requested
  *
@@ -2366,9 +2371,12 @@ static unsigned int ata_msense_caching(u16 *id, u8 *buf, bool changeable)
  *	LOCKING:
  *	None.
  */
-static unsigned int ata_msense_ctl_mode(u8 *buf, bool changeable)
+static unsigned int ata_msense_ctl_mode(struct ata_device *dev, u8 *buf,
+					bool changeable)
 {
 	modecpy(buf, def_control_mpage, sizeof(def_control_mpage), changeable);
+	if (changeable && (dev->flags & ATA_DFLAG_D_SENSE))
+		buf[2] |= (1 << 2);	/* Descriptor sense requested */
 	return sizeof(def_control_mpage);
 }
 
@@ -2482,13 +2490,13 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
 		break;
 
 	case CONTROL_MPAGE:
-		p += ata_msense_ctl_mode(p, page_control == 1);
+		p += ata_msense_ctl_mode(args->dev, p, page_control == 1);
 		break;
 
 	case ALL_MPAGES:
 		p += ata_msense_rw_recovery(p, page_control == 1);
 		p += ata_msense_caching(args->id, p, page_control == 1);
-		p += ata_msense_ctl_mode(p, page_control == 1);
+		p += ata_msense_ctl_mode(args->dev, p, page_control == 1);
 		break;
 
 	default:		/* invalid page code */
@@ -2521,12 +2529,12 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
 	return 0;
 
 invalid_fld:
-	ata_scsi_set_sense(args->cmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	/* "Invalid field in cbd" */
 	return 1;
 
 saving_not_supp:
-	ata_scsi_set_sense(args->cmd, ILLEGAL_REQUEST, 0x39, 0x0);
+	ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x39, 0x0);
 	 /* "Saving parameters not supported" */
 	return 1;
 }
@@ -3163,7 +3171,7 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
 	return 0;
 
  invalid_fld:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00);
+	ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x24, 0x00);
 	/* "Invalid field in cdb" */
 	return 1;
 }
@@ -3228,7 +3236,7 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
 	return 0;
 
  invalid_fld:
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00);
+	ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x24, 0x00);
 	/* "Invalid field in cdb" */
 	return 1;
 }
@@ -3279,6 +3287,51 @@ static int ata_mselect_caching(struct ata_queued_cmd *qc,
 	return 0;
 }
 
+/**
+ *	ata_mselect_control - Simulate MODE SELECT for control page
+ *	@qc: Storage for translated ATA taskfile
+ *	@buf: input buffer
+ *	@len: number of valid bytes in the input buffer
+ *
+ *	Prepare a taskfile to modify caching information for the device.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static int ata_mselect_control(struct ata_queued_cmd *qc,
+			       const u8 *buf, int len)
+{
+	struct ata_device *dev = qc->dev;
+	char mpage[CONTROL_MPAGE_LEN];
+	u8 d_sense;
+
+	/*
+	 * The first two bytes of def_control_mpage are a header, so offsets
+	 * in mpage are off by 2 compared to buf.  Same for len.
+	 */
+
+	if (len != CONTROL_MPAGE_LEN - 2)
+		return -EINVAL;
+
+	d_sense = buf[0] & (1 << 2);
+
+	/*
+	 * Check that read-only bits are not modified.
+	 */
+	ata_msense_ctl_mode(dev, mpage, false);
+	mpage[2] &= ~(1 << 2);
+	mpage[2] |= d_sense;
+	if (memcmp(mpage + 2, buf, CONTROL_MPAGE_LEN - 2) != 0)
+		return -EINVAL;
+	if (d_sense & (1 << 2))
+		dev->flags |= ATA_DFLAG_D_SENSE;
+	else
+		dev->flags &= ~ATA_DFLAG_D_SENSE;
+	qc->scsicmd->result = SAM_STAT_GOOD;
+	qc->scsicmd->scsi_done(qc->scsicmd);
+	return 0;
+}
+
 /**
  *	ata_scsiop_mode_select - Simulate MODE SELECT 6, 10 commands
  *	@qc: Storage for translated ATA taskfile
@@ -3381,7 +3434,10 @@ static unsigned int ata_scsi_mode_select_xlat(struct ata_queued_cmd *qc)
 		if (ata_mselect_caching(qc, p, pg_len) < 0)
 			goto invalid_param;
 		break;
-
+	case CONTROL_MPAGE:
+		if (ata_mselect_control(qc, p, pg_len) < 0)
+			goto invalid_param;
+		break;
 	default:		/* invalid page code */
 		goto invalid_param;
 	}
@@ -3397,17 +3453,17 @@ static unsigned int ata_scsi_mode_select_xlat(struct ata_queued_cmd *qc)
 
  invalid_fld:
 	/* "Invalid field in CDB" */
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x24, 0x0);
 	return 1;
 
  invalid_param:
 	/* "Invalid field in parameter list" */
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x26, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x26, 0x0);
 	return 1;
 
  invalid_param_len:
 	/* "Parameter list length error" */
-	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
 	return 1;
 
  skip:
@@ -3611,12 +3667,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 	switch(scsicmd[0]) {
 	/* TODO: worth improving? */
 	case FORMAT_UNIT:
-		ata_scsi_invalid_field(cmd);
+		ata_scsi_invalid_field(dev, cmd);
 		break;
 
 	case INQUIRY:
 		if (scsicmd[1] & 2)	           /* is CmdDt set?  */
-			ata_scsi_invalid_field(cmd);
+			ata_scsi_invalid_field(dev, cmd);
 		else if ((scsicmd[1] & 1) == 0)    /* is EVPD clear? */
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_std);
 		else switch (scsicmd[2]) {
@@ -3642,7 +3698,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
 			break;
 		default:
-			ata_scsi_invalid_field(cmd);
+			ata_scsi_invalid_field(dev, cmd);
 			break;
 		}
 		break;
@@ -3660,7 +3716,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 		if ((scsicmd[1] & 0x1f) == SAI_READ_CAPACITY_16)
 			ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap);
 		else
-			ata_scsi_invalid_field(cmd);
+			ata_scsi_invalid_field(dev, cmd);
 		break;
 
 	case REPORT_LUNS:
@@ -3668,7 +3724,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 		break;
 
 	case REQUEST_SENSE:
-		ata_scsi_set_sense(cmd, 0, 0, 0);
+		ata_scsi_set_sense(dev, cmd, 0, 0, 0);
 		cmd->result = (DRIVER_SENSE << 24);
 		cmd->scsi_done(cmd);
 		break;
@@ -3692,12 +3748,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 		if ((tmp8 == 0x4) && (!scsicmd[3]) && (!scsicmd[4]))
 			ata_scsi_rbuf_fill(&args, ata_scsiop_noop);
 		else
-			ata_scsi_invalid_field(cmd);
+			ata_scsi_invalid_field(dev, cmd);
 		break;
 
 	/* all other commands */
 	default:
-		ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x20, 0x0);
+		ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0);
 		/* "Invalid command operation code" */
 		cmd->scsi_done(cmd);
 		break;
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index dbc67604b3c5..3b301a48007c 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -138,7 +138,8 @@ extern int ata_scsi_add_hosts(struct ata_host *host,
 			      struct scsi_host_template *sht);
 extern void ata_scsi_scan_host(struct ata_port *ap, int sync);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
-extern void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq);
+extern void ata_scsi_set_sense(struct ata_device *dev,
+			       struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq);
 extern void ata_scsi_set_sense_information(struct ata_device *dev,
 					   struct scsi_cmnd *cmd,
 					   const struct ata_taskfile *tf);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2c4ebef79d0c..a418bca0df0d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -180,6 +180,7 @@ enum {
 	ATA_DFLAG_DA		= (1 << 26), /* device supports Device Attention */
 	ATA_DFLAG_DEVSLP	= (1 << 27), /* device supports Device Sleep */
 	ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */
+	ATA_DFLAG_D_SENSE	= (1 << 29), /* Descriptor sense requested */
 
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
-- 
cgit v1.2.3


From 4113652252fad972e0c191b1e536dc74a6faebdc Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 1 Apr 2016 21:38:16 +0200
Subject: reset: Add missing function stub for device_reset

The Mediatek's thermal driver fails to compile when the RESET_CONTROLLER
option is not set. Logically, as the driver depends on this option to compile,
the Kconfig should select it but actually that is not correct because the
Kconfig provides also the COMPILE_TEST to increase the compile test coverage.

By providing the missing 'device_reset' stub for the driver in reset.h, that
let the kernel to compile on different platforms with the Mediatek thermal
driver enabled with the COMPILE_TEST option.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index a552134a209e..ec0306ce7b92 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -56,6 +56,12 @@ static inline void reset_control_put(struct reset_control *rstc)
 	WARN_ON(1);
 }
 
+static inline int __must_check device_reset(struct device *dev)
+{
+	WARN_ON(1);
+	return -ENOTSUPP;
+}
+
 static inline int device_reset_optional(struct device *dev)
 {
 	return -ENOTSUPP;
-- 
cgit v1.2.3


From 974e0a4537f556867483f493c7f67ccdcb7fc504 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@hgst.com>
Date: Mon, 4 Apr 2016 12:17:09 -0400
Subject: libata-core: Allow longer timeout for drive spinup from PUIS

When spinning up a drive from powered on standby mode (PUIS),
SETFEATURES_SPINUP is executed with the default timeout used
for any SETFEATURES subcommand, that is 5+10 seconds. The
total 15s is too short for some drives to complete spinup
(e.g. drives with a large indirection table stored on media),
resulting in ata_dev_read_id to fail twice on the execution
of SETFEATURES_SPINUP. For this feature, allow a larger
default timeout of 30 seconds. However, in the same spirit
as with the timeout of other feature subcommands, do not
ignore ata_probe_timeout if it is set).

Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 6 +++++-
 include/linux/ata.h       | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 55e257c268dd..7b21021dbf7d 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4528,6 +4528,7 @@ unsigned int ata_dev_set_feature(struct ata_device *dev, u8 enable, u8 feature)
 {
 	struct ata_taskfile tf;
 	unsigned int err_mask;
+	unsigned long timeout = 0;
 
 	/* set up set-features taskfile */
 	DPRINTK("set features - SATA features\n");
@@ -4539,7 +4540,10 @@ unsigned int ata_dev_set_feature(struct ata_device *dev, u8 enable, u8 feature)
 	tf.protocol = ATA_PROT_NODATA;
 	tf.nsect = feature;
 
-	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	if (enable == SETFEATURES_SPINUP)
+		timeout = ata_probe_timeout ?
+			  ata_probe_timeout * 1000 : SETFEATURES_SPINUP_TIMEOUT;
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, timeout);
 
 	DPRINTK("EXIT, err_mask=%x\n", err_mask);
 	return err_mask;
diff --git a/include/linux/ata.h b/include/linux/ata.h
index c1a2f345cbe6..f310ec0f072e 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -371,7 +371,8 @@ enum {
 	SETFEATURES_AAM_ON	= 0x42,
 	SETFEATURES_AAM_OFF	= 0xC2,
 
-	SETFEATURES_SPINUP	= 0x07, /* Spin-up drive */
+	SETFEATURES_SPINUP		= 0x07, /* Spin-up drive */
+	SETFEATURES_SPINUP_TIMEOUT	= 30000, /* 30s timeout for drive spin-up from PUIS */
 
 	SETFEATURES_SATA_ENABLE = 0x10, /* Enable use of SATA feature */
 	SETFEATURES_SATA_DISABLE = 0x90, /* Disable use of SATA feature */
-- 
cgit v1.2.3


From 77ed2c5745d93416992857d124f35834b62b3e70 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 8 Mar 2016 20:01:32 +0900
Subject: android,lowmemorykiller: Don't abuse TIF_MEMDIE.

Currently, lowmemorykiller (LMK) is using TIF_MEMDIE for two purposes.
One is to remember processes killed by LMK, and the other is to
accelerate termination of processes killed by LMK.

But since LMK is invoked as a memory shrinker function, there still
should be some memory available. It is very likely that memory
allocations by processes killed by LMK will succeed without using
ALLOC_NO_WATERMARKS via TIF_MEMDIE. Even if their allocations cannot
escape from memory allocation loop unless they use ALLOC_NO_WATERMARKS,
lowmem_deathpending_timeout can guarantee forward progress by choosing
next victim process.

On the other hand, mark_oom_victim() assumes that it must be called with
oom_lock held and it must not be called after oom_killer_disable() was
called. But LMK is calling it without holding oom_lock and checking
oom_killer_disabled. It is possible that LMK calls mark_oom_victim()
due to allocation requests by kernel threads after current thread
returned from oom_killer_disabled(). This will break synchronization
for PM/suspend.

This patch introduces per a task_struct flag for remembering processes
killed by LMK, and replaces TIF_MEMDIE with that flag. By applying this
patch, assumption by mark_oom_victim() becomes true.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Arve Hjonnevag <arve@android.com>
Cc: Riley Andrews <riandrews@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/lowmemorykiller.c | 9 ++-------
 include/linux/sched.h                     | 4 ++++
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 2509e5df7244..c79f22425fa8 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -131,7 +131,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 		if (!p)
 			continue;
 
-		if (test_tsk_thread_flag(p, TIF_MEMDIE) &&
+		if (task_lmk_waiting(p) && p->mm &&
 		    time_before_eq(jiffies, lowmem_deathpending_timeout)) {
 			task_unlock(p);
 			rcu_read_unlock();
@@ -162,13 +162,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 	if (selected) {
 		task_lock(selected);
 		send_sig(SIGKILL, selected, 0);
-		/*
-		 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
-		 * infrastructure. There is no real reason why the selected
-		 * task should have access to the memory reserves.
-		 */
 		if (selected->mm)
-			mark_oom_victim(selected);
+			task_set_lmk_waiting(selected);
 		task_unlock(selected);
 		lowmem_print(1, "Killing '%s' (%d), adj %hd,\n"
 				 "   to free %ldkB on behalf of '%s' (%d) because\n"
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 60bba7e032dc..9dff190e6a0a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2184,6 +2184,7 @@ static inline void memalloc_noio_restore(unsigned int flags)
 #define PFA_NO_NEW_PRIVS 0	/* May not gain new privileges. */
 #define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
 #define PFA_SPREAD_SLAB  2      /* Spread some slab caches over cpuset */
+#define PFA_LMK_WAITING  3      /* Lowmemorykiller is waiting */
 
 
 #define TASK_PFA_TEST(name, func)					\
@@ -2207,6 +2208,9 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
 TASK_PFA_SET(SPREAD_SLAB, spread_slab)
 TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 
+TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
+TASK_PFA_SET(LMK_WAITING, lmk_waiting)
+
 /*
  * task->jobctl flags
  */
-- 
cgit v1.2.3


From ca065d0cf80fa547724440a8bf37f1e674d917c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Apr 2016 08:52:13 -0700
Subject: udp: no longer use SLAB_DESTROY_BY_RCU

Tom Herbert would like not touching UDP socket refcnt for encapsulated
traffic. For this to happen, we need to use normal RCU rules, with a grace
period before freeing a socket. UDP sockets are not short lived in the
high usage case, so the added cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in UDP stack.

Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the socket.

Same remark for functions used by xt_socket and xt_PROXY netfilter modules,
but this might be changed later.

Performance for a single UDP socket receiving flood traffic from
many RX queues/cpus.

Simple udp_rx using simple recvfrom() loop :
438 kpps instead of 374 kpps : 17 % increase of the peak rate.

v2: Addressed Willem de Bruijn feedback in multicast handling
 - keep early demux break in __udp4_lib_demux_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |   8 +-
 include/net/sock.h  |  12 +--
 include/net/udp.h   |   2 +-
 net/ipv4/udp.c      | 293 ++++++++++++++++------------------------------------
 net/ipv4/udp_diag.c |  18 ++--
 net/ipv6/udp.c      | 196 ++++++++++++-----------------------
 6 files changed, 171 insertions(+), 358 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 87c094961bd5..32342754643a 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -98,11 +98,11 @@ static inline bool udp_get_no_check6_rx(struct sock *sk)
 	return udp_sk(sk)->no_check6_rx;
 }
 
-#define udp_portaddr_for_each_entry(__sk, node, list) \
-	hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry(__sk, list) \
+	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
-#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
-	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry_rcu(__sk, list) \
+	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
 #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag)
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9e77353a92ae..7ad73db9dde2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -178,7 +178,7 @@ struct sock_common {
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
-		struct hlist_nulls_node skc_portaddr_node;
+		struct hlist_node	skc_portaddr_node;
 	};
 	struct proto		*skc_prot;
 	possible_net_t		skc_net;
@@ -670,18 +670,18 @@ static inline void sk_add_bind_node(struct sock *sk,
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
 /**
- * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @offset:	offset of hlist_node within the struct.
  *
  */
-#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
-	for (pos = (head)->first;					       \
-	     (!is_a_nulls(pos)) &&					       \
+#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)		       \
+	for (pos = rcu_dereference((head)->first);			       \
+	     pos != NULL &&						       \
 		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
-	     pos = pos->next)
+	     pos = rcu_dereference(pos->next))
 
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
diff --git a/include/net/udp.h b/include/net/udp.h
index 92927f729ac8..d870ec1611c4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -59,7 +59,7 @@ struct udp_skb_cb {
  *	@lock:	spinlock protecting changes to head/count
  */
 struct udp_hslot {
-	struct hlist_nulls_head	head;
+	struct hlist_head	head;
 	int			count;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 45ff590661f4..355bdb221057 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       unsigned int log)
 {
 	struct sock *sk2;
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 
-	sk_nulls_for_each(sk2, node, &hslot->head) {
+	sk_for_each(sk2, &hslot->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 						  bool match_wildcard))
 {
 	struct sock *sk2;
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 	int res = 0;
 
 	spin_lock(&hslot2->lock);
-	udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    (udp_sk(sk2)->udp_port_hash == num) &&
@@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
 						    bool match_wildcard))
 {
 	struct net *net = sock_net(sk);
-	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
 	struct sock *sk2;
 
-	sk_nulls_for_each(sk2, node, &hslot->head) {
+	sk_for_each(sk2, &hslot->head) {
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
 		    sk2->sk_family == sk->sk_family &&
@@ -333,17 +330,18 @@ found:
 			goto fail_unlock;
 		}
 
-		sk_nulls_add_node_rcu(sk, &hslot->head);
+		sk_add_node_rcu(sk, &hslot->head);
 		hslot->count++;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 		spin_lock(&hslot2->lock);
-		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+		hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
 					 &hslot2->head);
 		hslot2->count++;
 		spin_unlock(&hslot2->lock);
 	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	error = 0;
 fail_unlock:
 	spin_unlock_bh(&hslot->lock);
@@ -497,37 +495,27 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-begin:
 	result = NULL;
 	badness = 0;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score2(sk, net, saddr, sport,
 				      daddr, hnum, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			badness = score;
+			result = sk;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -535,23 +523,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot2)
-		goto begin;
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score2(result, net, saddr, sport,
-				  daddr, hnum, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
 	return result;
 }
 
@@ -563,15 +534,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable, struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
@@ -593,35 +561,27 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 						  htonl(INADDR_ANY), hnum, dif,
 						  hslot2, slot2, skb);
 		}
-		rcu_read_unlock();
 		return result;
 	}
 begin:
 	result = NULL;
 	badness = 0;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -629,25 +589,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
-				  daddr, dport, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
-	rcu_read_unlock();
 	return result;
 }
 EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -663,13 +604,24 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
-	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
-				 &udp_table, NULL);
+	struct sock *sk;
+
+	sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+			       dif, &udp_table, NULL);
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	return sk;
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
 
 static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 				       __be16 loc_port, __be32 loc_addr,
@@ -771,7 +723,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	sock_put(sk);
+	return;
 }
 
 void udp_err(struct sk_buff *skb, u32 info)
@@ -1474,13 +1426,13 @@ void udp_lib_unhash(struct sock *sk)
 		spin_lock_bh(&hslot->lock);
 		if (rcu_access_pointer(sk->sk_reuseport_cb))
 			reuseport_detach_sock(sk);
-		if (sk_nulls_del_node_init_rcu(sk)) {
+		if (sk_del_node_init_rcu(sk)) {
 			hslot->count--;
 			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
 			spin_lock(&hslot2->lock);
-			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
 			hslot2->count--;
 			spin_unlock(&hslot2->lock);
 		}
@@ -1513,12 +1465,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
 
 			if (hslot2 != nhslot2) {
 				spin_lock(&hslot2->lock);
-				hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+				hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
 				hslot2->count--;
 				spin_unlock(&hslot2->lock);
 
 				spin_lock(&nhslot2->lock);
-				hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+				hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
 							 &nhslot2->head);
 				nhslot2->count++;
 				spin_unlock(&nhslot2->lock);
@@ -1697,35 +1649,6 @@ drop:
 	return -1;
 }
 
-static void flush_stack(struct sock **stack, unsigned int count,
-			struct sk_buff *skb, unsigned int final)
-{
-	unsigned int i;
-	struct sk_buff *skb1 = NULL;
-	struct sock *sk;
-
-	for (i = 0; i < count; i++) {
-		sk = stack[i];
-		if (likely(!skb1))
-			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-
-		if (!skb1) {
-			atomic_inc(&sk->sk_drops);
-			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
-					 IS_UDPLITE(sk));
-			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
-					 IS_UDPLITE(sk));
-		}
-
-		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
-			skb1 = NULL;
-
-		sock_put(sk);
-	}
-	if (unlikely(skb1))
-		kfree_skb(skb1);
-}
-
 /* For TCP sockets, sk_rx_dst is protected by socket lock
  * For UDP, we use xchg() to guard against concurrent changes.
  */
@@ -1749,14 +1672,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable,
 				    int proto)
 {
-	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct hlist_nulls_node *node;
+	struct sock *sk, *first = NULL;
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
-	int dif = skb->dev->ifindex;
-	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
 	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
-	bool inner_flushed = false;
+	unsigned int offset = offsetof(typeof(*sk), sk_node);
+	int dif = skb->dev->ifindex;
+	struct hlist_node *node;
+	struct sk_buff *nskb;
 
 	if (use_hash2) {
 		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1767,23 +1690,28 @@ start_lookup:
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-	spin_lock(&hslot->lock);
-	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
-		if (__udp_is_mcast_sock(net, sk,
-					uh->dest, daddr,
-					uh->source, saddr,
-					dif, hnum)) {
-			if (unlikely(count == ARRAY_SIZE(stack))) {
-				flush_stack(stack, count, skb, ~0);
-				inner_flushed = true;
-				count = 0;
-			}
-			stack[count++] = sk;
-			sock_hold(sk);
+	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+					 uh->source, saddr, dif, hnum))
+			continue;
+
+		if (!first) {
+			first = sk;
+			continue;
 		}
-	}
+		nskb = skb_clone(skb, GFP_ATOMIC);
 
-	spin_unlock(&hslot->lock);
+		if (unlikely(!nskb)) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(sk));
+			UDP_INC_STATS_BH(net, UDP_MIB_INERRORS,
+					 IS_UDPLITE(sk));
+			continue;
+		}
+		if (udp_queue_rcv_skb(sk, nskb) > 0)
+			consume_skb(nskb);
+	}
 
 	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
 	if (use_hash2 && hash2 != hash2_any) {
@@ -1791,16 +1719,13 @@ start_lookup:
 		goto start_lookup;
 	}
 
-	/*
-	 * do the slow work with no lock held
-	 */
-	if (count) {
-		flush_stack(stack, count, skb, count - 1);
+	if (first) {
+		if (udp_queue_rcv_skb(first, skb) > 0)
+			consume_skb(skb);
 	} else {
-		if (!inner_flushed)
-			UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
-					 proto == IPPROTO_UDPLITE);
-		consume_skb(skb);
+		kfree_skb(skb);
+		UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+				 proto == IPPROTO_UDPLITE);
 	}
 	return 0;
 }
@@ -1897,7 +1822,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 						 inet_compute_pseudo);
 
 		ret = udp_queue_rcv_skb(sk, skb);
-		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input, but
 		 * it wants the return to be -protocol, or 0
@@ -1958,49 +1882,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
 						  int dif)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+	unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
 	struct udp_hslot *hslot = &udp_table.hash[slot];
 
 	/* Do not bother scanning a too big list */
 	if (hslot->count > 10)
 		return NULL;
 
-	rcu_read_lock();
-begin:
-	count = 0;
 	result = NULL;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum)) {
+	sk_for_each_rcu(sk, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+					rmt_port, rmt_addr, dif, hnum)) {
+			if (result)
+				return NULL;
 			result = sk;
-			++count;
-		}
-	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-		if (count != 1 ||
-		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(!__udp_is_mcast_sock(net, result,
-						       loc_port, loc_addr,
-						       rmt_port, rmt_addr,
-						       dif, hnum))) {
-			sock_put(result);
-			result = NULL;
 		}
 	}
-	rcu_read_unlock();
+
 	return result;
 }
 
@@ -2013,37 +1912,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 					    __be16 rmt_port, __be32 rmt_addr,
 					    int dif)
 {
-	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(loc_port);
 	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
 	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+	struct sock *sk;
 
-	rcu_read_lock();
-	result = NULL;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
-		if (INET_MATCH(sk, net, acookie,
-			       rmt_addr, loc_addr, ports, dif))
-			result = sk;
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+		if (INET_MATCH(sk, net, acookie, rmt_addr,
+			       loc_addr, ports, dif))
+			return sk;
 		/* Only check first socket in chain */
 		break;
 	}
-
-	if (result) {
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(!INET_MATCH(sk, net, acookie,
-					      rmt_addr, loc_addr,
-					      ports, dif))) {
-			sock_put(result);
-			result = NULL;
-		}
-	}
-	rcu_read_unlock();
-	return result;
+	return NULL;
 }
 
 void udp_v4_early_demux(struct sk_buff *skb)
@@ -2051,7 +1935,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct udphdr *uh;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct dst_entry *dst;
 	int dif = skb->dev->ifindex;
 	int ours;
@@ -2083,11 +1967,9 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	} else if (skb->pkt_type == PACKET_HOST) {
 		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
 					     uh->source, iph->saddr, dif);
-	} else {
-		return;
 	}
 
-	if (!sk)
+	if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
 		return;
 
 	skb->sk = sk;
@@ -2387,14 +2269,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 
 	for (state->bucket = start; state->bucket <= state->udp_table->mask;
 	     ++state->bucket) {
-		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 
-		if (hlist_nulls_empty(&hslot->head))
+		if (hlist_empty(&hslot->head))
 			continue;
 
 		spin_lock_bh(&hslot->lock);
-		sk_nulls_for_each(sk, node, &hslot->head) {
+		sk_for_each(sk, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -2413,7 +2294,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_nulls_next(sk);
+		sk = sk_next(sk);
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
@@ -2622,12 +2503,12 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 
 	table->hash2 = table->hash + (table->mask + 1);
 	for (i = 0; i <= table->mask; i++) {
-		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+		INIT_HLIST_HEAD(&table->hash[i].head);
 		table->hash[i].count = 0;
 		spin_lock_init(&table->hash[i].lock);
 	}
 	for (i = 0; i <= table->mask; i++) {
-		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		INIT_HLIST_HEAD(&table->hash2[i].head);
 		table->hash2[i].count = 0;
 		spin_lock_init(&table->hash2[i].lock);
 	}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index df1966f3b6ec..3d5ccf4b1412 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 			const struct inet_diag_req_v2 *req)
 {
 	int err = -EINVAL;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct sk_buff *rep;
 	struct net *net = sock_net(in_skb->sk);
 
+	rcu_read_lock();
 	if (req->sdiag_family == AF_INET)
 		sk = __udp4_lib_lookup(net,
 				req->id.idiag_src[0], req->id.idiag_sport,
@@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 				req->id.idiag_dport,
 				req->id.idiag_if, tbl, NULL);
 #endif
-	else
-		goto out_nosk;
-
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	rcu_read_unlock();
 	err = -ENOENT;
 	if (!sk)
 		goto out_nosk;
@@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 		     struct netlink_callback *cb,
 		     const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
-	int num, s_num, slot, s_slot;
 	struct net *net = sock_net(skb->sk);
+	int num, s_num, slot, s_slot;
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
 	for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
-		struct sock *sk;
-		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &table->hash[slot];
+		struct sock *sk;
 
 		num = 0;
 
-		if (hlist_nulls_empty(&hslot->head))
+		if (hlist_empty(&hslot->head))
 			continue;
 
 		spin_lock_bh(&hslot->lock);
-		sk_nulls_for_each(sk, node, &hslot->head) {
+		sk_for_each(sk, &hslot->head) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b772a7641fbd..78a7dfd12707 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -213,37 +213,28 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-begin:
 	result = NULL;
 	badness = -1;
-	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score2(sk, net, saddr, sport,
 				      daddr, hnum, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
 
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -251,27 +242,10 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot2)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score2(result, net, saddr, sport,
-				  daddr, hnum, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
 	return result;
 }
 
+/* rcu_read_lock() must be held */
 struct sock *__udp6_lib_lookup(struct net *net,
 				      const struct in6_addr *saddr, __be16 sport,
 				      const struct in6_addr *daddr, __be16 dport,
@@ -279,15 +253,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
 				      struct sk_buff *skb)
 {
 	struct sock *sk, *result;
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness, matches = 0, reuseport = 0;
-	bool select_ok = true;
 	u32 hash = 0;
 
-	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp6_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
@@ -309,34 +280,26 @@ struct sock *__udp6_lib_lookup(struct net *net,
 						  &in6addr_any, hnum, dif,
 						  hslot2, slot2, skb);
 		}
-		rcu_read_unlock();
 		return result;
 	}
 begin:
 	result = NULL;
 	badness = -1;
-	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
-			result = sk;
-			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
-				if (select_ok) {
-					struct sock *sk2;
-
-					sk2 = reuseport_select_sock(sk, hash, skb,
+				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-					if (sk2) {
-						result = sk2;
-						select_ok = false;
-						goto found;
-					}
-				}
+				if (result)
+					return result;
 				matches = 1;
 			}
+			result = sk;
+			badness = score;
 		} else if (score == badness && reuseport) {
 			matches++;
 			if (reciprocal_scale(hash, matches) == 0)
@@ -344,25 +307,6 @@ begin:
 			hash = next_pseudo_random32(hash);
 		}
 	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begin;
-
-	if (result) {
-found:
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
-					daddr, dport, dif) < badness)) {
-			sock_put(result);
-			goto begin;
-		}
-	}
-	rcu_read_unlock();
 	return result;
 }
 EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -382,12 +326,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
 struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
 			     const struct in6_addr *daddr, __be16 dport, int dif)
 {
-	return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
+	struct sock *sk;
+
+	sk =  __udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				dif, &udp_table, NULL);
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+	return sk;
 }
 EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+#endif
 
 /*
  *	This should be easy, if there is something there we
@@ -585,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	sock_put(sk);
+	return;
 }
 
 static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -747,33 +703,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static void flush_stack(struct sock **stack, unsigned int count,
-			struct sk_buff *skb, unsigned int final)
-{
-	struct sk_buff *skb1 = NULL;
-	struct sock *sk;
-	unsigned int i;
-
-	for (i = 0; i < count; i++) {
-		sk = stack[i];
-		if (likely(!skb1))
-			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-		if (!skb1) {
-			atomic_inc(&sk->sk_drops);
-			UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
-					  IS_UDPLITE(sk));
-			UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
-					  IS_UDPLITE(sk));
-		}
-
-		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
-			skb1 = NULL;
-		sock_put(sk);
-	}
-	if (unlikely(skb1))
-		kfree_skb(skb1);
-}
-
 static void udp6_csum_zero_error(struct sk_buff *skb)
 {
 	/* RFC 2460 section 8.1 says that we SHOULD log
@@ -792,15 +721,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		const struct in6_addr *saddr, const struct in6_addr *daddr,
 		struct udp_table *udptable, int proto)
 {
-	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	struct sock *sk, *first = NULL;
 	const struct udphdr *uh = udp_hdr(skb);
-	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
-	int dif = inet6_iif(skb);
-	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int offset = offsetof(typeof(*sk), sk_node);
 	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
-	bool inner_flushed = false;
+	int dif = inet6_iif(skb);
+	struct hlist_node *node;
+	struct sk_buff *nskb;
 
 	if (use_hash2) {
 		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
@@ -811,27 +740,32 @@ start_lookup:
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-	spin_lock(&hslot->lock);
-	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
-		if (__udp_v6_is_mcast_sock(net, sk,
-					   uh->dest, daddr,
-					   uh->source, saddr,
-					   dif, hnum) &&
-		    /* If zero checksum and no_check is not on for
-		     * the socket then skip it.
-		     */
-		    (uh->check || udp_sk(sk)->no_check6_rx)) {
-			if (unlikely(count == ARRAY_SIZE(stack))) {
-				flush_stack(stack, count, skb, ~0);
-				inner_flushed = true;
-				count = 0;
-			}
-			stack[count++] = sk;
-			sock_hold(sk);
+	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+		if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
+					    uh->source, saddr, dif, hnum))
+			continue;
+		/* If zero checksum and no_check is not on for
+		 * the socket then skip it.
+		 */
+		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+			continue;
+		if (!first) {
+			first = sk;
+			continue;
+		}
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (unlikely(!nskb)) {
+			atomic_inc(&sk->sk_drops);
+			UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
+					  IS_UDPLITE(sk));
+			UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS,
+					  IS_UDPLITE(sk));
+			continue;
 		}
-	}
 
-	spin_unlock(&hslot->lock);
+		if (udpv6_queue_rcv_skb(sk, nskb) > 0)
+			consume_skb(nskb);
+	}
 
 	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
 	if (use_hash2 && hash2 != hash2_any) {
@@ -839,13 +773,13 @@ start_lookup:
 		goto start_lookup;
 	}
 
-	if (count) {
-		flush_stack(stack, count, skb, count - 1);
+	if (first) {
+		if (udpv6_queue_rcv_skb(first, skb) > 0)
+			consume_skb(skb);
 	} else {
-		if (!inner_flushed)
-			UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
-					  proto == IPPROTO_UDPLITE);
-		consume_skb(skb);
+		kfree_skb(skb);
+		UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+				  proto == IPPROTO_UDPLITE);
 	}
 	return 0;
 }
@@ -853,10 +787,10 @@ start_lookup:
 int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
+	const struct in6_addr *saddr, *daddr;
 	struct net *net = dev_net(skb->dev);
-	struct sock *sk;
 	struct udphdr *uh;
-	const struct in6_addr *saddr, *daddr;
+	struct sock *sk;
 	u32 ulen = 0;
 
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -910,7 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		int ret;
 
 		if (!uh->check && !udp_sk(sk)->no_check6_rx) {
-			sock_put(sk);
 			udp6_csum_zero_error(skb);
 			goto csum_error;
 		}
@@ -920,7 +853,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 						 ip6_compute_pseudo);
 
 		ret = udpv6_queue_rcv_skb(sk, skb);
-		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input */
 		if (ret > 0)
-- 
cgit v1.2.3


From 8f6fd83c6c5ec66a4a70c728535ddcdfef4f3697 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Wed, 2 Mar 2016 10:09:19 -0500
Subject: rhashtable: accept GFP flags in rhashtable_walk_init

In certain cases, the 802.11 mesh pathtable code wants to
iterate over all of the entries in the forwarding table from
the receive path, which is inside an RCU read-side critical
section.  Enable walks inside atomic sections by allowing
GFP_ATOMIC allocations for the walker state.

Change all existing callsites to pass in GFP_KERNEL.

Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
[also adjust gfs2/glock.c and rhashtable tests]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 fs/gfs2/glock.c            | 4 ++--
 include/linux/rhashtable.h | 3 ++-
 lib/rhashtable.c           | 6 ++++--
 lib/test_rhashtable.c      | 2 +-
 net/ipv6/ila/ila_xlat.c    | 3 ++-
 net/netfilter/nft_hash.c   | 4 ++--
 net/netlink/af_netlink.c   | 3 ++-
 net/sctp/proc.c            | 3 ++-
 8 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6539131c52a2..4b73bd101bdc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1913,7 +1913,7 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
 	}
 	return ret;
 }
@@ -1941,7 +1941,7 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
 	}
 	return ret;
 }
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 63bd7601b6de..3eef0802a0cd 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -346,7 +346,8 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
 					    struct bucket_table *old_tbl);
 int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
 
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+			 gfp_t gfp);
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
 int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU);
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cc808707d1cf..5d845ffd7982 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -487,6 +487,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * rhashtable_walk_init - Initialise an iterator
  * @ht:		Table to walk over
  * @iter:	Hash table Iterator
+ * @gfp:	GFP flags for allocations
  *
  * This function prepares a hash table walk.
  *
@@ -504,14 +505,15 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * You must call rhashtable_walk_exit if this function returns
  * successfully.
  */
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+			 gfp_t gfp)
 {
 	iter->ht = ht;
 	iter->p = NULL;
 	iter->slot = 0;
 	iter->skip = 0;
 
-	iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
+	iter->walker = kmalloc(sizeof(*iter->walker), gfp);
 	if (!iter->walker)
 		return -ENOMEM;
 
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 270bf7289b1e..297fdb5e74bd 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -143,7 +143,7 @@ static void test_bucket_stats(struct rhashtable *ht)
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
 
-	err = rhashtable_walk_init(ht, &hti);
+	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
 	if (err) {
 		pr_warn("Test failed: allocation error");
 		return;
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 295ca29a23c3..0b03533453e4 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -501,7 +501,8 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
 	struct ila_net *ilan = net_generic(net, ila_net_id);
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
 
-	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter);
+	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+				    GFP_KERNEL);
 }
 
 static int ila_nl_dump_done(struct netlink_callback *cb)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3f9d45d3d9b7..6fa016564f90 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -192,7 +192,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
 	u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
 	int err;
 
-	err = rhashtable_walk_init(&priv->ht, &hti);
+	err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
 	iter->err = err;
 	if (err)
 		return;
@@ -248,7 +248,7 @@ static void nft_hash_gc(struct work_struct *work)
 	priv = container_of(work, struct nft_hash, gc_work.work);
 	set  = nft_set_container_of(priv);
 
-	err = rhashtable_walk_init(&priv->ht, &hti);
+	err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
 	if (err)
 		goto schedule;
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 215fc08c02ab..0f16bf635480 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2343,7 +2343,8 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
 {
 	int err;
 
-	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
+	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
+				   GFP_KERNEL);
 	if (err) {
 		iter->link = MAX_LINKS;
 		return err;
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5cfac8d5d3b3..6d45d53321e6 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -319,7 +319,8 @@ static int sctp_transport_walk_start(struct seq_file *seq)
 	struct sctp_ht_iter *iter = seq->private;
 	int err;
 
-	err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
+	err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti,
+				   GFP_KERNEL);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 4da56b99d99e5a7df2b7f11e87bfea935f909732 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 4 Apr 2016 14:46:42 +0100
Subject: mm/vmap: Add a notifier for when we run out of vmap address space

vmaps are temporary kernel mappings that may be of long duration.
Reusing a vmap on an object is preferrable for a driver as the cost of
setting up the vmap can otherwise dominate the operation on the object.
However, the vmap address space is rather limited on 32bit systems and
so we add a notification for vmap pressure in order for the driver to
release any cached vmappings.

The interface is styled after the oom-notifier where the callees are
passed a pointer to an unsigned long counter for them to indicate if they
have freed any space.

v2: Guard the blocking notifier call with gfpflags_allow_blocking()
v3: Correct typo in forward declaration and move to head of file

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Peniaev <r.peniaev@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Andrew Morton <akpm@linux-foundation.org> # for inclusion via DRM
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1459777603-23618-3-git-send-email-chris@chris-wilson.co.uk
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 include/linux/vmalloc.h |  4 ++++
 mm/vmalloc.c            | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index d1f1d338af20..8b51df3ab334 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -8,6 +8,7 @@
 #include <linux/rbtree.h>
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
+struct notifier_block;		/* in notifier.h */
 
 /* bits in flags of vmalloc's vm_struct below */
 #define VM_IOREMAP		0x00000001	/* ioremap() and friends */
@@ -187,4 +188,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 #define VMALLOC_TOTAL 0UL
 #endif
 
+int register_vmap_purge_notifier(struct notifier_block *nb);
+int unregister_vmap_purge_notifier(struct notifier_block *nb);
+
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bffe47..12d27ac303ae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -21,6 +21,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/list.h>
+#include <linux/notifier.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
@@ -344,6 +345,8 @@ static void __insert_vmap_area(struct vmap_area *va)
 
 static void purge_vmap_area_lazy(void);
 
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
@@ -363,6 +366,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	BUG_ON(offset_in_page(size));
 	BUG_ON(!is_power_of_2(align));
 
+	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
 	va = kmalloc_node(sizeof(struct vmap_area),
 			gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!va))
@@ -468,6 +473,16 @@ overflow:
 		purged = 1;
 		goto retry;
 	}
+
+	if (gfpflags_allow_blocking(gfp_mask)) {
+		unsigned long freed = 0;
+		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+		if (freed > 0) {
+			purged = 0;
+			goto retry;
+		}
+	}
+
 	if (printk_ratelimit())
 		pr_warn("vmap allocation for size %lu failed: "
 			"use vmalloc=<size> to increase size.\n", size);
@@ -475,6 +490,18 @@ overflow:
 	return ERR_PTR(-EBUSY);
 }
 
+int register_vmap_purge_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&vmap_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
+
+int unregister_vmap_purge_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
+
 static void __free_vmap_area(struct vmap_area *va)
 {
 	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
-- 
cgit v1.2.3


From 3c5bcb2e1930bdbccd14a49660895f014349b51d Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Thu, 17 Mar 2016 15:02:53 +0200
Subject: ieee80211: support parsing Fine Timing Measurement action frame

Add definition for Fine Timing Measurement (FTM) frame format
as defined in IEEE802.11-REVmcD5.0 section 9.6.8.33

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3b1f6cef9513..bf9706c5b0bd 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -7,6 +7,7 @@
  * Copyright (c) 2005, Devicescape Software, Inc.
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1011,6 +1012,16 @@ struct ieee80211_mgmt {
 					u8 tpc_elem_length;
 					struct ieee80211_tpc_report_ie tpc;
 				} __packed tpc_report;
+				struct {
+					u8 action_code;
+					u8 dialog_token;
+					u8 follow_up;
+					u8 tod[6];
+					u8 toa[6];
+					__le16 tod_error;
+					__le16 toa_error;
+					u8 variable[0];
+				} __packed ftm;
 			} u;
 		} __packed action;
 	} u;
-- 
cgit v1.2.3


From c663e5f56737757db4d0b317c510ab505f93cecb Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 22 Mar 2016 10:51:16 +0100
Subject: gpio: support native single-ended hardware drivers

Some GPIO controllers has a special hardware bit we can flip
to support open drain / source. This means that on these hardwares
we do not need to emulate OD/OS by setting the line to input
instead of actively driving it high/low. Add an optional vtable
callback to the driver set_single_ended() so that driver can
implement this in hardware if they have it.

We may need a pinctrl_gpio_set_config() call at some point to
propagate this down to a backing pin control device on systems
with split GPIO/pin control.

Reported-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 52 ++++++++++++++++++++++++++++++++-------------
 include/linux/gpio/driver.h | 25 +++++++++++++++++++++-
 2 files changed, 61 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 72065532c1c7..1edc830a1b51 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1509,8 +1509,8 @@ EXPORT_SYMBOL_GPL(gpiod_direction_input);
 
 static int _gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 {
-	struct gpio_chip	*chip;
-	int			status = -EINVAL;
+	struct gpio_chip *gc = desc->gdev->chip;
+	int ret;
 
 	/* GPIOs used for IRQs shall not be set as output */
 	if (test_bit(FLAG_USED_AS_IRQ, &desc->flags)) {
@@ -1520,28 +1520,50 @@ static int _gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 		return -EIO;
 	}
 
-	/* Open drain pin should not be driven to 1 */
-	if (value && test_bit(FLAG_OPEN_DRAIN,  &desc->flags))
-		return gpiod_direction_input(desc);
-
-	/* Open source pin should not be driven to 0 */
-	if (!value && test_bit(FLAG_OPEN_SOURCE,  &desc->flags))
-		return gpiod_direction_input(desc);
+	if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) {
+		/* First see if we can enable open drain in hardware */
+		if (gc->set_single_ended) {
+			ret = gc->set_single_ended(gc, gpio_chip_hwgpio(desc),
+						   LINE_MODE_OPEN_DRAIN);
+			if (!ret)
+				goto set_output_value;
+		}
+		/* Emulate open drain by not actively driving the line high */
+		if (value)
+			return gpiod_direction_input(desc);
+	}
+	else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) {
+		if (gc->set_single_ended) {
+			ret = gc->set_single_ended(gc, gpio_chip_hwgpio(desc),
+						   LINE_MODE_OPEN_SOURCE);
+			if (!ret)
+				goto set_output_value;
+		}
+		/* Emulate open source by not actively driving the line low */
+		if (!value)
+			return gpiod_direction_input(desc);
+	} else {
+		/* Make sure to disable open drain/source hardware, if any */
+		if (gc->set_single_ended)
+			gc->set_single_ended(gc,
+					     gpio_chip_hwgpio(desc),
+					     LINE_MODE_PUSH_PULL);
+	}
 
-	chip = desc->gdev->chip;
-	if (!chip->set || !chip->direction_output) {
+set_output_value:
+	if (!gc->set || !gc->direction_output) {
 		gpiod_warn(desc,
 		       "%s: missing set() or direction_output() operations\n",
 		       __func__);
 		return -EIO;
 	}
 
-	status = chip->direction_output(chip, gpio_chip_hwgpio(desc), value);
-	if (status == 0)
+	ret = gc->direction_output(gc, gpio_chip_hwgpio(desc), value);
+	if (!ret)
 		set_bit(FLAG_IS_OUT, &desc->flags);
 	trace_gpio_value(desc_to_gpio(desc), 0, value);
-	trace_gpio_direction(desc_to_gpio(desc), 0, status);
-	return status;
+	trace_gpio_direction(desc_to_gpio(desc), 0, ret);
+	return ret;
 }
 
 /**
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index bee976f82788..50882e09289b 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -19,6 +19,18 @@ struct gpio_device;
 
 #ifdef CONFIG_GPIOLIB
 
+/**
+ * enum single_ended_mode - mode for single ended operation
+ * @LINE_MODE_PUSH_PULL: normal mode for a GPIO line, drive actively high/low
+ * @LINE_MODE_OPEN_DRAIN: set line to be open drain
+ * @LINE_MODE_OPEN_SOURCE: set line to be open source
+ */
+enum single_ended_mode {
+	LINE_MODE_PUSH_PULL,
+	LINE_MODE_OPEN_DRAIN,
+	LINE_MODE_OPEN_SOURCE,
+};
+
 /**
  * struct gpio_chip - abstract a GPIO controller
  * @label: a functional name for the GPIO device, such as a part
@@ -38,7 +50,15 @@ struct gpio_device;
  * @set: assigns output value for signal "offset"
  * @set_multiple: assigns output values for multiple signals defined by "mask"
  * @set_debounce: optional hook for setting debounce time for specified gpio in
- *      interrupt triggered gpio chips
+ *	interrupt triggered gpio chips
+ * @set_single_ended: optional hook for setting a line as open drain, open
+ *	source, or non-single ended (restore from open drain/source to normal
+ *	push-pull mode) this should be implemented if the hardware supports
+ *	open drain or open source settings. The GPIOlib will otherwise try
+ *	to emulate open drain/source by not actively driving lines high/low
+ *	if a consumer request this. The driver may return -ENOTSUPP if e.g.
+ *	it supports just open drain but not open source and is called
+ *	with LINE_MODE_OPEN_SOURCE as mode argument.
  * @to_irq: optional hook supporting non-static gpio_to_irq() mappings;
  *	implementation may not sleep
  * @dbg_show: optional routine to show contents in debugfs; default code
@@ -130,6 +150,9 @@ struct gpio_chip {
 	int			(*set_debounce)(struct gpio_chip *chip,
 						unsigned offset,
 						unsigned debounce);
+	int			(*set_single_ended)(struct gpio_chip *chip,
+						unsigned offset,
+						enum single_ended_mode mode);
 
 	int			(*to_irq)(struct gpio_chip *chip,
 						unsigned offset);
-- 
cgit v1.2.3


From 627d2d6b550094d88f9e518e15967e7bf906ebbf Mon Sep 17 00:00:00 2001
From: samanthakumar <samanthakumar@google.com>
Date: Tue, 5 Apr 2016 12:41:16 -0400
Subject: udp: enable MSG_PEEK at non-zero offset

Enable peeking at UDP datagrams at the offset specified with socket
option SOL_SOCKET/SO_PEEK_OFF. Peek at any datagram in the queue, up
to the end of the given datagram.

Implement the SO_PEEK_OFF semantics introduced in commit ef64a54f6e55
("sock: Introduce the SO_PEEK_OFF sock option"). Increase the offset
on peek, decrease it on regular reads.

When peeking, always checksum the packet immediately, to avoid
recomputation on subsequent peeks and final read.

The socket lock is not held for the duration of udp_recvmsg, so
peek and read operations can run concurrently. Only the last store
to sk_peek_off is preserved.

Signed-off-by: Sam Kumar <samanthakumar@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  7 ++++++-
 include/net/sock.h     |  2 ++
 net/core/datagram.c    |  9 ++++++---
 net/core/sock.c        |  9 +++++++++
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/udp.c         | 22 +++++++++++-----------
 net/ipv6/af_inet6.c    |  1 +
 net/ipv6/udp.c         | 22 +++++++++++-----------
 8 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 15d0df943466..007381270ff8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2949,7 +2949,12 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
+static inline void skb_free_datagram_locked(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	__skb_free_datagram_locked(sk, skb, 0);
+}
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index b75998952482..1decb7a22261 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -457,6 +457,8 @@ struct sock {
 #define SK_CAN_REUSE	1
 #define SK_FORCE_REUSE	2
 
+int sk_set_peek_off(struct sock *sk, int val);
+
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fa9dc6450b08..b7de71f8d5d3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
-void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 {
 	bool slow;
 
 	if (likely(atomic_read(&skb->users) == 1))
 		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	else if (likely(!atomic_dec_and_test(&skb->users))) {
+		sk_peek_offset_bwd(sk, len);
 		return;
+	}
 
 	slow = lock_sock_fast(sk);
+	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
 	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);
@@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	/* skb is now orphaned, can be freed outside of locked section */
 	__kfree_skb(skb);
 }
-EXPORT_SYMBOL(skb_free_datagram_locked);
+EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 /**
  *	skb_kill_datagram - Free a datagram skbuff forcibly
diff --git a/net/core/sock.c b/net/core/sock.c
index e12197b359fd..2ce76e82857f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2187,6 +2187,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	if (val < 0)
+		return -EINVAL;
+
+	sk->sk_peek_off = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
 
 /*
  * Set of default routines for initialising struct proto_ops when
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e481992dbae..a38b9910af60 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf747e86ce52..d80312ddbb8a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1294,7 +1294,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -1304,15 +1304,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -1322,16 +1323,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 
 		if (err == -EINVAL)
 			goto csum_copy_err;
@@ -1344,7 +1345,8 @@ try_again:
 			UDP_INC_STATS_USER(sock_net(sk),
 					   UDP_MIB_INERRORS, is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 
 	if (!peeked)
@@ -1368,9 +1370,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37cfd67c..2b78aad0d52f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -561,6 +561,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
+	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 84c8d7b66820..87bd7aff88b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -357,7 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, off = 0;
+	int peeked, peeking, off;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
@@ -371,15 +371,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
+	peeking = off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				  &peeked, &off, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	ulen = skb->len;
 	copied = len;
-	if (copied > ulen)
-		copied = ulen;
+	if (copied > ulen - off)
+		copied = ulen - off;
 	else if (copied < ulen)
 		msg->msg_flags |= MSG_TRUNC;
 
@@ -391,16 +392,16 @@ try_again:
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
 	}
 
 	if (checksum_valid || skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, 0, msg, copied);
+		err = skb_copy_datagram_msg(skb, off, msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
@@ -417,7 +418,8 @@ try_again:
 						    UDP_MIB_INERRORS,
 						    is_udplite);
 		}
-		goto out_free;
+		skb_free_datagram_locked(sk, skb);
+		return err;
 	}
 	if (!peeked) {
 		if (is_udp4)
@@ -465,9 +467,7 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-out_free:
-	skb_free_datagram_locked(sk, skb);
-out:
+	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
-- 
cgit v1.2.3


From f9cd476123ced488e628339becedb2cf3243a58a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 4 Apr 2016 22:44:59 +0200
Subject: dmaengine: pl08x: allocate OF slave channel data at probe time

The current OF translation of channels can never work with
any DMA client using the DMA channels directly: the only way
to get the channels initialized properly is in the
dma_async_device_register() call, where chan->dev etc is
allocated and initialized.

Allocate and initialize all possible DMA channels and
only augment a target channel with the periph_buses at
of_xlate(). Remove some const settings to make things work.

Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Tested-by: Joachim Eastwood <manabian@gmail.com>
Tested-by: Johannes Stezenbach <js@sig21.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/amba-pl08x.c   | 86 +++++++++++++++++++++++++++++++---------------
 include/linux/amba/pl08x.h |  2 +-
 2 files changed, 59 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c
index 9b42c0588550..81db1c4811ce 100644
--- a/drivers/dma/amba-pl08x.c
+++ b/drivers/dma/amba-pl08x.c
@@ -107,16 +107,20 @@ struct pl08x_driver_data;
 /**
  * struct vendor_data - vendor-specific config parameters for PL08x derivatives
  * @channels: the number of channels available in this variant
+ * @signals: the number of request signals available from the hardware
  * @dualmaster: whether this version supports dual AHB masters or not.
  * @nomadik: whether the channels have Nomadik security extension bits
  *	that need to be checked for permission before use and some registers are
  *	missing
  * @pl080s: whether this version is a PL080S, which has separate register and
  *	LLI word for transfer size.
+ * @max_transfer_size: the maximum single element transfer size for this
+ *	PL08x variant.
  */
 struct vendor_data {
 	u8 config_offset;
 	u8 channels;
+	u8 signals;
 	bool dualmaster;
 	bool nomadik;
 	bool pl080s;
@@ -235,7 +239,7 @@ struct pl08x_dma_chan {
 	struct virt_dma_chan vc;
 	struct pl08x_phy_chan *phychan;
 	const char *name;
-	const struct pl08x_channel_data *cd;
+	struct pl08x_channel_data *cd;
 	struct dma_slave_config cfg;
 	struct pl08x_txd *at;
 	struct pl08x_driver_data *host;
@@ -1909,6 +1913,12 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x,
 
 		if (slave) {
 			chan->cd = &pl08x->pd->slave_channels[i];
+			/*
+			 * Some implementations have muxed signals, whereas some
+			 * use a mux in front of the signals and need dynamic
+			 * assignment of signals.
+			 */
+			chan->signal = i;
 			pl08x_dma_slave_init(chan);
 		} else {
 			chan->cd = &pl08x->pd->memcpy_channel;
@@ -2050,40 +2060,33 @@ static struct dma_chan *pl08x_of_xlate(struct of_phandle_args *dma_spec,
 				       struct of_dma *ofdma)
 {
 	struct pl08x_driver_data *pl08x = ofdma->of_dma_data;
-	struct pl08x_channel_data *data;
-	struct pl08x_dma_chan *chan;
 	struct dma_chan *dma_chan;
+	struct pl08x_dma_chan *plchan;
 
 	if (!pl08x)
 		return NULL;
 
-	if (dma_spec->args_count != 2)
+	if (dma_spec->args_count != 2) {
+		dev_err(&pl08x->adev->dev,
+			"DMA channel translation requires two cells\n");
 		return NULL;
+	}
 
 	dma_chan = pl08x_find_chan_id(pl08x, dma_spec->args[0]);
-	if (dma_chan)
-		return dma_get_slave_channel(dma_chan);
-
-	chan = devm_kzalloc(pl08x->slave.dev, sizeof(*chan) + sizeof(*data),
-			    GFP_KERNEL);
-	if (!chan)
+	if (!dma_chan) {
+		dev_err(&pl08x->adev->dev,
+			"DMA slave channel not found\n");
 		return NULL;
+	}
 
-	data = (void *)&chan[1];
-	data->bus_id = "(none)";
-	data->periph_buses = dma_spec->args[1];
-
-	chan->cd = data;
-	chan->host = pl08x;
-	chan->slave = true;
-	chan->name = data->bus_id;
-	chan->state = PL08X_CHAN_IDLE;
-	chan->signal = dma_spec->args[0];
-	chan->vc.desc_free = pl08x_desc_free;
-
-	vchan_init(&chan->vc, &pl08x->slave);
+	plchan = to_pl08x_chan(dma_chan);
+	dev_dbg(&pl08x->adev->dev,
+		"translated channel for signal %d\n",
+		dma_spec->args[0]);
 
-	return dma_get_slave_channel(&chan->vc.chan);
+	/* Augment channel data for applicable AHB buses */
+	plchan->cd->periph_buses = dma_spec->args[1];
+	return dma_get_slave_channel(dma_chan);
 }
 
 static int pl08x_of_probe(struct amba_device *adev,
@@ -2091,9 +2094,11 @@ static int pl08x_of_probe(struct amba_device *adev,
 			  struct device_node *np)
 {
 	struct pl08x_platform_data *pd;
+	struct pl08x_channel_data *chanp = NULL;
 	u32 cctl_memcpy = 0;
 	u32 val;
 	int ret;
+	int i;
 
 	pd = devm_kzalloc(&adev->dev, sizeof(*pd), GFP_KERNEL);
 	if (!pd)
@@ -2195,6 +2200,27 @@ static int pl08x_of_probe(struct amba_device *adev,
 	/* Use the buses that can access memory, obviously */
 	pd->memcpy_channel.periph_buses = pd->mem_buses;
 
+	/*
+	 * Allocate channel data for all possible slave channels (one
+	 * for each possible signal), channels will then be allocated
+	 * for a device and have it's AHB interfaces set up at
+	 * translation time.
+	 */
+	chanp = devm_kcalloc(&adev->dev,
+			pl08x->vd->signals,
+			sizeof(struct pl08x_channel_data),
+			GFP_KERNEL);
+	if (!chanp)
+		return -ENOMEM;
+
+	pd->slave_channels = chanp;
+	for (i = 0; i < pl08x->vd->signals; i++) {
+		/* chanp->periph_buses will be assigned at translation */
+		chanp->bus_id = kasprintf(GFP_KERNEL, "slave%d", i);
+		chanp++;
+	}
+	pd->num_slave_channels = pl08x->vd->signals;
+
 	pl08x->pd = pd;
 
 	return of_dma_controller_register(adev->dev.of_node, pl08x_of_xlate,
@@ -2234,6 +2260,10 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id)
 		goto out_no_pl08x;
 	}
 
+	/* Assign useful pointers to the driver state */
+	pl08x->adev = adev;
+	pl08x->vd = vd;
+
 	/* Initialize memcpy engine */
 	dma_cap_set(DMA_MEMCPY, pl08x->memcpy.cap_mask);
 	pl08x->memcpy.dev = &adev->dev;
@@ -2284,10 +2314,6 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id)
 		}
 	}
 
-	/* Assign useful pointers to the driver state */
-	pl08x->adev = adev;
-	pl08x->vd = vd;
-
 	/* By default, AHB1 only.  If dualmaster, from platform */
 	pl08x->lli_buses = PL08X_AHB1;
 	pl08x->mem_buses = PL08X_AHB1;
@@ -2438,6 +2464,7 @@ out_no_pl08x:
 static struct vendor_data vendor_pl080 = {
 	.config_offset = PL080_CH_CONFIG,
 	.channels = 8,
+	.signals = 16,
 	.dualmaster = true,
 	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
 };
@@ -2445,6 +2472,7 @@ static struct vendor_data vendor_pl080 = {
 static struct vendor_data vendor_nomadik = {
 	.config_offset = PL080_CH_CONFIG,
 	.channels = 8,
+	.signals = 32,
 	.dualmaster = true,
 	.nomadik = true,
 	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
@@ -2453,6 +2481,7 @@ static struct vendor_data vendor_nomadik = {
 static struct vendor_data vendor_pl080s = {
 	.config_offset = PL080S_CH_CONFIG,
 	.channels = 8,
+	.signals = 32,
 	.pl080s = true,
 	.max_transfer_size = PL080S_CONTROL_TRANSFER_SIZE_MASK,
 };
@@ -2460,6 +2489,7 @@ static struct vendor_data vendor_pl080s = {
 static struct vendor_data vendor_pl081 = {
 	.config_offset = PL080_CH_CONFIG,
 	.channels = 2,
+	.signals = 16,
 	.dualmaster = false,
 	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
 };
diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h
index 10fe2a211c2e..27e9ec8778eb 100644
--- a/include/linux/amba/pl08x.h
+++ b/include/linux/amba/pl08x.h
@@ -86,7 +86,7 @@ struct pl08x_channel_data {
  * @mem_buses: buses which memory can be accessed from: PL08X_AHB1 | PL08X_AHB2
  */
 struct pl08x_platform_data {
-	const struct pl08x_channel_data *slave_channels;
+	struct pl08x_channel_data *slave_channels;
 	unsigned int num_slave_channels;
 	struct pl08x_channel_data memcpy_channel;
 	int (*get_xfer_signal)(const struct pl08x_channel_data *);
-- 
cgit v1.2.3


From fa59507f720077a856c9952a31cfd45cd97ef6f9 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 30 Mar 2016 12:56:28 +0300
Subject: usb: otg-fsm: Add documentation for struct otg_fsm

struct otg_fsm is the interface to the OTG state machine.

Document the input, output and internal state variables.
Definations are taken from Table 7-2 and Table 7-4 of
the USB OTG & EH Specification Rev.2.0

Re-arrange some of the members as per use case for more
clarity.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 include/linux/usb/otg-fsm.h | 90 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index 24198e16f849..8eec0c261be5 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -72,37 +72,113 @@ enum otg_fsm_timer {
 	NUM_OTG_FSM_TIMERS,
 };
 
-/* OTG state machine according to the OTG spec */
+/**
+ * struct otg_fsm - OTG state machine according to the OTG spec
+ *
+ * OTG hardware Inputs
+ *
+ *	Common inputs for A and B device
+ * @id:		TRUE for B-device, FALSE for A-device.
+ * @adp_change: TRUE when current ADP measurement (n) value, compared to the
+ *		ADP measurement taken at n-2, differs by more than CADP_THR
+ * @power_up:	TRUE when the OTG device first powers up its USB system and
+ *		ADP measurement taken if ADP capable
+ *
+ *	A-Device state inputs
+ * @a_srp_det:	TRUE if the A-device detects SRP
+ * @a_vbus_vld:	TRUE when VBUS voltage is in regulation
+ * @b_conn:	TRUE if the A-device detects connection from the B-device
+ * @a_bus_resume: TRUE when the B-device detects that the A-device is signaling
+ *		  a resume (K state)
+ *	B-Device state inputs
+ * @a_bus_suspend: TRUE when the B-device detects that the A-device has put the
+ *		bus into suspend
+ * @a_conn:	TRUE if the B-device detects a connection from the A-device
+ * @b_se0_srp:	TRUE when the line has been at SE0 for more than the minimum
+ *		time before generating SRP
+ * @b_ssend_srp: TRUE when the VBUS has been below VOTG_SESS_VLD for more than
+ *		 the minimum time before generating SRP
+ * @b_sess_vld:	TRUE when the B-device detects that the voltage on VBUS is
+ *		above VOTG_SESS_VLD
+ * @test_device: TRUE when the B-device switches to B-Host and detects an OTG
+ *		test device. This must be set by host/hub driver
+ *
+ *	Application inputs (A-Device)
+ * @a_bus_drop:	TRUE when A-device application needs to power down the bus
+ * @a_bus_req:	TRUE when A-device application wants to use the bus.
+ *		FALSE to suspend the bus
+ *
+ *	Application inputs (B-Device)
+ * @b_bus_req:	TRUE during the time that the Application running on the
+ *		B-device wants to use the bus
+ *
+ *	Auxilary inputs (OTG v1.3 only. Obsolete now.)
+ * @a_sess_vld:	TRUE if the A-device detects that VBUS is above VA_SESS_VLD
+ * @b_bus_suspend: TRUE when the A-device detects that the B-device has put
+ *		the bus into suspend
+ * @b_bus_resume: TRUE when the A-device detects that the B-device is signaling
+ *		 resume on the bus
+ *
+ * OTG Output status. Read only for users. Updated by OTG FSM helpers defined
+ * in this file
+ *
+ *	Outputs for Both A and B device
+ * @drv_vbus:	TRUE when A-device is driving VBUS
+ * @loc_conn:	TRUE when the local device has signaled that it is connected
+ *		to the bus
+ * @loc_sof:	TRUE when the local device is generating activity on the bus
+ * @adp_prb:	TRUE when the local device is in the process of doing
+ *		ADP probing
+ *
+ *	Outputs for B-device state
+ * @adp_sns:	TRUE when the B-device is in the process of carrying out
+ *		ADP sensing
+ * @data_pulse: TRUE when the B-device is performing data line pulsing
+ *
+ * Internal Variables
+ *
+ * a_set_b_hnp_en: TRUE when the A-device has successfully set the
+ *		b_hnp_enable bit in the B-device.
+ *		   Unused as OTG fsm uses otg->host->b_hnp_enable instead
+ * b_srp_done:	TRUE when the B-device has completed initiating SRP
+ * b_hnp_enable: TRUE when the B-device has accepted the
+ *		SetFeature(b_hnp_enable) B-device.
+ *		Unused as OTG fsm uses otg->gadget->b_hnp_enable instead
+ * a_clr_err:	Asserted (by application ?) to clear a_vbus_err due to an
+ *		overcurrent condition and causes the A-device to transition
+ *		to a_wait_vfall
+ */
 struct otg_fsm {
 	/* Input */
 	int id;
 	int adp_change;
 	int power_up;
-	int test_device;
-	int a_bus_drop;
-	int a_bus_req;
 	int a_srp_det;
 	int a_vbus_vld;
 	int b_conn;
 	int a_bus_resume;
 	int a_bus_suspend;
 	int a_conn;
-	int b_bus_req;
 	int b_se0_srp;
 	int b_ssend_srp;
 	int b_sess_vld;
+	int test_device;
+	int a_bus_drop;
+	int a_bus_req;
+	int b_bus_req;
+
 	/* Auxilary inputs */
 	int a_sess_vld;
 	int b_bus_resume;
 	int b_bus_suspend;
 
 	/* Output */
-	int data_pulse;
 	int drv_vbus;
 	int loc_conn;
 	int loc_sof;
 	int adp_prb;
 	int adp_sns;
+	int data_pulse;
 
 	/* Internal variables */
 	int a_set_b_hnp_en;
@@ -110,7 +186,7 @@ struct otg_fsm {
 	int b_hnp_enable;
 	int a_clr_err;
 
-	/* Informative variables */
+	/* Informative variables. All unused as of now */
 	int a_bus_drop_inf;
 	int a_bus_req_inf;
 	int a_clr_err_inf;
-- 
cgit v1.2.3


From 4e332df63487418ec512c3c376c07df9ab3ae035 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 30 Mar 2016 12:56:29 +0300
Subject: usb: otg-fsm: support multiple instances

Move the state_changed variable into struct otg_fsm
so that we can support multiple instances.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/common/usb-otg-fsm.c | 10 ++++------
 include/linux/usb/otg-fsm.h      |  1 +
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c
index 504708f59b93..9059b7dc185e 100644
--- a/drivers/usb/common/usb-otg-fsm.c
+++ b/drivers/usb/common/usb-otg-fsm.c
@@ -61,8 +61,6 @@ static int otg_set_protocol(struct otg_fsm *fsm, int protocol)
 	return 0;
 }
 
-static int state_changed;
-
 /* Called when leaving a state.  Do state clean up jobs here */
 static void otg_leave_state(struct otg_fsm *fsm, enum usb_otg_state old_state)
 {
@@ -208,7 +206,6 @@ static void otg_start_hnp_polling(struct otg_fsm *fsm)
 /* Called when entering a state */
 static int otg_set_state(struct otg_fsm *fsm, enum usb_otg_state new_state)
 {
-	state_changed = 1;
 	if (fsm->otg->state == new_state)
 		return 0;
 	VDBG("Set state: %s\n", usb_otg_state_string(new_state));
@@ -324,6 +321,7 @@ static int otg_set_state(struct otg_fsm *fsm, enum usb_otg_state new_state)
 	}
 
 	fsm->otg->state = new_state;
+	fsm->state_changed = 1;
 	return 0;
 }
 
@@ -335,7 +333,7 @@ int otg_statemachine(struct otg_fsm *fsm)
 	mutex_lock(&fsm->lock);
 
 	state = fsm->otg->state;
-	state_changed = 0;
+	fsm->state_changed = 0;
 	/* State machine state change judgement */
 
 	switch (state) {
@@ -448,7 +446,7 @@ int otg_statemachine(struct otg_fsm *fsm)
 	}
 	mutex_unlock(&fsm->lock);
 
-	VDBG("quit statemachine, changed = %d\n", state_changed);
-	return state_changed;
+	VDBG("quit statemachine, changed = %d\n", fsm->state_changed);
+	return fsm->state_changed;
 }
 EXPORT_SYMBOL_GPL(otg_statemachine);
diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index 8eec0c261be5..7a0350535cb1 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -210,6 +210,7 @@ struct otg_fsm {
 	struct mutex lock;
 	u8 *host_req_flag;
 	struct delayed_work hnp_polling_work;
+	bool state_changed;
 };
 
 struct otg_fsm_ops {
-- 
cgit v1.2.3


From 49ddf8e6e2347cffdcf83d1ca2d04ff929820178 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 31 Mar 2016 20:02:10 +0300
Subject: mac80211: add fast-rx path

The regular RX path has a lot of code, but with a few
assumptions on the hardware it's possible to reduce the
amount of code significantly. Currently the assumptions
on the driver are the following:
 * hardware/driver reordering buffer (if supporting aggregation)
 * hardware/driver decryption & PN checking (if using encryption)
 * hardware/driver did de-duplication
 * hardware/driver did A-MSDU deaggregation
 * AP_LINK_PS is used (in AP mode)
 * no client powersave handling in mac80211 (in client mode)

of which some are actually checked per packet:
 * de-duplication
 * PN checking
 * decryption
and additionally packets must
 * not be A-MSDU (have been deaggregated by driver/device)
 * be data packets
 * not be fragmented
 * be unicast
 * have RFC 1042 header

Additionally dynamically we assume:
 * no encryption or CCMP/GCMP, TKIP/WEP/other not allowed
 * station must be authorized
 * 4-addr format not enabled

Some data needed for the RX path is cached in a new per-station
"fast_rx" structure, so that we only need to look at this and
the packet, no other memory when processing packets on the fast
RX path.

After doing the above per-packet checks, the data path collapses
down to a pretty simple conversion function taking advantage of
the data cached in the small fast_rx struct.

This should speed up the RX processing, and will make it easier
to reason about parallelizing RX (for which statistics will need
to be per-CPU still.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  10 ++
 net/mac80211/cfg.c         |  10 +-
 net/mac80211/ieee80211_i.h |   5 +
 net/mac80211/key.c         |   1 +
 net/mac80211/mlme.c        |   9 ++
 net/mac80211/rx.c          | 351 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/sta_info.c    |   2 +
 net/mac80211/sta_info.h    |  34 +++++
 8 files changed, 419 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bf9706c5b0bd..113bfc468a4d 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -638,6 +638,16 @@ static inline bool ieee80211_is_first_frag(__le16 seq_ctrl)
 	return (seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG)) == 0;
 }
 
+/**
+ * ieee80211_is_frag - check if a frame is a fragment
+ * @hdr: 802.11 header of the frame
+ */
+static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr)
+{
+	return ieee80211_has_morefrags(hdr->frame_control) ||
+	       hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);
+}
+
 struct ieee80211s_hdr {
 	u8 flags;
 	u8 ttl;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 62a90f270f03..fc4730b938d0 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -65,11 +65,13 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
 		return ret;
 
 	if (type == NL80211_IFTYPE_AP_VLAN &&
-	    params && params->use_4addr == 0)
+	    params && params->use_4addr == 0) {
 		RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
-	else if (type == NL80211_IFTYPE_STATION &&
-		 params && params->use_4addr >= 0)
+		ieee80211_check_fast_rx_iface(sdata);
+	} else if (type == NL80211_IFTYPE_STATION &&
+		   params && params->use_4addr >= 0) {
 		sdata->u.mgd.use_4addr = params->use_4addr;
+	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
 		struct ieee80211_local *local = sdata->local;
@@ -1367,6 +1369,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
 
 			rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
 			new_4addr = true;
+			__ieee80211_check_fast_rx_iface(vlansdata);
 		}
 
 		if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -1889,6 +1892,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 			sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
 		else
 			sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+		ieee80211_check_fast_rx_iface(sdata);
 	}
 
 	if (params->ht_opmode >= 0) {
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c8945e2d8a86..6243109979ed 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1494,6 +1494,11 @@ u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
 int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
 			     u64 *cookie, gfp_t gfp);
 
+void ieee80211_check_fast_rx(struct sta_info *sta);
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
+void ieee80211_clear_fast_rx(struct sta_info *sta);
+
 /* STA code */
 void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
 int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 3df7b0392d30..edd6f2945f69 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -338,6 +338,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 		} else {
 			rcu_assign_pointer(sta->gtk[idx], new);
 		}
+		ieee80211_check_fast_rx(sta);
 	} else {
 		defunikey = old &&
 			old == key_mtx_dereference(sdata->local,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2112df4ffb7b..d3c75ac8a029 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2217,6 +2217,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 	const u8 *ssid;
 	u8 *dst = ifmgd->associated->bssid;
 	u8 unicast_limit = max(1, max_probe_tries - 3);
+	struct sta_info *sta;
 
 	/*
 	 * Try sending broadcast probe requests for the last three
@@ -2235,6 +2236,14 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 	 */
 	ifmgd->probe_send_count++;
 
+	if (dst) {
+		mutex_lock(&sdata->local->sta_mtx);
+		sta = sta_info_get(sdata, dst);
+		if (!WARN_ON(!sta))
+			ieee80211_check_fast_rx(sta);
+		mutex_unlock(&sdata->local->sta_mtx);
+	}
+
 	if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
 		ifmgd->nullfunc_failed = false;
 		ieee80211_send_nullfunc(sdata->local, sdata, false);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 2863832b0db4..96f8bbf21649 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3508,6 +3508,342 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
 	return false;
 }
 
+void ieee80211_check_fast_rx(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_key *key;
+	struct ieee80211_fast_rx fastrx = {
+		.dev = sdata->dev,
+		.vif_type = sdata->vif.type,
+		.control_port_protocol = sdata->control_port_protocol,
+	}, *old, *new = NULL;
+	bool assign = false;
+
+	/* use sparse to check that we don't return without updating */
+	__acquire(check_fast_rx);
+
+	BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != sizeof(rfc1042_header));
+	BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != ETH_ALEN);
+	ether_addr_copy(fastrx.rfc1042_hdr, rfc1042_header);
+	ether_addr_copy(fastrx.vif_addr, sdata->vif.addr);
+
+	/* fast-rx doesn't do reordering */
+	if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) &&
+	    !ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER))
+		goto clear;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		/* 4-addr is harder to deal with, later maybe */
+		if (sdata->u.mgd.use_4addr)
+			goto clear;
+		/* software powersave is a huge mess, avoid all of it */
+		if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
+			goto clear;
+		if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
+		    !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
+			goto clear;
+		if (sta->sta.tdls) {
+			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+			fastrx.expected_ds_bits = 0;
+		} else {
+			fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0;
+			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
+			fastrx.expected_ds_bits =
+				cpu_to_le16(IEEE80211_FCTL_FROMDS);
+		}
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_AP:
+		/* parallel-rx requires this, at least with calls to
+		 * ieee80211_sta_ps_transition()
+		 */
+		if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
+			goto clear;
+		fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
+		fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+		fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_TODS);
+
+		fastrx.internal_forward =
+			!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
+			(sdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
+			 !sdata->u.vlan.sta);
+		break;
+	default:
+		goto clear;
+	}
+
+	if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+		goto clear;
+
+	rcu_read_lock();
+	key = rcu_dereference(sta->ptk[sta->ptk_idx]);
+	if (key) {
+		switch (key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_TKIP:
+			/* we don't want to deal with MMIC in fast-rx */
+			goto clear_rcu;
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP_256:
+		case WLAN_CIPHER_SUITE_GCMP:
+		case WLAN_CIPHER_SUITE_GCMP_256:
+			break;
+		default:
+			/* we also don't want to deal with WEP or cipher scheme
+			 * since those require looking up the key idx in the
+			 * frame, rather than assuming the PTK is used
+			 * (we need to revisit this once we implement the real
+			 * PTK index, which is now valid in the spec, but we
+			 * haven't implemented that part yet)
+			 */
+			goto clear_rcu;
+		}
+
+		fastrx.key = true;
+		fastrx.icv_len = key->conf.icv_len;
+	}
+
+	assign = true;
+ clear_rcu:
+	rcu_read_unlock();
+ clear:
+	__release(check_fast_rx);
+
+	if (assign)
+		new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL);
+
+	spin_lock_bh(&sta->lock);
+	old = rcu_dereference_protected(sta->fast_rx, true);
+	rcu_assign_pointer(sta->fast_rx, new);
+	spin_unlock_bh(&sta->lock);
+
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+void ieee80211_clear_fast_rx(struct sta_info *sta)
+{
+	struct ieee80211_fast_rx *old;
+
+	spin_lock_bh(&sta->lock);
+	old = rcu_dereference_protected(sta->fast_rx, true);
+	RCU_INIT_POINTER(sta->fast_rx, NULL);
+	spin_unlock_bh(&sta->lock);
+
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	lockdep_assert_held(&local->sta_mtx);
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sdata != sta->sdata &&
+		    (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
+			continue;
+		ieee80211_check_fast_rx(sta);
+	}
+}
+
+void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	mutex_lock(&local->sta_mtx);
+	__ieee80211_check_fast_rx_iface(sdata);
+	mutex_unlock(&local->sta_mtx);
+}
+
+static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
+				     struct ieee80211_fast_rx *fast_rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct sta_info *sta = rx->sta;
+	int orig_len = skb->len;
+	int snap_offs = ieee80211_hdrlen(hdr->frame_control);
+	struct {
+		u8 snap[sizeof(rfc1042_header)];
+		__be16 proto;
+	} *payload __aligned(2);
+	struct {
+		u8 da[ETH_ALEN];
+		u8 sa[ETH_ALEN];
+	} addrs __aligned(2);
+
+	/* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write
+	 * to a common data structure; drivers can implement that per queue
+	 * but we don't have that information in mac80211
+	 */
+	if (!(status->flag & RX_FLAG_DUP_VALIDATED))
+		return false;
+
+#define FAST_RX_CRYPT_FLAGS	(RX_FLAG_PN_VALIDATED | RX_FLAG_DECRYPTED)
+
+	/* If using encryption, we also need to have:
+	 *  - PN_VALIDATED: similar, but the implementation is tricky
+	 *  - DECRYPTED: necessary for PN_VALIDATED
+	 */
+	if (fast_rx->key &&
+	    (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS)
+		return false;
+
+	/* we don't deal with A-MSDU deaggregation here */
+	if (status->rx_flags & IEEE80211_RX_AMSDU)
+		return false;
+
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+		return false;
+
+	if (unlikely(ieee80211_is_frag(hdr)))
+		return false;
+
+	/* Since our interface address cannot be multicast, this
+	 * implicitly also rejects multicast frames without the
+	 * explicit check.
+	 *
+	 * We shouldn't get any *data* frames not addressed to us
+	 * (AP mode will accept multicast *management* frames), but
+	 * punting here will make it go through the full checks in
+	 * ieee80211_accept_frame().
+	 */
+	if (!ether_addr_equal(fast_rx->vif_addr, hdr->addr1))
+		return false;
+
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS |
+					      IEEE80211_FCTL_TODS)) !=
+	    fast_rx->expected_ds_bits)
+		goto drop;
+
+	/* assign the key to drop unencrypted frames (later)
+	 * and strip the IV/MIC if necessary
+	 */
+	if (fast_rx->key && !(status->flag & RX_FLAG_IV_STRIPPED)) {
+		/* GCMP header length is the same */
+		snap_offs += IEEE80211_CCMP_HDR_LEN;
+	}
+
+	if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
+		goto drop;
+	payload = (void *)(skb->data + snap_offs);
+
+	if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
+		return false;
+
+	/* Don't handle these here since they require special code.
+	 * Accept AARP and IPX even though they should come with a
+	 * bridge-tunnel header - but if we get them this way then
+	 * there's little point in discarding them.
+	 */
+	if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
+		     payload->proto == fast_rx->control_port_protocol))
+		return false;
+
+	/* after this point, don't punt to the slowpath! */
+
+	if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) &&
+	    pskb_trim(skb, skb->len - fast_rx->icv_len))
+		goto drop;
+
+	if (unlikely(fast_rx->sta_notify)) {
+		ieee80211_sta_rx_notify(rx->sdata, hdr);
+		fast_rx->sta_notify = false;
+	}
+
+	/* statistics part of ieee80211_rx_h_sta_process() */
+	sta->rx_stats.last_rx = jiffies;
+	sta->rx_stats.last_rate = sta_stats_encode_rate(status);
+
+	sta->rx_stats.fragments++;
+
+	if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
+		sta->rx_stats.last_signal = status->signal;
+		ewma_signal_add(&sta->rx_stats_avg.signal, -status->signal);
+	}
+
+	if (status->chains) {
+		int i;
+
+		sta->rx_stats.chains = status->chains;
+		for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
+			int signal = status->chain_signal[i];
+
+			if (!(status->chains & BIT(i)))
+				continue;
+
+			sta->rx_stats.chain_signal_last[i] = signal;
+			ewma_signal_add(&sta->rx_stats_avg.chain_signal[i],
+					-signal);
+		}
+	}
+	/* end of statistics */
+
+	if (rx->key && !ieee80211_has_protected(hdr->frame_control))
+		goto drop;
+
+	/* do the header conversion - first grab the addresses */
+	ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
+	ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
+	/* remove the SNAP but leave the ethertype */
+	skb_pull(skb, snap_offs + sizeof(rfc1042_header));
+	/* push the addresses in front */
+	memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs));
+
+	skb->dev = fast_rx->dev;
+
+	ieee80211_rx_stats(fast_rx->dev, skb->len);
+
+	/* The seqno index has the same property as needed
+	 * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
+	 * for non-QoS-data frames. Here we know it's a data
+	 * frame, so count MSDUs.
+	 */
+	u64_stats_update_begin(&sta->rx_stats.syncp);
+	sta->rx_stats.msdu[rx->seqno_idx]++;
+	sta->rx_stats.bytes += orig_len;
+	u64_stats_update_end(&sta->rx_stats.syncp);
+
+	if (fast_rx->internal_forward) {
+		struct sta_info *dsta = sta_info_get(rx->sdata, skb->data);
+
+		if (dsta) {
+			/*
+			 * Send to wireless media and increase priority by 256
+			 * to keep the received priority instead of
+			 * reclassifying the frame (see cfg80211_classify8021d).
+			 */
+			skb->priority += 256;
+			skb->protocol = htons(ETH_P_802_3);
+			skb_reset_network_header(skb);
+			skb_reset_mac_header(skb);
+			dev_queue_xmit(skb);
+			return true;
+		}
+	}
+
+	/* deliver to local stack */
+	skb->protocol = eth_type_trans(skb, fast_rx->dev);
+	memset(skb->cb, 0, sizeof(skb->cb));
+	if (rx->napi)
+		napi_gro_receive(rx->napi, skb);
+	else
+		netif_receive_skb(skb);
+
+	return true;
+ drop:
+	dev_kfree_skb(skb);
+	sta->rx_stats.dropped++;
+	return true;
+}
+
 /*
  * This function returns whether or not the SKB
  * was destined for RX processing or not, which,
@@ -3522,6 +3858,21 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
 
 	rx->skb = skb;
 
+	/* See if we can do fast-rx; if we have to copy we already lost,
+	 * so punt in that case. We should never have to deliver a data
+	 * frame to multiple interfaces anyway.
+	 *
+	 * We skip the ieee80211_accept_frame() call and do the necessary
+	 * checking inside ieee80211_invoke_fast_rx().
+	 */
+	if (consume && rx->sta) {
+		struct ieee80211_fast_rx *fast_rx;
+
+		fast_rx = rcu_dereference(rx->sta->fast_rx);
+		if (fast_rx && ieee80211_invoke_fast_rx(rx, fast_rx))
+			return true;
+	}
+
 	if (!ieee80211_accept_frame(rx))
 		return false;
 
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index bdd303e8b577..a0ce7e40f420 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1874,6 +1874,7 @@ int sta_info_move_state(struct sta_info *sta,
 				atomic_dec(&sta->sdata->bss->num_mcast_sta);
 			clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
 			ieee80211_clear_fast_xmit(sta);
+			ieee80211_clear_fast_rx(sta);
 		}
 		break;
 	case IEEE80211_STA_AUTHORIZED:
@@ -1884,6 +1885,7 @@ int sta_info_move_state(struct sta_info *sta,
 				atomic_inc(&sta->sdata->bss->num_mcast_sta);
 			set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
 			ieee80211_check_fast_xmit(sta);
+			ieee80211_check_fast_rx(sta);
 		}
 		break;
 	default:
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 7c23b575672e..a0a06609338d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -285,6 +285,38 @@ struct ieee80211_fast_tx {
 	struct rcu_head rcu_head;
 };
 
+/**
+ * struct ieee80211_fast_rx - RX fastpath information
+ * @dev: netdevice for reporting the SKB
+ * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type)
+ * @vif_addr: interface address
+ * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache)
+ * @control_port_protocol: control port protocol copied from sdata
+ * @expected_ds_bits: from/to DS bits expected
+ * @icv_len: length of the MIC if present
+ * @key: bool indicating encryption is expected (key is set)
+ * @sta_notify: notify the MLME code (once)
+ * @internal_forward: forward froms internally on AP/VLAN type interfaces
+ * @da_offs: offset of the DA in the header (for header conversion)
+ * @sa_offs: offset of the SA in the header (for header conversion)
+ * @rcu_head: RCU head for freeing this structure
+ */
+struct ieee80211_fast_rx {
+	struct net_device *dev;
+	enum nl80211_iftype vif_type;
+	u8 vif_addr[ETH_ALEN] __aligned(2);
+	u8 rfc1042_hdr[6] __aligned(2);
+	__be16 control_port_protocol;
+	__le16 expected_ds_bits;
+	u8 icv_len;
+	u8 key:1,
+	   sta_notify:1,
+	   internal_forward:1;
+	u8 da_offs, sa_offs;
+
+	struct rcu_head rcu_head;
+};
+
 /**
  * struct mesh_sta - mesh STA information
  * @plink_lock: serialize access to plink fields
@@ -391,6 +423,7 @@ DECLARE_EWMA(signal, 1024, 8)
  * @cipher_scheme: optional cipher scheme for this station
  * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
  * @fast_tx: TX fastpath information
+ * @fast_rx: RX fastpath information
  * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
  *	the BSS one.
  * @tx_stats: TX statistics
@@ -414,6 +447,7 @@ struct sta_info {
 	spinlock_t lock;
 
 	struct ieee80211_fast_tx __rcu *fast_tx;
+	struct ieee80211_fast_rx __rcu *fast_rx;
 
 #ifdef CONFIG_MAC80211_MESH
 	struct mesh_sta *mesh;
-- 
cgit v1.2.3


From 6e0456b5454561c4e9fa9e8a4acea405e6d56c80 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Thu, 3 Mar 2016 22:59:00 +0100
Subject: mac80211: add A-MSDU tx support

Requires software tx queueing and fast-xmit support. For good
performance, drivers need frag_list support as well. This avoids the
need for copying data of aggregated frames. Running without it is only
supported for debugging purposes.

To avoid performance and packet size issues, the rate control module or
driver needs to limit the maximum A-MSDU size by setting
max_rc_amsdu_len in struct ieee80211_sta.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
[fix locking issue]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |   3 +
 include/net/mac80211.h     |  19 ++++++
 net/mac80211/agg-tx.c      |   5 ++
 net/mac80211/debugfs.c     |   2 +
 net/mac80211/ieee80211_i.h |   1 +
 net/mac80211/sta_info.c    |   2 +
 net/mac80211/tx.c          | 156 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 188 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 113bfc468a4d..b118744d3382 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -164,6 +164,9 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
+/* Maximal size of an A-MSDU that can be transported in a HT BA session */
+#define IEEE80211_MAX_MPDU_LEN_HT_BA		4095
+
 /* Maximal size of an A-MSDU */
 #define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
 #define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5f4b4c773a92..a3ee76559791 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -713,6 +713,7 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -720,6 +721,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
+	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 };
 
 /*
@@ -1746,6 +1748,7 @@ struct ieee80211_sta_rates {
  *	Both additional HT limits must be enforced by the low level driver.
  *	This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2).
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
+ * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1767,6 +1770,7 @@ struct ieee80211_sta {
 	u8 max_amsdu_subframes;
 	u16 max_amsdu_len;
 	bool support_p2p_ps;
+	u16 max_rc_amsdu_len;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
@@ -1983,6 +1987,15 @@ struct ieee80211_txq {
  * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX,
  *	which implies using per-CPU station statistics.
  *
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ *	A-MSDU frames. Requires software tx queueing and fast-xmit support.
+ *	When not using minstrel/minstrel_ht rate control, the driver must
+ *	limit the maximum A-MSDU size based on the current tx rate by setting
+ *	max_rc_amsdu_len in struct ieee80211_sta.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ *	skbs, needed for zero-copy software A-MSDU.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2021,6 +2034,8 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
 	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
 	IEEE80211_HW_USES_RSS,
+	IEEE80211_HW_TX_AMSDU,
+	IEEE80211_HW_TX_FRAG_LIST,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -2093,6 +2108,9 @@ enum ieee80211_hw_flags {
  *	size is smaller (an example is LinkSys WRT120N with FW v1.0.07
  *	build 002 Jun 18 2012).
  *
+ * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum
+ *	of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list.
+ *
  * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
  *	(if %IEEE80211_HW_QUEUE_CONTROL is set)
  *
@@ -2147,6 +2165,7 @@ struct ieee80211_hw {
 	u8 max_rate_tries;
 	u8 max_rx_aggregation_subframes;
 	u8 max_tx_aggregation_subframes;
+	u8 max_tx_fragments;
 	u8 offchannel_tx_hw_queue;
 	u8 radiotap_mcs_details;
 	u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f243a2..42fa81031dfa 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 				  size_t len)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_txq *txq;
 	u16 capab, tid;
 	u8 buf_size;
 	bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
 	buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
 
+	txq = sta->sta.txq[tid];
+	if (!amsdu && txq)
+		set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 52ed2afc408d..b251b2f7f8dd 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -128,6 +128,8 @@ static const char *hw_flag_names[] = {
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
 	FLAG(SUPPORTS_REORDERING_BUFFER),
 	FLAG(USES_RSS),
+	FLAG(TX_AMSDU),
+	FLAG(TX_FRAG_LIST),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6243109979ed..40c1d343992c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -802,6 +802,7 @@ struct mac80211_qos_map {
 enum txq_info_flags {
 	IEEE80211_TXQ_STOP,
 	IEEE80211_TXQ_AMPDU,
+	IEEE80211_TXQ_NO_AMSDU,
 };
 
 struct txq_info {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cf2aca0cc200..960e13d8ed30 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -416,6 +416,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
+
 	sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
 
 	return sta;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 597c8fe672a3..4fa2842ddb25 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 out:
 	spin_unlock_bh(&txqi->queue.lock);
 
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
 	return skb;
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2802,6 +2806,154 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
 		kfree_rcu(fast_tx, rcu_head);
 }
 
+static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
+					struct sk_buff *skb, int headroom,
+					int *subframe_len)
+{
+	int amsdu_len = *subframe_len + sizeof(struct ethhdr);
+	int padding = (4 - amsdu_len) & 3;
+
+	if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) {
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+
+		if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) {
+			wiphy_debug(local->hw.wiphy,
+				    "failed to reallocate TX buffer\n");
+			return false;
+		}
+	}
+
+	if (padding) {
+		*subframe_len += padding;
+		memset(skb_put(skb, padding), 0, padding);
+	}
+
+	return true;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_fast_tx *fast_tx,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr;
+	struct ethhdr amsdu_hdr;
+	int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+	int subframe_len = skb->len - hdr_len;
+	void *data;
+	u8 *qc;
+
+	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+		return false;
+
+	if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+		return true;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+					 &subframe_len))
+		return false;
+
+	amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+	memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+	memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+	data = skb_push(skb, sizeof(amsdu_hdr));
+	memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+	memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+	hdr = data;
+	qc = ieee80211_get_qos_ctl(hdr);
+	*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+	info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+	return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+				      struct sta_info *sta,
+				      struct ieee80211_fast_tx *fast_tx,
+				      struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+	struct ieee80211_txq *txq = sta->sta.txq[tid];
+	struct txq_info *txqi;
+	struct sk_buff **frag_tail, *head;
+	int subframe_len = skb->len - ETH_ALEN;
+	u8 max_subframes = sta->sta.max_amsdu_subframes;
+	int max_frags = local->hw.max_tx_fragments;
+	int max_amsdu_len = sta->sta.max_amsdu_len;
+	__be16 len;
+	void *data;
+	bool ret = false;
+	int n = 1, nfrags;
+
+	if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+		return false;
+
+	if (!txq)
+		return false;
+
+	txqi = to_txq_info(txq);
+	if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+		return false;
+
+	if (sta->sta.max_rc_amsdu_len)
+		max_amsdu_len = min_t(int, max_amsdu_len,
+				      sta->sta.max_rc_amsdu_len);
+
+	spin_lock_bh(&txqi->queue.lock);
+
+	head = skb_peek_tail(&txqi->queue);
+	if (!head)
+		goto out;
+
+	if (skb->len + head->len > max_amsdu_len)
+		goto out;
+
+	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+		goto out;
+
+	nfrags = 1 + skb_shinfo(skb)->nr_frags;
+	nfrags += 1 + skb_shinfo(head)->nr_frags;
+	frag_tail = &skb_shinfo(head)->frag_list;
+	while (*frag_tail) {
+		nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+		frag_tail = &(*frag_tail)->next;
+		n++;
+	}
+
+	if (max_subframes && n > max_subframes)
+		goto out;
+
+	if (max_frags && nfrags > max_frags)
+		goto out;
+
+	if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2,
+					 &subframe_len))
+		goto out;
+
+	ret = true;
+	data = skb_push(skb, ETH_ALEN + 2);
+	memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+	data += 2 * ETH_ALEN;
+	len = cpu_to_be16(subframe_len);
+	memcpy(data, &len, 2);
+	memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));
+
+	head->len += skb->len;
+	head->data_len += skb->len;
+	*frag_tail = skb;
+
+out:
+	spin_unlock_bh(&txqi->queue.lock);
+
+	return ret;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 				struct net_device *dev, struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
@@ -2856,6 +3008,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_tx_stats(dev, skb->len + extra_head);
 
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+		return true;
+
 	/* will not be crypto-handled beyond what we do here, so use false
 	 * as the may-encrypt argument for the resize to not account for
 	 * more room than we already have in 'extra_head'
-- 
cgit v1.2.3


From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: KEYS: Generalise system_verify_data() to provide access to internal
 content

Generalise system_verify_data() to provide access to internal content
through a callback.  This allows all the PKCS#7 stuff to be hidden inside
this function and removed from the PE file parser and the PKCS#7 test key.

If external content is not required, NULL should be passed as data to the
function.  If the callback is not required, that can be set to NULL.

The function is now called verify_pkcs7_signature() to contrast with
verify_pefile_signature() and the definitions of both have been moved into
linux/verification.h along with the key_being_used_for enum.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 arch/x86/kernel/kexec-bzimage64.c       | 18 ++-------
 certs/system_keyring.c                  | 45 ++++++++++++++++-----
 crypto/asymmetric_keys/Kconfig          |  4 +-
 crypto/asymmetric_keys/mscode_parser.c  | 21 ++++------
 crypto/asymmetric_keys/pkcs7_key_type.c | 72 +++++++++++++--------------------
 crypto/asymmetric_keys/pkcs7_parser.c   | 21 +++++-----
 crypto/asymmetric_keys/verify_pefile.c  | 40 +++++-------------
 crypto/asymmetric_keys/verify_pefile.h  |  5 +--
 include/crypto/pkcs7.h                  |  3 +-
 include/crypto/public_key.h             | 14 -------
 include/keys/asymmetric-type.h          |  1 +
 include/keys/system_keyring.h           |  7 +---
 include/linux/verification.h            | 50 +++++++++++++++++++++++
 include/linux/verify_pefile.h           | 22 ----------
 kernel/module_signing.c                 |  5 ++-
 15 files changed, 155 insertions(+), 173 deletions(-)
 create mode 100644 include/linux/verification.h
 delete mode 100644 include/linux/verify_pefile.h

(limited to 'include/linux')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 2af478e3fd4e..f2356bda2b05 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -19,8 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/efi.h>
-#include <linux/verify_pefile.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
 
 #include <asm/bootparam.h>
 #include <asm/setup.h>
@@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data)
 #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
 static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 {
-	bool trusted;
-	int ret;
-
-	ret = verify_pefile_signature(kernel, kernel_len,
-				      system_trusted_keyring,
-				      VERIFYING_KEXEC_PE_SIGNATURE,
-				      &trusted);
-	if (ret < 0)
-		return ret;
-	if (!trusted)
-		return -EKEYREJECTED;
-	return 0;
+	return verify_pefile_signature(kernel, kernel_len,
+				       NULL,
+				       VERIFYING_KEXEC_PE_SIGNATURE);
 }
 #endif
 
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index f4180326c2e1..a83bffedc0aa 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -108,16 +108,25 @@ late_initcall(load_system_certificate_list);
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 
 /**
- * Verify a PKCS#7-based signature on system data.
- * @data: The data to be verified.
+ * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data.
+ * @data: The data to be verified (NULL if expecting internal data).
  * @len: Size of @data.
  * @raw_pkcs7: The PKCS#7 message that is the signature.
  * @pkcs7_len: The size of @raw_pkcs7.
+ * @trusted_keys: Trusted keys to use (NULL for system_trusted_keyring).
  * @usage: The use to which the key is being put.
+ * @view_content: Callback to gain access to content.
+ * @ctx: Context for callback.
  */
-int system_verify_data(const void *data, unsigned long len,
-		       const void *raw_pkcs7, size_t pkcs7_len,
-		       enum key_being_used_for usage)
+int verify_pkcs7_signature(const void *data, size_t len,
+			   const void *raw_pkcs7, size_t pkcs7_len,
+			   struct key *trusted_keys,
+			   int untrusted_error,
+			   enum key_being_used_for usage,
+			   int (*view_content)(void *ctx,
+					       const void *data, size_t len,
+					       size_t asn1hdrlen),
+			   void *ctx)
 {
 	struct pkcs7_message *pkcs7;
 	bool trusted;
@@ -128,7 +137,7 @@ int system_verify_data(const void *data, unsigned long len,
 		return PTR_ERR(pkcs7);
 
 	/* The data should be detached - so we need to supply it. */
-	if (pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
+	if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
 		pr_err("PKCS#7 signature with non-detached data\n");
 		ret = -EBADMSG;
 		goto error;
@@ -138,13 +147,29 @@ int system_verify_data(const void *data, unsigned long len,
 	if (ret < 0)
 		goto error;
 
-	ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted);
+	if (!trusted_keys)
+		trusted_keys = system_trusted_keyring;
+	ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted);
 	if (ret < 0)
 		goto error;
 
-	if (!trusted) {
+	if (!trusted && untrusted_error) {
 		pr_err("PKCS#7 signature not signed with a trusted key\n");
-		ret = -ENOKEY;
+		ret = untrusted_error;
+		goto error;
+	}
+
+	if (view_content) {
+		size_t asn1hdrlen;
+
+		ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen);
+		if (ret < 0) {
+			if (ret == -ENODATA)
+				pr_devel("PKCS#7 message does not contain data\n");
+			goto error;
+		}
+
+		ret = view_content(ctx, data, len, asn1hdrlen);
 	}
 
 error:
@@ -152,6 +177,6 @@ error:
 	pr_devel("<==%s() = %d\n", __func__, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(system_verify_data);
+EXPORT_SYMBOL_GPL(verify_pkcs7_signature);
 
 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig
index 91a7e047a765..f7d2ef9789d8 100644
--- a/crypto/asymmetric_keys/Kconfig
+++ b/crypto/asymmetric_keys/Kconfig
@@ -40,8 +40,7 @@ config PKCS7_MESSAGE_PARSER
 
 config PKCS7_TEST_KEY
 	tristate "PKCS#7 testing key type"
-	depends on PKCS7_MESSAGE_PARSER
-	select SYSTEM_TRUSTED_KEYRING
+	depends on SYSTEM_DATA_VERIFICATION
 	help
 	  This option provides a type of key that can be loaded up from a
 	  PKCS#7 message - provided the message is signed by a trusted key.  If
@@ -54,6 +53,7 @@ config PKCS7_TEST_KEY
 config SIGNED_PE_FILE_VERIFICATION
 	bool "Support for PE file signature verification"
 	depends on PKCS7_MESSAGE_PARSER=y
+	depends on SYSTEM_DATA_VERIFICATION
 	select ASN1
 	select OID_REGISTRY
 	help
diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 3242cbfaeaa2..6a76d5c70ef6 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -21,19 +21,13 @@
 /*
  * Parse a Microsoft Individual Code Signing blob
  */
-int mscode_parse(struct pefile_context *ctx)
+int mscode_parse(void *_ctx, const void *content_data, size_t data_len,
+		 size_t asn1hdrlen)
 {
-	const void *content_data;
-	size_t data_len;
-	int ret;
-
-	ret = pkcs7_get_content_data(ctx->pkcs7, &content_data, &data_len, 1);
-
-	if (ret) {
-		pr_debug("PKCS#7 message does not contain data\n");
-		return ret;
-	}
+	struct pefile_context *ctx = _ctx;
 
+	content_data -= asn1hdrlen;
+	data_len += asn1hdrlen;
 	pr_devel("Data: %zu [%*ph]\n", data_len, (unsigned)(data_len),
 		 content_data);
 
@@ -129,7 +123,6 @@ int mscode_note_digest(void *context, size_t hdrlen,
 {
 	struct pefile_context *ctx = context;
 
-	ctx->digest = value;
-	ctx->digest_len = vlen;
-	return 0;
+	ctx->digest = kmemdup(value, vlen, GFP_KERNEL);
+	return ctx->digest ? 0 : -ENOMEM;
 }
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index e2d0edbbc71a..ab9bf5363ecd 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -13,12 +13,9 @@
 #include <linux/key.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/verification.h>
 #include <linux/key-type.h>
-#include <keys/asymmetric-type.h>
-#include <crypto/pkcs7.h>
 #include <keys/user-type.h>
-#include <keys/system_keyring.h>
-#include "pkcs7_parser.h"
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("PKCS#7 testing key type");
@@ -29,59 +26,46 @@ MODULE_PARM_DESC(pkcs7_usage,
 		 "Usage to specify when verifying the PKCS#7 message");
 
 /*
- * Preparse a PKCS#7 wrapped and validated data blob.
+ * Retrieve the PKCS#7 message content.
  */
-static int pkcs7_preparse(struct key_preparsed_payload *prep)
+static int pkcs7_view_content(void *ctx, const void *data, size_t len,
+			      size_t asn1hdrlen)
 {
-	enum key_being_used_for usage = pkcs7_usage;
-	struct pkcs7_message *pkcs7;
-	const void *data, *saved_prep_data;
-	size_t datalen, saved_prep_datalen;
-	bool trusted;
+	struct key_preparsed_payload *prep = ctx;
+	const void *saved_prep_data;
+	size_t saved_prep_datalen;
 	int ret;
 
-	kenter("");
-
-	if (usage >= NR__KEY_BEING_USED_FOR) {
-		pr_err("Invalid usage type %d\n", usage);
-		return -EINVAL;
-	}
-
 	saved_prep_data = prep->data;
 	saved_prep_datalen = prep->datalen;
-	pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen);
-	if (IS_ERR(pkcs7)) {
-		ret = PTR_ERR(pkcs7);
-		goto error;
-	}
-
-	ret = pkcs7_verify(pkcs7, usage);
-	if (ret < 0)
-		goto error_free;
-
-	ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted);
-	if (ret < 0)
-		goto error_free;
-	if (!trusted)
-		pr_warn("PKCS#7 message doesn't chain back to a trusted key\n");
-
-	ret = pkcs7_get_content_data(pkcs7, &data, &datalen, false);
-	if (ret < 0)
-		goto error_free;
-
 	prep->data = data;
-	prep->datalen = datalen;
+	prep->datalen = len;
+
 	ret = user_preparse(prep);
+
 	prep->data = saved_prep_data;
 	prep->datalen = saved_prep_datalen;
-
-error_free:
-	pkcs7_free_message(pkcs7);
-error:
-	kleave(" = %d", ret);
 	return ret;
 }
 
+/*
+ * Preparse a PKCS#7 wrapped and validated data blob.
+ */
+static int pkcs7_preparse(struct key_preparsed_payload *prep)
+{
+	enum key_being_used_for usage = pkcs7_usage;
+
+	if (usage >= NR__KEY_BEING_USED_FOR) {
+		pr_err("Invalid usage type %d\n", usage);
+		return -EINVAL;
+	}
+
+	return verify_pkcs7_signature(NULL, 0,
+				      prep->data, prep->datalen,
+				      NULL, -ENOKEY, usage,
+				      pkcs7_view_content, prep);
+}
+
 /*
  * user defined keys take an arbitrary string as the description and an
  * arbitrary blob of data as the payload
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index 835701613125..af4cd8649117 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -168,24 +168,25 @@ EXPORT_SYMBOL_GPL(pkcs7_parse_message);
  * @pkcs7: The preparsed PKCS#7 message to access
  * @_data: Place to return a pointer to the data
  * @_data_len: Place to return the data length
- * @want_wrapper: True if the ASN.1 object header should be included in the data
+ * @_headerlen: Size of ASN.1 header not included in _data
  *
- * Get access to the data content of the PKCS#7 message, including, optionally,
- * the header of the ASN.1 object that contains it.  Returns -ENODATA if the
- * data object was missing from the message.
+ * Get access to the data content of the PKCS#7 message.  The size of the
+ * header of the ASN.1 object that contains it is also provided and can be used
+ * to adjust *_data and *_data_len to get the entire object.
+ *
+ * Returns -ENODATA if the data object was missing from the message.
  */
 int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
 			   const void **_data, size_t *_data_len,
-			   bool want_wrapper)
+			   size_t *_headerlen)
 {
-	size_t wrapper;
-
 	if (!pkcs7->data)
 		return -ENODATA;
 
-	wrapper = want_wrapper ? pkcs7->data_hdrlen : 0;
-	*_data = pkcs7->data - wrapper;
-	*_data_len = pkcs7->data_len + wrapper;
+	*_data = pkcs7->data;
+	*_data_len = pkcs7->data_len;
+	if (_headerlen)
+		*_headerlen = pkcs7->data_hdrlen;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pkcs7_get_content_data);
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 7e8c2338ae25..265351075b0e 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -16,7 +16,7 @@
 #include <linux/err.h>
 #include <linux/pe.h>
 #include <linux/asn1.h>
-#include <crypto/pkcs7.h>
+#include <linux/verification.h>
 #include <crypto/hash.h>
 #include "verify_pefile.h"
 
@@ -392,9 +392,8 @@ error_no_desc:
  * verify_pefile_signature - Verify the signature on a PE binary image
  * @pebuf: Buffer containing the PE binary image
  * @pelen: Length of the binary image
- * @trust_keyring: Signing certificates to use as starting points
+ * @trust_keys: Signing certificate(s) to use as starting points
  * @usage: The use to which the key is being put.
- * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message inside the PE
  * binary image intersects keys we already know and trust.
@@ -418,14 +417,10 @@ error_no_desc:
  * May also return -ENOMEM.
  */
 int verify_pefile_signature(const void *pebuf, unsigned pelen,
-			    struct key *trusted_keyring,
-			    enum key_being_used_for usage,
-			    bool *_trusted)
+			    struct key *trusted_keys,
+			    enum key_being_used_for usage)
 {
-	struct pkcs7_message *pkcs7;
 	struct pefile_context ctx;
-	const void *data;
-	size_t datalen;
 	int ret;
 
 	kenter("");
@@ -439,19 +434,10 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	if (ret < 0)
 		return ret;
 
-	pkcs7 = pkcs7_parse_message(pebuf + ctx.sig_offset, ctx.sig_len);
-	if (IS_ERR(pkcs7))
-		return PTR_ERR(pkcs7);
-	ctx.pkcs7 = pkcs7;
-
-	ret = pkcs7_get_content_data(ctx.pkcs7, &data, &datalen, false);
-	if (ret < 0 || datalen == 0) {
-		pr_devel("PKCS#7 message does not contain data\n");
-		ret = -EBADMSG;
-		goto error;
-	}
-
-	ret = mscode_parse(&ctx);
+	ret = verify_pkcs7_signature(NULL, 0,
+				     pebuf + ctx.sig_offset, ctx.sig_len,
+				     trusted_keys, -EKEYREJECTED, usage,
+				     mscode_parse, &ctx);
 	if (ret < 0)
 		goto error;
 
@@ -462,16 +448,8 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	 * contents.
 	 */
 	ret = pefile_digest_pe(pebuf, pelen, &ctx);
-	if (ret < 0)
-		goto error;
-
-	ret = pkcs7_verify(pkcs7, usage);
-	if (ret < 0)
-		goto error;
-
-	ret = pkcs7_validate_trust(pkcs7, trusted_keyring, _trusted);
 
 error:
-	pkcs7_free_message(ctx.pkcs7);
+	kfree(ctx.digest);
 	return ret;
 }
diff --git a/crypto/asymmetric_keys/verify_pefile.h b/crypto/asymmetric_keys/verify_pefile.h
index a133eb81a492..cd4d20930728 100644
--- a/crypto/asymmetric_keys/verify_pefile.h
+++ b/crypto/asymmetric_keys/verify_pefile.h
@@ -9,7 +9,6 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
-#include <linux/verify_pefile.h>
 #include <crypto/pkcs7.h>
 #include <crypto/hash_info.h>
 
@@ -23,7 +22,6 @@ struct pefile_context {
 	unsigned	sig_offset;
 	unsigned	sig_len;
 	const struct section_header *secs;
-	struct pkcs7_message *pkcs7;
 
 	/* PKCS#7 MS Individual Code Signing content */
 	const void	*digest;		/* Digest */
@@ -39,4 +37,5 @@ struct pefile_context {
 /*
  * mscode_parser.c
  */
-extern int mscode_parse(struct pefile_context *ctx);
+extern int mscode_parse(void *_ctx, const void *content_data, size_t data_len,
+			size_t asn1hdrlen);
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index 441aff9b5aa7..8323e3e57131 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -12,6 +12,7 @@
 #ifndef _CRYPTO_PKCS7_H
 #define _CRYPTO_PKCS7_H
 
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 
 struct key;
@@ -26,7 +27,7 @@ extern void pkcs7_free_message(struct pkcs7_message *pkcs7);
 
 extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
 				  const void **_data, size_t *_datalen,
-				  bool want_wrapper);
+				  size_t *_headerlen);
 
 /*
  * pkcs7_trust.c
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 2f5de5c1a3a0..b3928e801b8c 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -14,20 +14,6 @@
 #ifndef _LINUX_PUBLIC_KEY_H
 #define _LINUX_PUBLIC_KEY_H
 
-/*
- * The use to which an asymmetric key is being put.
- */
-enum key_being_used_for {
-	VERIFYING_MODULE_SIGNATURE,
-	VERIFYING_FIRMWARE_SIGNATURE,
-	VERIFYING_KEXEC_PE_SIGNATURE,
-	VERIFYING_KEY_SIGNATURE,
-	VERIFYING_KEY_SELF_SIGNATURE,
-	VERIFYING_UNSPECIFIED_SIGNATURE,
-	NR__KEY_BEING_USED_FOR
-};
-extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
-
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
  *
diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h
index 70a8775bb444..d1e23dda4363 100644
--- a/include/keys/asymmetric-type.h
+++ b/include/keys/asymmetric-type.h
@@ -15,6 +15,7 @@
 #define _KEYS_ASYMMETRIC_TYPE_H
 
 #include <linux/key-type.h>
+#include <linux/verification.h>
 
 extern struct key_type key_type_asymmetric;
 
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 39fd38cfa8c9..b2d645ac35a0 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -15,6 +15,7 @@
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
 #include <linux/key.h>
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 
 extern struct key *system_trusted_keyring;
@@ -29,12 +30,6 @@ static inline struct key *get_system_trusted_keyring(void)
 }
 #endif
 
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-extern int system_verify_data(const void *data, unsigned long len,
-			      const void *raw_pkcs7, size_t pkcs7_len,
-			      enum key_being_used_for usage);
-#endif
-
 #ifdef CONFIG_IMA_MOK_KEYRING
 extern struct key *ima_mok_keyring;
 extern struct key *ima_blacklist_keyring;
diff --git a/include/linux/verification.h b/include/linux/verification.h
new file mode 100644
index 000000000000..bb0fcf941cb7
--- /dev/null
+++ b/include/linux/verification.h
@@ -0,0 +1,50 @@
+/* Signature verification
+ *
+ * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_VERIFICATION_H
+#define _LINUX_VERIFICATION_H
+
+/*
+ * The use to which an asymmetric key is being put.
+ */
+enum key_being_used_for {
+	VERIFYING_MODULE_SIGNATURE,
+	VERIFYING_FIRMWARE_SIGNATURE,
+	VERIFYING_KEXEC_PE_SIGNATURE,
+	VERIFYING_KEY_SIGNATURE,
+	VERIFYING_KEY_SELF_SIGNATURE,
+	VERIFYING_UNSPECIFIED_SIGNATURE,
+	NR__KEY_BEING_USED_FOR
+};
+extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+
+struct key;
+
+extern int verify_pkcs7_signature(const void *data, size_t len,
+				  const void *raw_pkcs7, size_t pkcs7_len,
+				  struct key *trusted_keys,
+				  int untrusted_error,
+				  enum key_being_used_for usage,
+				  int (*view_content)(void *ctx,
+						      const void *data, size_t len,
+						      size_t asn1hdrlen),
+				  void *ctx);
+
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
+				   struct key *trusted_keys,
+				   enum key_being_used_for usage);
+#endif
+
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+#endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h
deleted file mode 100644
index da2049b5161c..000000000000
--- a/include/linux/verify_pefile.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Signed PE file verification
- *
- * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _LINUX_VERIFY_PEFILE_H
-#define _LINUX_VERIFY_PEFILE_H
-
-#include <crypto/public_key.h>
-
-extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
-				   struct key *trusted_keyring,
-				   enum key_being_used_for usage,
-				   bool *_trusted);
-
-#endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 64b9dead4a07..593aace88a02 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len,
-				  VERIFYING_MODULE_SIGNATURE);
+	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, NULL);
 }
-- 
cgit v1.2.3


From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: PKCS#7: Make trust determination dependent on contents of trust
 keyring

Make the determination of the trustworthiness of a key dependent on whether
a key that can verify it is present in the supplied ring of trusted keys
rather than whether or not the verifying key has KEY_FLAG_TRUSTED set.

verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust
chain cannot be verified.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 certs/system_keyring.c                  | 13 ++++---------
 crypto/asymmetric_keys/pkcs7_key_type.c |  2 +-
 crypto/asymmetric_keys/pkcs7_parser.h   |  1 -
 crypto/asymmetric_keys/pkcs7_trust.c    | 18 +++---------------
 crypto/asymmetric_keys/verify_pefile.c  |  2 +-
 crypto/asymmetric_keys/x509_parser.h    |  1 -
 include/crypto/pkcs7.h                  |  3 +--
 include/linux/verification.h            |  1 -
 kernel/module_signing.c                 |  2 +-
 9 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index a83bffedc0aa..dc18869ff680 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -121,7 +121,6 @@ late_initcall(load_system_certificate_list);
 int verify_pkcs7_signature(const void *data, size_t len,
 			   const void *raw_pkcs7, size_t pkcs7_len,
 			   struct key *trusted_keys,
-			   int untrusted_error,
 			   enum key_being_used_for usage,
 			   int (*view_content)(void *ctx,
 					       const void *data, size_t len,
@@ -129,7 +128,6 @@ int verify_pkcs7_signature(const void *data, size_t len,
 			   void *ctx)
 {
 	struct pkcs7_message *pkcs7;
-	bool trusted;
 	int ret;
 
 	pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
@@ -149,13 +147,10 @@ int verify_pkcs7_signature(const void *data, size_t len,
 
 	if (!trusted_keys)
 		trusted_keys = system_trusted_keyring;
-	ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted);
-	if (ret < 0)
-		goto error;
-
-	if (!trusted && untrusted_error) {
-		pr_err("PKCS#7 signature not signed with a trusted key\n");
-		ret = untrusted_error;
+	ret = pkcs7_validate_trust(pkcs7, trusted_keys);
+	if (ret < 0) {
+		if (ret == -ENOKEY)
+			pr_err("PKCS#7 signature not signed with a trusted key\n");
 		goto error;
 	}
 
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index ab9bf5363ecd..3b92523882e5 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -62,7 +62,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
 	return verify_pkcs7_signature(NULL, 0,
 				      prep->data, prep->datalen,
-				      NULL, -ENOKEY, usage,
+				      NULL, usage,
 				      pkcs7_view_content, prep);
 }
 
diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h
index d5eec31e95b6..f4e81074f5e0 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.h
+++ b/crypto/asymmetric_keys/pkcs7_parser.h
@@ -22,7 +22,6 @@ struct pkcs7_signed_info {
 	struct pkcs7_signed_info *next;
 	struct x509_certificate *signer; /* Signing certificate (in msg->certs) */
 	unsigned	index;
-	bool		trusted;
 	bool		unsupported_crypto;	/* T if not usable due to missing crypto */
 
 	/* Message digest - the digest of the Content Data (or NULL) */
diff --git a/crypto/asymmetric_keys/pkcs7_trust.c b/crypto/asymmetric_keys/pkcs7_trust.c
index b9a5487cd82d..36e77cb07bd0 100644
--- a/crypto/asymmetric_keys/pkcs7_trust.c
+++ b/crypto/asymmetric_keys/pkcs7_trust.c
@@ -30,7 +30,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 	struct public_key_signature *sig = sinfo->sig;
 	struct x509_certificate *x509, *last = NULL, *p;
 	struct key *key;
-	bool trusted;
 	int ret;
 
 	kenter(",%u,", sinfo->index);
@@ -42,10 +41,8 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 
 	for (x509 = sinfo->signer; x509; x509 = x509->signer) {
 		if (x509->seen) {
-			if (x509->verified) {
-				trusted = x509->trusted;
+			if (x509->verified)
 				goto verified;
-			}
 			kleave(" = -ENOKEY [cached]");
 			return -ENOKEY;
 		}
@@ -122,7 +119,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 
 matched:
 	ret = verify_signature(key, sig);
-	trusted = test_bit(KEY_FLAG_TRUSTED, &key->flags);
 	key_put(key);
 	if (ret < 0) {
 		if (ret == -ENOMEM)
@@ -134,12 +130,9 @@ matched:
 verified:
 	if (x509) {
 		x509->verified = true;
-		for (p = sinfo->signer; p != x509; p = p->signer) {
+		for (p = sinfo->signer; p != x509; p = p->signer)
 			p->verified = true;
-			p->trusted = trusted;
-		}
 	}
-	sinfo->trusted = trusted;
 	kleave(" = 0");
 	return 0;
 }
@@ -148,7 +141,6 @@ verified:
  * pkcs7_validate_trust - Validate PKCS#7 trust chain
  * @pkcs7: The PKCS#7 certificate to validate
  * @trust_keyring: Signing certificates to use as starting points
- * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message intersects
  * keys we already know and trust.
@@ -170,16 +162,13 @@ verified:
  * May also return -ENOMEM.
  */
 int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
-			 struct key *trust_keyring,
-			 bool *_trusted)
+			 struct key *trust_keyring)
 {
 	struct pkcs7_signed_info *sinfo;
 	struct x509_certificate *p;
 	int cached_ret = -ENOKEY;
 	int ret;
 
-	*_trusted = false;
-
 	for (p = pkcs7->certs; p; p = p->next)
 		p->seen = false;
 
@@ -193,7 +182,6 @@ int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
 				cached_ret = -ENOPKG;
 			continue;
 		case 0:
-			*_trusted |= sinfo->trusted;
 			cached_ret = 0;
 			continue;
 		default:
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 265351075b0e..672a94c2c3ff 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -436,7 +436,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 
 	ret = verify_pkcs7_signature(NULL, 0,
 				     pebuf + ctx.sig_offset, ctx.sig_len,
-				     trusted_keys, -EKEYREJECTED, usage,
+				     trusted_keys, usage,
 				     mscode_parse, &ctx);
 	if (ret < 0)
 		goto error;
diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h
index f24f4d808e7f..05eef1c68881 100644
--- a/crypto/asymmetric_keys/x509_parser.h
+++ b/crypto/asymmetric_keys/x509_parser.h
@@ -39,7 +39,6 @@ struct x509_certificate {
 	unsigned	index;
 	bool		seen;			/* Infinite recursion prevention */
 	bool		verified;
-	bool		trusted;
 	bool		self_signed;		/* T if self-signed (check unsupported_sig too) */
 	bool		unsupported_key;	/* T if key uses unsupported crypto */
 	bool		unsupported_sig;	/* T if signature uses unsupported crypto */
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index 8323e3e57131..583f199400a3 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -33,8 +33,7 @@ extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
  * pkcs7_trust.c
  */
 extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
-				struct key *trust_keyring,
-				bool *_trusted);
+				struct key *trust_keyring);
 
 /*
  * pkcs7_verify.c
diff --git a/include/linux/verification.h b/include/linux/verification.h
index bb0fcf941cb7..a10549a6c7cd 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -33,7 +33,6 @@ struct key;
 extern int verify_pkcs7_signature(const void *data, size_t len,
 				  const void *raw_pkcs7, size_t pkcs7_len,
 				  struct key *trusted_keys,
-				  int untrusted_error,
 				  enum key_being_used_for usage,
 				  int (*view_content)(void *ctx,
 						      const void *data, size_t len,
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 593aace88a02..6a64e03b9f44 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-- 
cgit v1.2.3


From 31e6850e0fdb3a586363cc4d2f9801cdf9374310 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 5 Apr 2016 12:39:30 +0100
Subject: iommu: Add MMIO mapping type

On some platforms, MMIO regions might need slightly different treatment
compared to mapping regular memory; add the notion of MMIO mappings to
the IOMMU API's memory type flags, so that callers can let the IOMMU
drivers know to do the right thing.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a5c539fa5d2b..34b643227df1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -30,6 +30,7 @@
 #define IOMMU_WRITE	(1 << 1)
 #define IOMMU_CACHE	(1 << 2) /* DMA cache coherency */
 #define IOMMU_NOEXEC	(1 << 3)
+#define IOMMU_MMIO	(1 << 4) /* e.g. things like MSI doorbells */
 
 struct iommu_ops;
 struct iommu_group;
-- 
cgit v1.2.3


From 848fc67da5fb43ef7d92d20891eef79f5de45816 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 4 Mar 2016 15:32:40 +0100
Subject: clk: renesas: mstp: Drop check for CONFIG_PM_GENERIC_DOMAINS_OF

As of commit 71d076ceb245f0d9 ("ARM: shmobile: Enable PM and
PM_GENERIC_DOMAINS for SoCs with PM Domains"),
CONFIG_PM_GENERIC_DOMAINS_OF is always enabled for SoCs with MSTP
clocks.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
---
 drivers/clk/renesas/clk-mstp.c | 3 ---
 include/linux/clk/renesas.h    | 4 ----
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c
index 3d44e183aedd..5e3b3d856eef 100644
--- a/drivers/clk/renesas/clk-mstp.c
+++ b/drivers/clk/renesas/clk-mstp.c
@@ -243,8 +243,6 @@ static void __init cpg_mstp_clocks_init(struct device_node *np)
 }
 CLK_OF_DECLARE(cpg_mstp_clks, "renesas,cpg-mstp-clocks", cpg_mstp_clocks_init);
 
-
-#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
 int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev)
 {
 	struct device_node *np = dev->of_node;
@@ -326,4 +324,3 @@ void __init cpg_mstp_add_clk_domain(struct device_node *np)
 
 	of_genpd_add_provider_simple(np, pd);
 }
-#endif /* !CONFIG_PM_GENERIC_DOMAINS_OF */
diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h
index 7adfd80fbf55..2a3379cf1330 100644
--- a/include/linux/clk/renesas.h
+++ b/include/linux/clk/renesas.h
@@ -24,12 +24,8 @@ void r8a7778_clocks_init(u32 mode);
 void r8a7779_clocks_init(u32 mode);
 void rcar_gen2_clocks_init(u32 mode);
 
-#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
 void cpg_mstp_add_clk_domain(struct device_node *np);
 int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev);
 void cpg_mstp_detach_dev(struct generic_pm_domain *domain, struct device *dev);
-#else
-static inline void cpg_mstp_add_clk_domain(struct device_node *np) {}
-#endif
 
 #endif
-- 
cgit v1.2.3


From 12a56817b329d8a73ab53bad09aa976aeea46db9 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 4 Mar 2016 16:59:26 +0100
Subject: clk: renesas: mstp: Clarify cpg_mstp_{at,de}tach_dev() domain
 parameter

Make it clear that the "domain" parameter of the cpg_mstp_attach_dev()
and cpg_mstp_detach_dev() functions is not used.

The cpg_mstp_attach_dev() and cpg_mstp_detach_dev() callbacks are not
only used by the CPG/MSTP Clock Domain driver, but also by the R-Mobile
SYSC PM Domain driver.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/clk/renesas/clk-mstp.c | 4 ++--
 include/linux/clk/renesas.h    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c
index 5e3b3d856eef..8b597b9a3804 100644
--- a/drivers/clk/renesas/clk-mstp.c
+++ b/drivers/clk/renesas/clk-mstp.c
@@ -243,7 +243,7 @@ static void __init cpg_mstp_clocks_init(struct device_node *np)
 }
 CLK_OF_DECLARE(cpg_mstp_clks, "renesas,cpg-mstp-clocks", cpg_mstp_clocks_init);
 
-int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev)
+int cpg_mstp_attach_dev(struct generic_pm_domain *unused, struct device *dev)
 {
 	struct device_node *np = dev->of_node;
 	struct of_phandle_args clkspec;
@@ -295,7 +295,7 @@ fail_put:
 	return error;
 }
 
-void cpg_mstp_detach_dev(struct generic_pm_domain *domain, struct device *dev)
+void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev)
 {
 	if (!list_empty(&dev->power.subsys_data->clock_list))
 		pm_clk_destroy(dev);
diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h
index 2a3379cf1330..095b1681daf4 100644
--- a/include/linux/clk/renesas.h
+++ b/include/linux/clk/renesas.h
@@ -25,7 +25,7 @@ void r8a7779_clocks_init(u32 mode);
 void rcar_gen2_clocks_init(u32 mode);
 
 void cpg_mstp_add_clk_domain(struct device_node *np);
-int cpg_mstp_attach_dev(struct generic_pm_domain *domain, struct device *dev);
-void cpg_mstp_detach_dev(struct generic_pm_domain *domain, struct device *dev);
+int cpg_mstp_attach_dev(struct generic_pm_domain *unused, struct device *dev);
+void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev);
 
 #endif
-- 
cgit v1.2.3


From 8ced425ee630c03beea06c1dfa35190bf8395d07 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 5 Apr 2016 17:10:16 +0200
Subject: tun: use socket locks for sk_{attach,detatch}_filter

This reverts commit 5a5abb1fa3b05dd ("tun, bpf: fix suspicious RCU usage
in tun_{attach, detach}_filter") and replaces it to use lock_sock around
sk_{attach,detach}_filter. The checks inside filter.c are updated with
lockdep_sock_is_held to check for proper socket locks.

It keeps the code cleaner by ensuring that only one lock governs the
socket filter instead of two independent locks.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 14 +++++++++-----
 include/linux/filter.h |  4 ----
 net/core/filter.c      | 35 +++++++++++++----------------------
 3 files changed, 22 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9abc36bf77ea..64bc143eddd9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -622,8 +622,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	/* Re-attach the filter to persist device */
 	if (!skip_filter && (tun->filter_attached == true)) {
-		err = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (!err)
 			goto out;
 	}
@@ -1824,7 +1825,9 @@ static void tun_detach_filter(struct tun_struct *tun, int n)
 
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		__sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		sk_detach_filter(tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 	}
 
 	tun->filter_attached = false;
@@ -1837,8 +1840,9 @@ static int tun_attach_filter(struct tun_struct *tun)
 
 	for (i = 0; i < tun->numqueues; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (ret) {
 			tun_detach_filter(tun, i);
 			return ret;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a51a5361695f..43aa1f8855c7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -465,14 +465,10 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
-int __sk_detach_filter(struct sock *sk, bool locked);
-
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ca7f832b2980..e8486ba601ea 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1149,8 +1149,7 @@ void bpf_prog_destroy(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
 
-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
-			    bool locked)
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 {
 	struct sk_filter *fp, *old_fp;
 
@@ -1166,8 +1165,10 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
 		return -ENOMEM;
 	}
 
-	old_fp = rcu_dereference_protected(sk->sk_filter, locked);
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	rcu_assign_pointer(sk->sk_filter, fp);
+
 	if (old_fp)
 		sk_filter_uncharge(sk, old_fp);
 
@@ -1246,8 +1247,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
  * occurs or there is insufficient memory for the filter a negative
  * errno code is returned. On success the return is zero.
  */
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked)
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
 	struct bpf_prog *prog = __get_filter(fprog, sk);
 	int err;
@@ -1255,7 +1255,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, locked);
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		__bpf_prog_release(prog);
 		return err;
@@ -1263,12 +1263,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__sk_attach_filter);
-
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
-{
-	return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
 
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
@@ -1314,7 +1309,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		bpf_prog_put(prog);
 		return err;
@@ -2255,7 +2250,7 @@ static int __init register_sk_filter_ops(void)
 }
 late_initcall(register_sk_filter_ops);
 
-int __sk_detach_filter(struct sock *sk, bool locked)
+int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
 	struct sk_filter *filter;
@@ -2263,7 +2258,8 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return -EPERM;
 
-	filter = rcu_dereference_protected(sk->sk_filter, locked);
+	filter = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	if (filter) {
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 		sk_filter_uncharge(sk, filter);
@@ -2272,12 +2268,7 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__sk_detach_filter);
-
-int sk_detach_filter(struct sock *sk)
-{
-	return __sk_detach_filter(sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
 
 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 		  unsigned int len)
@@ -2288,7 +2279,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 
 	lock_sock(sk);
 	filter = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	if (!filter)
 		goto out;
 
-- 
cgit v1.2.3


From a6024562ffd7e0f31bc6671817840ad1e91de7b4 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 5 Apr 2016 08:22:51 -0700
Subject: udp: Add GRO functions to UDP socket

This patch adds GRO functions (gro_receive and gro_complete) to UDP
sockets. udp_gro_receive is changed to perform socket lookup on a
packet. If a socket is found the related GRO functions are called.

This features obsoletes using UDP offload infrastructure for GRO
(udp_offload). This has the advantage of not being limited to provide
offload on a per port basis, GRO is now applied to whatever individual
UDP sockets are bound to.  This also allows the possbility of
"application defined GRO"-- that is we can attach something like
a BPF program to a UDP socket to perfrom GRO on an application
layer protocol.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h    |  8 ++++++++
 include/net/udp.h      |  7 +++++--
 net/ipv4/udp_offload.c | 52 +++++++++++++++++++-------------------------------
 net/ipv6/Makefile      |  5 +++--
 net/ipv6/af_inet6.c    |  8 ++++++++
 net/ipv6/ip6_offload.c |  2 --
 net/ipv6/ip6_offload.h |  3 ++-
 net/ipv6/udp_offload.c | 11 ++++++++---
 8 files changed, 54 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 32342754643a..d1fd8cd39478 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,14 @@ struct udp_sock {
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
+
+	/* GRO functions for UDP socket */
+	struct sk_buff **	(*gro_receive)(struct sock *sk,
+					       struct sk_buff **head,
+					       struct sk_buff *skb);
+	int			(*gro_complete)(struct sock *sk,
+						struct sk_buff *skb,
+						int nhoff);
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/include/net/udp.h b/include/net/udp.h
index 3aa0b3ec1fb0..3c5a65e0946d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -167,9 +167,12 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
 	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
 }
 
+typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
+				     __be16 dport);
+
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh);
-int udp_gro_complete(struct sk_buff *skb, int nhoff);
+				 struct udphdr *uh, udp_lookup_t lookup);
+int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0ed2dafb7cc4..65c3fd34b363 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -179,6 +179,7 @@ out_unlock:
 
 	return segs;
 }
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
@@ -304,13 +305,13 @@ unlock:
 EXPORT_SYMBOL(udp_del_offload);
 
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh)
+				 struct udphdr *uh, udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	struct sk_buff *p, **pp = NULL;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
+	struct sock *sk;
 
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +323,11 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
 	rcu_read_lock();
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_receive)
-			goto unflush;
-	}
+	sk = (*lookup)(skb, uh->source, uh->dest);
+
+	if (sk && udp_sk(sk)->gro_receive)
+		goto unflush;
+
 	goto out_unlock;
 
 unflush:
@@ -352,9 +351,7 @@ unflush:
 
 	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
 	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
-	NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-	pp = uo_priv->offload->callbacks.gro_receive(head, skb,
-						     uo_priv->offload);
+	pp = udp_sk(sk)->gro_receive(sk, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
@@ -362,6 +359,7 @@ out:
 	NAPI_GRO_CB(skb)->flush |= flush;
 	return pp;
 }
+EXPORT_SYMBOL(udp_gro_receive);
 
 static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
@@ -383,39 +381,28 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
 	return NULL;
 }
 
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+		     udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	__be16 newlen = htons(skb->len - nhoff);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 	int err = -ENOSYS;
+	struct sock *sk;
 
 	uh->len = newlen;
 
 	rcu_read_lock();
-
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_complete)
-			break;
-	}
-
-	if (uo_priv) {
-		NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-		err = uo_priv->offload->callbacks.gro_complete(skb,
-				nhoff + sizeof(struct udphdr),
-				uo_priv->offload);
-	}
-
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (sk && udp_sk(sk)->gro_complete)
+		err = udp_sk(sk)->gro_complete(sk, skb,
+				nhoff + sizeof(struct udphdr));
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -426,6 +413,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 
 	return err;
 }
+EXPORT_SYMBOL(udp_gro_complete);
 
 static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 {
@@ -440,7 +428,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
 static const struct net_offload udpv4_offload = {
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fbd90bf8d33..5e9d6bf4aaca 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,9 +8,10 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		addrlabel.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
-		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
+		udp_offload.o
 
-ipv6-offload :=	ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
 ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
 ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b78aad0d52f..bfa86f040c16 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,8 @@
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
 
+#include "ip6_offload.h"
+
 MODULE_AUTHOR("Cast of dozens");
 MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
 MODULE_LICENSE("GPL");
@@ -959,6 +961,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto udplitev6_fail;
 
+	err = udpv6_offload_init();
+	if (err)
+		goto udpv6_offload_fail;
+
 	err = tcpv6_init();
 	if (err)
 		goto tcpv6_fail;
@@ -988,6 +994,8 @@ pingv6_fail:
 ipv6_packet_fail:
 	tcpv6_exit();
 tcpv6_fail:
+	udpv6_offload_exit();
+udpv6_offload_fail:
 	udplitev6_exit();
 udplitev6_fail:
 	udpv6_exit();
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 82e9f3076028..204af2219471 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -325,8 +325,6 @@ static int __init ipv6_offload_init(void)
 
 	if (tcpv6_offload_init() < 0)
 		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
-	if (udp_offload_init() < 0)
-		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
 	if (ipv6_exthdrs_offload_init() < 0)
 		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
 
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
index 2e155c651b35..96b40e41ac53 100644
--- a/net/ipv6/ip6_offload.h
+++ b/net/ipv6/ip6_offload.h
@@ -12,7 +12,8 @@
 #define __ip6_offload_h
 
 int ipv6_exthdrs_offload_init(void);
-int udp_offload_init(void);
+int udpv6_offload_init(void);
+int udpv6_offload_exit(void);
 int tcpv6_offload_init(void);
 
 #endif
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2b0fbe6929e8..5429f6bcf047 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -153,7 +153,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 1;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
@@ -173,7 +173,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
 static const struct net_offload udpv6_offload = {
@@ -184,7 +184,12 @@ static const struct net_offload udpv6_offload = {
 	},
 };
 
-int __init udp_offload_init(void)
+int udpv6_offload_init(void)
 {
 	return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
 }
+
+int udpv6_offload_exit(void)
+{
+	return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+}
-- 
cgit v1.2.3


From 46aa2f30aa7fe03a4dcd732b009284c02ff4f093 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 5 Apr 2016 08:22:56 -0700
Subject: udp: Remove udp_offloads

Now that the UDP encapsulation GRO functions have been moved to the UDP
socket we not longer need the udp_offload insfrastructure so removing it.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 -------------
 include/net/protocol.h    |  3 ---
 net/ipv4/udp_offload.c    | 63 -----------------------------------------------
 3 files changed, 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb0d5d09c2e4..cb4e508b3f38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2159,23 +2159,6 @@ struct packet_offload {
 	struct list_head	 list;
 };
 
-struct udp_offload;
-
-struct udp_offload_callbacks {
-	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
-						 struct sk_buff *skb,
-						 struct udp_offload *uoff);
-	int			(*gro_complete)(struct sk_buff *skb,
-						int nhoff,
-						struct udp_offload *uoff);
-};
-
-struct udp_offload {
-	__be16			 port;
-	u8			 ipproto;
-	struct udp_offload_callbacks callbacks;
-};
-
 /* often modified stats are per-CPU, other are shared (netdev->stats) */
 struct pcpu_sw_netstats {
 	u64     rx_packets;
diff --git a/include/net/protocol.h b/include/net/protocol.h
index da689f5432de..bf36ca34af7a 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -107,9 +107,6 @@ int inet_del_offload(const struct net_offload *prot, unsigned char num);
 void inet_register_protosw(struct inet_protosw *p);
 void inet_unregister_protosw(struct inet_protosw *p);
 
-int  udp_add_offload(struct net *net, struct udp_offload *prot);
-void udp_del_offload(struct udp_offload *prot);
-
 #if IS_ENABLED(CONFIG_IPV6)
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char num);
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char num);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 65c3fd34b363..6230cf4b0d2d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,18 +14,6 @@
 #include <net/udp.h>
 #include <net/protocol.h>
 
-static DEFINE_SPINLOCK(udp_offload_lock);
-static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
-
-#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
-
-struct udp_offload_priv {
-	struct udp_offload	*offload;
-	possible_net_t	net;
-	struct rcu_head		rcu;
-	struct udp_offload_priv __rcu *next;
-};
-
 static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	netdev_features_t features,
 	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -254,56 +242,6 @@ out:
 	return segs;
 }
 
-int udp_add_offload(struct net *net, struct udp_offload *uo)
-{
-	struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
-
-	if (!new_offload)
-		return -ENOMEM;
-
-	write_pnet(&new_offload->net, net);
-	new_offload->offload = uo;
-
-	spin_lock(&udp_offload_lock);
-	new_offload->next = udp_offload_base;
-	rcu_assign_pointer(udp_offload_base, new_offload);
-	spin_unlock(&udp_offload_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(udp_add_offload);
-
-static void udp_offload_free_routine(struct rcu_head *head)
-{
-	struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
-	kfree(ou_priv);
-}
-
-void udp_del_offload(struct udp_offload *uo)
-{
-	struct udp_offload_priv __rcu **head = &udp_offload_base;
-	struct udp_offload_priv *uo_priv;
-
-	spin_lock(&udp_offload_lock);
-
-	uo_priv = udp_deref_protected(*head);
-	for (; uo_priv != NULL;
-	     uo_priv = udp_deref_protected(*head)) {
-		if (uo_priv->offload == uo) {
-			rcu_assign_pointer(*head,
-					   udp_deref_protected(uo_priv->next));
-			goto unlock;
-		}
-		head = &uo_priv->next;
-	}
-	pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
-unlock:
-	spin_unlock(&udp_offload_lock);
-	if (uo_priv)
-		call_rcu(&uo_priv->rcu, udp_offload_free_routine);
-}
-EXPORT_SYMBOL(udp_del_offload);
-
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -327,7 +265,6 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 
 	if (sk && udp_sk(sk)->gro_receive)
 		goto unflush;
-
 	goto out_unlock;
 
 unflush:
-- 
cgit v1.2.3


From ec5e099d6e941668d121ea9ca7057f4fa00830b0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:22 -0700
Subject: perf: optimize perf_fetch_caller_regs

avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu init logic and
subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
so we can safely drop memset from all of the above cases and move it into
perf_ftrace_function_call that calls it with stack allocated pt_regs.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      | 2 --
 kernel/trace/trace_event_perf.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..e89f7199c223 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..7a68afca8249 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -316,6 +316,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
 	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
-- 
cgit v1.2.3


From 1e1dcd93b468901e114f279c94a0b356adc5e7cd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:24 -0700
Subject: perf: split perf_trace_buf_prepare into alloc and update parts

split allows to move expensive update of 'struct trace_entry' to later phase.
Repurpose unused 1st argument of perf_tp_event() to indicate event type.

While splitting use temp variable 'rctx' instead of '*rctx' to avoid
unnecessary loads done by the compiler due to -fno-strict-aliasing

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  8 ++++----
 include/trace/perf.h            |  8 ++++----
 kernel/events/core.c            |  6 ++++--
 kernel/trace/trace_event_perf.c | 39 ++++++++++++++++++++-------------------
 kernel/trace/trace_kprobe.c     | 10 ++++++----
 kernel/trace/trace_syscalls.c   | 13 +++++++------
 kernel/trace/trace_uprobe.c     |  5 +++--
 8 files changed, 49 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e89f7199c223..eb41b535ef38 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1016,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 extern void perf_event_init(void);
-extern void perf_tp_event(u64 addr, u64 count, void *record,
+extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
 			  struct task_struct *task);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..56f795e6a093 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -605,15 +605,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
+void perf_trace_buf_update(void *record, u16 type);
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
 		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 6f7e37869065..77cd9043b7e4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -53,8 +53,7 @@ perf_trace_##call(void *__data, proto)					\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = perf_trace_buf_prepare(__entry_size,			\
-			event_call->event.type, &__regs, &rctx);	\
+	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);	\
 	if (!entry)							\
 		return;							\
 									\
@@ -64,8 +63,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, 0,		\
-		__count, __regs, head, __task);				\
+	perf_trace_buf_submit(entry, __entry_size, rctx,		\
+			      event_call->event.type, __count, __regs,	\
+			      head, __task);				\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..d8512883c0a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6987,7 +6987,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
 {
@@ -6999,9 +6999,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr, 0);
+	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	perf_trace_buf_update(record, event_type);
+
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7a68afca8249..5a927075977f 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
 {
-	struct trace_entry *entry;
-	unsigned long flags;
 	char *raw_data;
-	int pc;
+	int rctx;
 
 	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-			"perf buffer not large enough"))
+		      "perf buffer not large enough"))
 		return NULL;
 
-	pc = preempt_count();
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
+	*rctxp = rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		return NULL;
 
 	if (regs)
-		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
-	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+		*regs = this_cpu_ptr(&__perf_regs[rctx]);
+	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+	return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+	struct trace_entry *entry = record;
+	int pc = preempt_count();
+	unsigned long flags;
 
-	entry = (struct trace_entry *)raw_data;
 	local_save_flags(flags);
 	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
-
-	return raw_data;
 }
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
@@ -319,13 +320,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
-	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
 			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 919e0ddd8fcc..5546eec0505f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e78f364cc192..b2b6efc083a4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-				sys_data->exit_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7915142c89e4..c53485441c88 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		goto out;
 
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 		memset(data + len, 0, size - esize - len);
 	}
 
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:27 -0700
Subject: bpf: support bpf_get_stackid() and bpf_perf_event_output() in
 tracepoint programs

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..198f6ace70ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -160,6 +160,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e5ebe3254d2..413ec5614180 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -340,12 +340,52 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func		= bpf_perf_event_output_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func		= bpf_get_stackid_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return NULL;
+		return &bpf_get_stackid_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:28 -0700
Subject: bpf: sanitize bpf tracepoint access

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 +
 include/linux/trace_events.h |  1 +
 kernel/bpf/verifier.c        |  6 +++++-
 kernel/events/core.c         |  8 ++++++++
 kernel/trace/trace_events.c  | 18 ++++++++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 198f6ace70ec..b2365a6eba3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 56f795e6a093..fe6441203b59 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ffe97d6166..9a01019ff7c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7133,6 +7133,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..ced963049e0a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
-- 
cgit v1.2.3


From 1d111406c6d91f4d7f6cc69a43e59546e8010aae Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sun, 20 Mar 2016 13:57:20 +0100
Subject: PCI: Add Intel Thunderbolt device IDs

Intel Gen 1 and 2 chips use the same ID for NHI, bridges and switch.  Gen 3
chips and onward use a distinct ID for the NHI.

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Andreas Noever <andreas.noever@gmail.com>
---
 drivers/pci/quirks.c         | 16 ++++++++++------
 drivers/thunderbolt/nhi.c    |  8 +++++---
 drivers/thunderbolt/switch.c |  9 +++++----
 include/linux/pci_ids.h      | 18 ++++++++++++++++++
 4 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 8e678027b900..b584ddf83555 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3232,7 +3232,8 @@ static void quirk_apple_poweroff_thunderbolt(struct pci_dev *dev)
 	acpi_execute_simple_method(SXIO, NULL, 0);
 	acpi_execute_simple_method(SXLV, NULL, 0);
 }
-DECLARE_PCI_FIXUP_SUSPEND_LATE(PCI_VENDOR_ID_INTEL, 0x1547,
+DECLARE_PCI_FIXUP_SUSPEND_LATE(PCI_VENDOR_ID_INTEL,
+			       PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C,
 			       quirk_apple_poweroff_thunderbolt);
 
 /*
@@ -3266,9 +3267,10 @@ static void quirk_apple_wait_for_thunderbolt(struct pci_dev *dev)
 	if (!nhi)
 		goto out;
 	if (nhi->vendor != PCI_VENDOR_ID_INTEL
-			|| (nhi->device != 0x1547 && nhi->device != 0x156c)
-			|| nhi->subsystem_vendor != 0x2222
-			|| nhi->subsystem_device != 0x1111)
+		    || (nhi->device != PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C &&
+			nhi->device != PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI)
+		    || nhi->subsystem_vendor != 0x2222
+		    || nhi->subsystem_device != 0x1111)
 		goto out;
 	dev_info(&dev->dev, "quirk: waiting for thunderbolt to reestablish PCI tunnels...\n");
 	device_pm_wait_for_dev(&dev->dev, &nhi->dev);
@@ -3276,9 +3278,11 @@ out:
 	pci_dev_put(nhi);
 	pci_dev_put(sibling);
 }
-DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, 0x1547,
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL,
+			       PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C,
 			       quirk_apple_wait_for_thunderbolt);
-DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, 0x156d,
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL,
+			       PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_BRIDGE,
 			       quirk_apple_wait_for_thunderbolt);
 #endif
 
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 20a41f7de76f..36be23babb89 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -633,16 +633,18 @@ static const struct dev_pm_ops nhi_pm_ops = {
 static struct pci_device_id nhi_ids[] = {
 	/*
 	 * We have to specify class, the TB bridges use the same device and
-	 * vendor (sub)id.
+	 * vendor (sub)id on gen 1 and gen 2 controllers.
 	 */
 	{
 		.class = PCI_CLASS_SYSTEM_OTHER << 8, .class_mask = ~0,
-		.vendor = PCI_VENDOR_ID_INTEL, .device = 0x1547,
+		.vendor = PCI_VENDOR_ID_INTEL,
+		.device = PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C,
 		.subvendor = 0x2222, .subdevice = 0x1111,
 	},
 	{
 		.class = PCI_CLASS_SYSTEM_OTHER << 8, .class_mask = ~0,
-		.vendor = PCI_VENDOR_ID_INTEL, .device = 0x156c,
+		.vendor = PCI_VENDOR_ID_INTEL,
+		.device = PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI,
 		.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID,
 	},
 	{ 0,}
diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index aeb982969629..db73ffed68a9 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -293,9 +293,9 @@ static int tb_plug_events_active(struct tb_switch *sw, bool active)
 	if (active) {
 		data = data & 0xFFFFFF83;
 		switch (sw->config.device_id) {
-		case 0x1513:
-		case 0x151a:
-		case 0x1549:
+		case PCI_DEVICE_ID_INTEL_LIGHT_RIDGE:
+		case PCI_DEVICE_ID_INTEL_EAGLE_RIDGE:
+		case PCI_DEVICE_ID_INTEL_PORT_RIDGE:
 			break;
 		default:
 			data |= 4;
@@ -370,7 +370,8 @@ struct tb_switch *tb_switch_alloc(struct tb *tb, u64 route)
 		tb_sw_warn(sw, "unknown switch vendor id %#x\n",
 			   sw->config.vendor_id);
 
-	if (sw->config.device_id != 0x1547 && sw->config.device_id != 0x1549)
+	if (sw->config.device_id != PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C &&
+	    sw->config.device_id != PCI_DEVICE_ID_INTEL_PORT_RIDGE)
 		tb_sw_warn(sw, "unsupported switch device id %#x\n",
 			   sw->config.device_id);
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 247da8c95860..c58752fe16c4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2604,6 +2604,24 @@
 #define PCI_DEVICE_ID_INTEL_82441	0x1237
 #define PCI_DEVICE_ID_INTEL_82380FB	0x124b
 #define PCI_DEVICE_ID_INTEL_82439	0x1250
+#define PCI_DEVICE_ID_INTEL_LIGHT_RIDGE             0x1513 /* Tbt 1 Gen 1 */
+#define PCI_DEVICE_ID_INTEL_EAGLE_RIDGE             0x151a
+#define PCI_DEVICE_ID_INTEL_LIGHT_PEAK              0x151b
+#define PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_4C         0x1547 /* Tbt 1 Gen 2 */
+#define PCI_DEVICE_ID_INTEL_CACTUS_RIDGE_2C         0x1548
+#define PCI_DEVICE_ID_INTEL_PORT_RIDGE              0x1549
+#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_2C_NHI    0x1566 /* Tbt 1 Gen 3 */
+#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_2C_BRIDGE 0x1567
+#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_4C_NHI    0x1568
+#define PCI_DEVICE_ID_INTEL_REDWOOD_RIDGE_4C_BRIDGE 0x1569
+#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_2C_NHI     0x156a /* Thunderbolt 2 */
+#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_2C_BRIDGE  0x156b
+#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_NHI     0x156c
+#define PCI_DEVICE_ID_INTEL_FALCON_RIDGE_4C_BRIDGE  0x156d
+#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_2C_NHI     0x1575 /* Thunderbolt 3 */
+#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_2C_BRIDGE  0x1576
+#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_NHI     0x1577
+#define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_BRIDGE  0x1578
 #define PCI_DEVICE_ID_INTEL_80960_RP	0x1960
 #define PCI_DEVICE_ID_INTEL_82840_HB	0x1a21
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
-- 
cgit v1.2.3


From 32b9b10961860860268961d9aad0c56a73018c37 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 11 Feb 2016 13:19:09 -0800
Subject: clk: Allow clocks to be marked as CRITICAL

Critical clocks are those which must not be gated, else undefined
or catastrophic failure would occur.  Here we have chosen to
ensure the prepare/enable counts are correctly incremented, so as
not to confuse users with enabled clocks with no visible users.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Link: lkml.kernel.org/r/1455225554-13267-2-git-send-email-mturquette@baylibre.com
---
 drivers/clk/clk.c            | 5 +++++
 include/linux/clk-provider.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index fb74dc1f7520..275201fd7b01 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -2397,6 +2397,11 @@ static int __clk_core_init(struct clk_core *core)
 	if (core->ops->init)
 		core->ops->init(core->hw);
 
+	if (core->flags & CLK_IS_CRITICAL) {
+		clk_core_prepare(core);
+		clk_core_enable(core);
+	}
+
 	kref_init(&core->ref);
 out:
 	clk_prepare_unlock();
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index da95258127aa..0638b4154502 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -32,6 +32,7 @@
 #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */
 #define CLK_RECALC_NEW_RATES	BIT(9) /* recalc rates after notifications */
 #define CLK_SET_RATE_UNGATE	BIT(10) /* clock needs to run to set rate */
+#define CLK_IS_CRITICAL		BIT(11) /* do not gate, ever */
 
 struct clk;
 struct clk_hw;
-- 
cgit v1.2.3


From d56f8994b6fb928f59481fabc25bcd1c2f9bd06d Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 11 Feb 2016 13:19:11 -0800
Subject: clk: Provide OF helper to mark clocks as CRITICAL

This call matches clocks which have been marked as critical in DT
and sets the appropriate flag.  These flags can then be used to
mark the clock core flags appropriately prior to registration.

Legacy bindings requiring this feature must add the clock-critical
property to their binding descriptions, as it is not a part of
common-clock binding.

Cc: devicetree@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Link: lkml.kernel.org/r/1455225554-13267-4-git-send-email-mturquette@baylibre.com
---
 drivers/clk/clk.c            | 35 +++++++++++++++++++++++++++++++++++
 include/linux/clk-provider.h |  8 +++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index dede0ce679e4..9f77cc67cdc3 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -3137,6 +3137,41 @@ static int parent_ready(struct device_node *np)
 	}
 }
 
+/**
+ * of_clk_detect_critical() - set CLK_IS_CRITICAL flag from Device Tree
+ * @np: Device node pointer associated with clock provider
+ * @index: clock index
+ * @flags: pointer to clk_core->flags
+ *
+ * Detects if the clock-critical property exists and, if so, sets the
+ * corresponding CLK_IS_CRITICAL flag.
+ *
+ * Do not use this function. It exists only for legacy Device Tree
+ * bindings, such as the one-clock-per-node style that are outdated.
+ * Those bindings typically put all clock data into .dts and the Linux
+ * driver has no clock data, thus making it impossible to set this flag
+ * correctly from the driver. Only those drivers may call
+ * of_clk_detect_critical from their setup functions.
+ *
+ * Return: error code or zero on success
+ */
+int of_clk_detect_critical(struct device_node *np,
+					  int index, unsigned long *flags)
+{
+	struct property *prop;
+	const __be32 *cur;
+	uint32_t idx;
+
+	if (!np || !flags)
+		return -EINVAL;
+
+	of_property_for_each_u32(np, "clock-critical", prop, cur, idx)
+		if (index == idx)
+			*flags |= CLK_IS_CRITICAL;
+
+	return 0;
+}
+
 /**
  * of_clk_init() - Scan and init clock providers from the DT
  * @matches: array of compatible values and init functions for providers.
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 0638b4154502..156286445a25 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -721,7 +721,8 @@ unsigned int of_clk_get_parent_count(struct device_node *np);
 int of_clk_parent_fill(struct device_node *np, const char **parents,
 		       unsigned int size);
 const char *of_clk_get_parent_name(struct device_node *np, int index);
-
+int of_clk_detect_critical(struct device_node *np, int index,
+			    unsigned long *flags);
 void of_clk_init(const struct of_device_id *matches);
 
 #else /* !CONFIG_OF */
@@ -758,6 +759,11 @@ static inline const char *of_clk_get_parent_name(struct device_node *np,
 {
 	return NULL;
 }
+static inline int of_clk_detect_critical(struct device_node *np, int index,
+					  unsigned long *flags)
+{
+	return 0;
+}
 static inline void of_clk_init(const struct of_device_id *matches) {}
 #endif /* CONFIG_OF */
 
-- 
cgit v1.2.3


From b296821a7c42fa58baa17513b2b7b30ae66f3336 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Apr 2016 20:48:24 -0400
Subject: xattr_handler: pass dentry and inode as separate arguments of ->get()

... and do not assume they are already attached to each other

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/acl.c                  |  6 +++---
 fs/9p/xattr.c                |  4 ++--
 fs/btrfs/xattr.c             |  6 ++----
 fs/ext2/xattr_security.c     |  6 +++---
 fs/ext2/xattr_trusted.c      |  6 +++---
 fs/ext2/xattr_user.c         |  8 ++++----
 fs/ext4/xattr_security.c     |  6 +++---
 fs/ext4/xattr_trusted.c      |  6 +++---
 fs/ext4/xattr_user.c         |  8 ++++----
 fs/f2fs/xattr.c              | 14 ++++++--------
 fs/gfs2/xattr.c              |  6 +++---
 fs/hfsplus/xattr.c           | 10 +++++-----
 fs/hfsplus/xattr.h           |  2 +-
 fs/hfsplus/xattr_security.c  |  6 +++---
 fs/hfsplus/xattr_trusted.c   |  6 +++---
 fs/hfsplus/xattr_user.c      |  6 +++---
 fs/jffs2/security.c          |  6 +++---
 fs/jffs2/xattr_trusted.c     |  6 +++---
 fs/jffs2/xattr_user.c        |  6 +++---
 fs/nfs/nfs4proc.c            | 12 ++++++------
 fs/ocfs2/xattr.c             | 20 ++++++++++----------
 fs/orangefs/xattr.c          | 10 ++++++----
 fs/posix_acl.c               | 10 +++++-----
 fs/reiserfs/xattr_security.c |  9 ++++-----
 fs/reiserfs/xattr_trusted.c  |  9 ++++-----
 fs/reiserfs/xattr_user.c     |  9 ++++-----
 fs/squashfs/xattr.c          |  6 ++++--
 fs/xattr.c                   |  3 ++-
 fs/xfs/xfs_xattr.c           |  6 +++---
 include/linux/xattr.h        |  3 ++-
 mm/shmem.c                   |  6 +++---
 31 files changed, 113 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 2d94e94b6b59..eb3589edf485 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -213,8 +213,8 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 }
 
 static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
-			      struct dentry *dentry, const char *name,
-			      void *buffer, size_t size)
+			      struct dentry *dentry, struct inode *inode,
+			      const char *name, void *buffer, size_t size)
 {
 	struct v9fs_session_info *v9ses;
 	struct posix_acl *acl;
@@ -227,7 +227,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
 		return v9fs_xattr_get(dentry, handler->name, buffer, size);
 
-	acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
+	acl = v9fs_get_cached_acl(inode, handler->flags);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 9dd9b47a6c1a..18c62bae9591 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -138,8 +138,8 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 }
 
 static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  void *buffer, size_t size)
+				  struct dentry *dentry, struct inode *inode,
+				  const char *name, void *buffer, size_t size)
 {
 	const char *full_name = xattr_full_name(handler, name);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 145d2b89e62d..03224b00ea70 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -369,11 +369,9 @@ err:
 }
 
 static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   void *buffer, size_t size)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
 {
-	struct inode *inode = d_inode(dentry);
-
 	name = xattr_full_name(handler, name);
 	return __btrfs_getxattr(inode, name, buffer, size);
 }
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index ba97f243b050..7fd3b867ce65 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -9,10 +9,10 @@
 
 static int
 ext2_xattr_security_get(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
+			struct dentry *unused, struct inode *inode,
+			const char *name, void *buffer, size_t size)
 {
-	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+	return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name,
 			      buffer, size);
 }
 
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2c94d1930626..0f85705ff519 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -16,10 +16,10 @@ ext2_xattr_trusted_list(struct dentry *dentry)
 
 static int
 ext2_xattr_trusted_get(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name,
-		       void *buffer, size_t size)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, void *buffer, size_t size)
 {
-	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+	return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name,
 			      buffer, size);
 }
 
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 72a2a96d677f..1fafd27037cc 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -18,12 +18,12 @@ ext2_xattr_user_list(struct dentry *dentry)
 
 static int
 ext2_xattr_user_get(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    void *buffer, size_t size)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, void *buffer, size_t size)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+	return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER,
 			      name, buffer, size);
 }
 
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 3e81bdca071a..123a7d010efe 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -13,10 +13,10 @@
 
 static int
 ext4_xattr_security_get(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
+			struct dentry *unused, struct inode *inode,
+			const char *name, void *buffer, size_t size)
 {
-	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY,
 			      name, buffer, size);
 }
 
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 2a3c6f9b8cb8..60652fa24cbc 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -20,10 +20,10 @@ ext4_xattr_trusted_list(struct dentry *dentry)
 
 static int
 ext4_xattr_trusted_get(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name, void *buffer,
-		       size_t size)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, void *buffer, size_t size)
 {
-	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
 			      name, buffer, size);
 }
 
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d152f431e432..17a446ffecd3 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -19,12 +19,12 @@ ext4_xattr_user_list(struct dentry *dentry)
 
 static int
 ext4_xattr_user_get(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    void *buffer, size_t size)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, void *buffer, size_t size)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
 			      name, buffer, size);
 }
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 06a72dc0191a..17fd2b1a6848 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -26,10 +26,10 @@
 #include "xattr.h"
 
 static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, void *buffer,
-		size_t size)
+		struct dentry *unused, struct inode *inode,
+		const char *name, void *buffer, size_t size)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
@@ -45,7 +45,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
-	return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+	return f2fs_getxattr(inode, handler->flags, name,
 			     buffer, size, NULL);
 }
 
@@ -86,11 +86,9 @@ static bool f2fs_xattr_trusted_list(struct dentry *dentry)
 }
 
 static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, void *buffer,
-		size_t size)
+		struct dentry *unused, struct inode *inode,
+		const char *name, void *buffer, size_t size)
 {
-	struct inode *inode = d_inode(dentry);
-
 	if (buffer)
 		*((char *)buffer) = F2FS_I(inode)->i_advise;
 	return sizeof(char);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index e8dfb4740c04..619886ba6e78 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -584,10 +584,10 @@ out:
  * Returns: actual size of data on success, -errno on error
  */
 static int gfs2_xattr_get(const struct xattr_handler *handler,
-			  struct dentry *dentry, const char *name,
-			  void *buffer, size_t size)
+			  struct dentry *unused, struct inode *inode,
+			  const char *name, void *buffer, size_t size)
 {
-	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int type = handler->flags;
 	int error;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index ab01530b4930..45dc4ae3791a 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -579,7 +579,7 @@ failed_getxattr_init:
 	return res;
 }
 
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ssize_t hfsplus_getxattr(struct inode *inode, const char *name,
 			 void *value, size_t size,
 			 const char *prefix, size_t prefixlen)
 {
@@ -594,7 +594,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 	strcpy(xattr_name, prefix);
 	strcpy(xattr_name + prefixlen, name);
 
-	res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size);
+	res = __hfsplus_getxattr(inode, xattr_name, value, size);
 	kfree(xattr_name);
 	return res;
 
@@ -844,8 +844,8 @@ end_removexattr:
 }
 
 static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				void *buffer, size_t size)
+				struct dentry *unused, struct inode *inode,
+				const char *name, void *buffer, size_t size)
 {
 	/*
 	 * Don't allow retrieving properly prefixed attributes
@@ -860,7 +860,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 	 * creates), so we pass the name through unmodified (after
 	 * ensuring it doesn't conflict with another namespace).
 	 */
-	return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
+	return __hfsplus_getxattr(inode, name, buffer, size);
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index f9b0955b3d28..d04ba6f58df2 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -28,7 +28,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
 			   void *value, size_t size);
 
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ssize_t hfsplus_getxattr(struct inode *inode, const char *name,
 			 void *value, size_t size,
 			 const char *prefix, size_t prefixlen);
 
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 72a68a3a0c99..ae2ca8c2e335 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -14,10 +14,10 @@
 #include "acl.h"
 
 static int hfsplus_security_getxattr(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *name,
-				     void *buffer, size_t size)
+				     struct dentry *unused, struct inode *inode,
+				     const char *name, void *buffer, size_t size)
 {
-	return hfsplus_getxattr(dentry, name, buffer, size,
+	return hfsplus_getxattr(inode, name, buffer, size,
 				XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN);
 }
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 95a7704c7abb..eae2947060aa 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -12,10 +12,10 @@
 #include "xattr.h"
 
 static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    void *buffer, size_t size)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, void *buffer, size_t size)
 {
-	return hfsplus_getxattr(dentry, name, buffer, size,
+	return hfsplus_getxattr(inode, name, buffer, size,
 				XATTR_TRUSTED_PREFIX,
 				XATTR_TRUSTED_PREFIX_LEN);
 }
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 6fc269baf959..3c9eec3e4c7b 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -12,11 +12,11 @@
 #include "xattr.h"
 
 static int hfsplus_user_getxattr(const struct xattr_handler *handler,
-				 struct dentry *dentry, const char *name,
-				 void *buffer, size_t size)
+				 struct dentry *unused, struct inode *inode,
+				 const char *name, void *buffer, size_t size)
 {
 
-	return hfsplus_getxattr(dentry, name, buffer, size,
+	return hfsplus_getxattr(inode, name, buffer, size,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 7a28facd7175..3ed9a4b49778 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -49,10 +49,10 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
 
 /* ---- XATTR Handler for "security.*" ----------------- */
 static int jffs2_security_getxattr(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   void *buffer, size_t size)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
 {
-	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY,
 				 name, buffer, size);
 }
 
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index b2555ef07a12..4ebecff1d922 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -17,10 +17,10 @@
 #include "nodelist.h"
 
 static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  void *buffer, size_t size)
+				  struct dentry *unused, struct inode *inode,
+				  const char *name, void *buffer, size_t size)
 {
-	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED,
 				 name, buffer, size);
 }
 
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 539bd630b5e4..bce249e1b277 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -17,10 +17,10 @@
 #include "nodelist.h"
 
 static int jffs2_user_getxattr(const struct xattr_handler *handler,
-			       struct dentry *dentry, const char *name,
-			       void *buffer, size_t size)
+			       struct dentry *unused, struct inode *inode,
+			       const char *name, void *buffer, size_t size)
 {
-	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER,
 				 name, buffer, size);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 327b8c34d360..7a7ac1dafa02 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6263,10 +6263,10 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
 }
 
 static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *key,
-				   void *buf, size_t buflen)
+				   struct dentry *unused, struct inode *inode,
+				   const char *key, void *buf, size_t buflen)
 {
-	return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
+	return nfs4_proc_get_acl(inode, buf, buflen);
 }
 
 static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
@@ -6288,11 +6288,11 @@ static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
 }
 
 static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *key,
-				     void *buf, size_t buflen)
+				     struct dentry *unused, struct inode *inode,
+				     const char *key, void *buf, size_t buflen)
 {
 	if (security_ismaclabel(key))
-		return nfs4_get_security_label(d_inode(dentry), buf, buflen);
+		return nfs4_get_security_label(inode, buf, buflen);
 	return -EOPNOTSUPP;
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7d3d979f57d9..72eef4cfe8a9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7250,10 +7250,10 @@ leave:
  * 'security' attributes support
  */
 static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    void *buffer, size_t size)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, void *buffer, size_t size)
 {
-	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY,
 			       name, buffer, size);
 }
 
@@ -7321,10 +7321,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
  * 'trusted' attributes support
  */
 static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   void *buffer, size_t size)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
 {
-	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED,
 			       name, buffer, size);
 }
 
@@ -7346,14 +7346,14 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
  * 'user' attributes support
  */
 static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				void *buffer, size_t size)
+				struct dentry *unusde, struct inode *inode,
+				const char *name, void *buffer, size_t size)
 {
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
 		return -EOPNOTSUPP;
-	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
 			       buffer, size);
 }
 
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index ef5da7538cd5..6a4c0f7ce5c1 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -478,12 +478,13 @@ static int orangefs_xattr_set_default(const struct xattr_handler *handler,
 }
 
 static int orangefs_xattr_get_default(const struct xattr_handler *handler,
-				      struct dentry *dentry,
+				      struct dentry *unused,
+				      struct inode *inode,
 				      const char *name,
 				      void *buffer,
 				      size_t size)
 {
-	return orangefs_inode_getxattr(dentry->d_inode,
+	return orangefs_inode_getxattr(inode,
 				    ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
 				    name,
 				    buffer,
@@ -507,12 +508,13 @@ static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
 }
 
 static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
-				      struct dentry *dentry,
+				      struct dentry *unused,
+				      struct inode *inode,
 				      const char *name,
 				      void *buffer,
 				      size_t size)
 {
-	return orangefs_inode_getxattr(dentry->d_inode,
+	return orangefs_inode_getxattr(inode,
 				    ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
 				    name,
 				    buffer,
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index db1fb0f5d9ff..2c60f17e7d92 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -797,18 +797,18 @@ EXPORT_SYMBOL (posix_acl_to_xattr);
 
 static int
 posix_acl_xattr_get(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    void *value, size_t size)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, void *value, size_t size)
 {
 	struct posix_acl *acl;
 	int error;
 
-	if (!IS_POSIXACL(d_backing_inode(dentry)))
+	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
-	if (d_is_symlink(dentry))
+	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	acl = get_acl(d_backing_inode(dentry), handler->flags);
+	acl = get_acl(inode, handler->flags);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ac7e104ada6b..86aeb9dd805a 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,14 +9,13 @@
 #include <linux/uaccess.h>
 
 static int
-security_get(const struct xattr_handler *handler, struct dentry *dentry,
-	     const char *name, void *buffer, size_t size)
+security_get(const struct xattr_handler *handler, struct dentry *unused,
+	     struct inode *inode, const char *name, void *buffer, size_t size)
 {
-	if (IS_PRIVATE(d_inode(dentry)))
+	if (IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(d_inode(dentry),
-				  xattr_full_name(handler, name),
+	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
 				  buffer, size);
 }
 
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index cc248a581b60..31837f031f59 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,14 +8,13 @@
 #include <linux/uaccess.h>
 
 static int
-trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
-	    const char *name, void *buffer, size_t size)
+trusted_get(const struct xattr_handler *handler, struct dentry *unused,
+	    struct inode *inode, const char *name, void *buffer, size_t size)
 {
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(d_inode(dentry),
-				  xattr_full_name(handler, name),
+	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
 				  buffer, size);
 }
 
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index caad583086af..f7c39731684b 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,13 +7,12 @@
 #include <linux/uaccess.h>
 
 static int
-user_get(const struct xattr_handler *handler, struct dentry *dentry,
-	 const char *name, void *buffer, size_t size)
+user_get(const struct xattr_handler *handler, struct dentry *unused,
+	 struct inode *inode, const char *name, void *buffer, size_t size)
 {
-	if (!reiserfs_xattrs_user(dentry->d_sb))
+	if (!reiserfs_xattrs_user(inode->i_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_get(d_inode(dentry),
-				  xattr_full_name(handler, name),
+	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
 				  buffer, size);
 }
 
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 1e9de96288d8..1548b3784548 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -214,10 +214,12 @@ failed:
 
 
 static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
-				      struct dentry *d, const char *name,
+				      struct dentry *unused,
+				      struct inode *inode,
+				      const char *name,
 				      void *buffer, size_t size)
 {
-	return squashfs_xattr_get(d_inode(d), handler->flags, name,
+	return squashfs_xattr_get(inode, handler->flags, name,
 		buffer, size);
 }
 
diff --git a/fs/xattr.c b/fs/xattr.c
index 4861322e28e8..461ba45b7da9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -698,7 +698,8 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->get(handler, dentry, name, buffer, size);
+	return handler->get(handler, dentry, d_inode(dentry),
+			    name, buffer, size);
 }
 
 /*
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 110f1d7d86b0..d111f691f313 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,11 +32,11 @@
 
 
 static int
-xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
-		const char *name, void *value, size_t size)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
+		struct inode *inode, const char *name, void *value, size_t size)
 {
 	int xflags = handler->flags;
-	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+	struct xfs_inode *ip = XFS_I(inode);
 	int error, asize = size;
 
 	/* Convert Linux syscall to XFS internal ATTR flags */
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 4457541de3c9..c11c022298b9 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -30,7 +30,8 @@ struct xattr_handler {
 	int flags;      /* fs private flags */
 	bool (*list)(struct dentry *dentry);
 	int (*get)(const struct xattr_handler *, struct dentry *dentry,
-		   const char *name, void *buffer, size_t size);
+		   struct inode *inode, const char *name, void *buffer,
+		   size_t size);
 	int (*set)(const struct xattr_handler *, struct dentry *dentry,
 		   const char *name, const void *buffer, size_t size,
 		   int flags);
diff --git a/mm/shmem.c b/mm/shmem.c
index 9428c51ab2d6..00d5d025eece 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2646,10 +2646,10 @@ static int shmem_initxattrs(struct inode *inode,
 }
 
 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   void *buffer, size_t size)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
 {
-	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
 	return simple_xattr_get(&info->xattrs, name, buffer, size);
-- 
cgit v1.2.3


From 3c9d6296b7aee536a96ea2b53a15d23511738c1c Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 8 Apr 2016 12:20:30 +0200
Subject: security: drop the unused hook skb_owned_by

The skb_owned_by hook was added with the commit ca10b9e9a8ca
("selinux: add a skb_owned_by() hook") and later removed
when said commit was reverted.

Later on, when switching to list of hooks, a field named
'skb_owned_by' was included into the security_hook_head struct,
but without any users nor caller.

This commit removes the said left-over field.

Fixes: b1d9e6b0646d ("LSM: Switch to lists of hooks")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Paul Moore <pmoore@paul-moore.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h | 1 -
 security/security.c       | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index cdee11cbcdf1..ae2537886177 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1804,7 +1804,6 @@ struct security_hook_heads {
 	struct list_head tun_dev_attach_queue;
 	struct list_head tun_dev_attach;
 	struct list_head tun_dev_open;
-	struct list_head skb_owned_by;
 #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 	struct list_head xfrm_policy_alloc_security;
diff --git a/security/security.c b/security/security.c
index 3644b0344d29..554c3fb7d4a5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1848,7 +1848,6 @@ struct security_hook_heads security_hook_heads = {
 	.tun_dev_attach =
 		LIST_HEAD_INIT(security_hook_heads.tun_dev_attach),
 	.tun_dev_open =	LIST_HEAD_INIT(security_hook_heads.tun_dev_open),
-	.skb_owned_by =	LIST_HEAD_INIT(security_hook_heads.skb_owned_by),
 #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 	.xfrm_policy_alloc_security =
-- 
cgit v1.2.3


From ce23e640133484eebc20ca7b7668388213e11327 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 11 Apr 2016 00:48:00 -0400
Subject: ->getxattr(): pass dentry and inode as separate arguments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting                  |  6 ++++++
 .../staging/lustre/lustre/llite/llite_internal.h   |  4 ++--
 drivers/staging/lustre/lustre/llite/xattr.c        |  6 ++----
 fs/bad_inode.c                                     |  4 ++--
 fs/ceph/super.h                                    |  2 +-
 fs/ceph/xattr.c                                    |  8 ++++----
 fs/cifs/cifsfs.h                                   |  2 +-
 fs/cifs/xattr.c                                    |  6 +++---
 fs/ecryptfs/crypto.c                               |  5 ++++-
 fs/ecryptfs/ecryptfs_kernel.h                      |  4 ++--
 fs/ecryptfs/inode.c                                | 23 +++++++++++-----------
 fs/ecryptfs/mmap.c                                 |  3 ++-
 fs/fuse/dir.c                                      |  5 ++---
 fs/gfs2/inode.c                                    |  9 ++++-----
 fs/hfs/attr.c                                      |  5 ++---
 fs/hfs/hfs_fs.h                                    |  4 ++--
 fs/jfs/jfs_xattr.h                                 |  2 +-
 fs/jfs/xattr.c                                     |  8 ++++----
 fs/kernfs/inode.c                                  |  6 +++---
 fs/kernfs/kernfs-internal.h                        |  4 ++--
 fs/libfs.c                                         |  4 ++--
 fs/overlayfs/inode.c                               |  4 ++--
 fs/overlayfs/overlayfs.h                           |  4 ++--
 fs/overlayfs/super.c                               |  2 +-
 fs/ubifs/ubifs.h                                   |  4 ++--
 fs/ubifs/xattr.c                                   |  6 +++---
 fs/xattr.c                                         | 11 ++++++-----
 include/linux/fs.h                                 |  3 ++-
 include/linux/xattr.h                              |  2 +-
 net/socket.c                                       |  2 +-
 security/commoncap.c                               |  6 +++---
 security/integrity/evm/evm_main.c                  |  2 +-
 security/selinux/hooks.c                           |  9 +++++----
 security/smack/smack_lsm.c                         |  4 ++--
 34 files changed, 94 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index f1b87d8aa2da..57bb3754a027 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -525,3 +525,9 @@ in your dentry operations instead.
 	set_delayed_call() where it used to set *cookie.
 	->put_link() is gone - just give the destructor to set_delayed_call()
 	in ->get_link().
+--
+[mandatory]
+	->getxattr() and xattr_handler.get() get dentry and inode passed separately.
+	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
+	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
+	called before we attach dentry to inode.
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 3e1572cb457b..d28efd27af57 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -1042,8 +1042,8 @@ static inline __u64 ll_file_maxbytes(struct inode *inode)
 /* llite/xattr.c */
 int ll_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
-ssize_t ll_getxattr(struct dentry *dentry, const char *name,
-		    void *buffer, size_t size);
+ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode,
+		    const char *name, void *buffer, size_t size);
 ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int ll_removexattr(struct dentry *dentry, const char *name);
 
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
index b68dcc921ca2..c671f221c28c 100644
--- a/drivers/staging/lustre/lustre/llite/xattr.c
+++ b/drivers/staging/lustre/lustre/llite/xattr.c
@@ -451,11 +451,9 @@ out:
 	return rc;
 }
 
-ssize_t ll_getxattr(struct dentry *dentry, const char *name,
-		    void *buffer, size_t size)
+ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode,
+		    const char *name, void *buffer, size_t size)
 {
-	struct inode *inode = d_inode(dentry);
-
 	LASSERT(inode);
 	LASSERT(name);
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 103f5d7c3083..72e35b721608 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -106,8 +106,8 @@ static int bad_inode_setxattr(struct dentry *dentry, const char *name,
 	return -EIO;
 }
 
-static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
+static ssize_t bad_inode_getxattr(struct dentry *dentry, struct inode *inode,
+			const char *name, void *buffer, size_t size)
 {
 	return -EIO;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e705c4d612d7..beb893bb234f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -795,7 +795,7 @@ extern int ceph_setxattr(struct dentry *, const char *, const void *,
 int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
 ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
 int __ceph_removexattr(struct dentry *, const char *);
-extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern int ceph_removexattr(struct dentry *, const char *);
 extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9410abdef3ce..c6e917d360f7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -804,13 +804,13 @@ out:
 	return err;
 }
 
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
-		      size_t size)
+ssize_t ceph_getxattr(struct dentry *dentry, struct inode *inode,
+		      const char *name, void *value, size_t size)
 {
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return generic_getxattr(dentry, name, value, size);
+		return generic_getxattr(dentry, inode, name, value, size);
 
-	return __ceph_getxattr(d_inode(dentry), name, value, size);
+	return __ceph_getxattr(inode, name, value, size);
 }
 
 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 83aac8ba50b0..c89ecd7a5c39 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -123,7 +123,7 @@ extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
 extern int	cifs_removexattr(struct dentry *, const char *);
 extern int	cifs_setxattr(struct dentry *, const char *, const void *,
 			size_t, int);
-extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t	cifs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_CIFS_NFSD_EXPORT
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 159547c8a40b..5d57c85703a9 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -213,8 +213,8 @@ set_ea_exit:
 	return rc;
 }
 
-ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
-	void *ea_value, size_t buf_size)
+ssize_t cifs_getxattr(struct dentry *direntry, struct inode *inode,
+	const char *ea_name, void *ea_value, size_t buf_size)
 {
 	ssize_t rc = -EOPNOTSUPP;
 #ifdef CONFIG_CIFS_XATTR
@@ -296,7 +296,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				goto get_ea_exit; /* rc already EOPNOTSUPP */
 
 			pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
-					d_inode(direntry), full_path, &acllen);
+					inode, full_path, &acllen);
 			if (IS_ERR(pacl)) {
 				rc = PTR_ERR(pacl);
 				cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 64026e53722a..543a146ee019 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1369,7 +1369,9 @@ int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode)
 	ssize_t size;
 	int rc = 0;
 
-	size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME,
+	size = ecryptfs_getxattr_lower(lower_dentry,
+				       ecryptfs_inode_to_lower(ecryptfs_inode),
+				       ECRYPTFS_XATTR_NAME,
 				       page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE);
 	if (size < 0) {
 		if (unlikely(ecryptfs_verbosity > 0))
@@ -1391,6 +1393,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
 	int rc;
 
 	rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+				     ecryptfs_inode_to_lower(inode),
 				     ECRYPTFS_XATTR_NAME, file_size,
 				     ECRYPTFS_SIZE_AND_MARKER_BYTES);
 	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index d123fbaa28e0..6ff907f73331 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -607,8 +607,8 @@ ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			  unsigned char *src, struct dentry *ecryptfs_dentry);
 int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
 ssize_t
-ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
-			void *value, size_t size);
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
+			const char *name, void *value, size_t size);
 int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 121114e9a464..1ac631cd9d84 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1033,29 +1033,30 @@ out:
 }
 
 ssize_t
-ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
-			void *value, size_t size)
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
+			const char *name, void *value, size_t size)
 {
 	int rc = 0;
 
-	if (!d_inode(lower_dentry)->i_op->getxattr) {
+	if (!lower_inode->i_op->getxattr) {
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	inode_lock(d_inode(lower_dentry));
-	rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
-						   size);
-	inode_unlock(d_inode(lower_dentry));
+	inode_lock(lower_inode);
+	rc = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
+					 name, value, size);
+	inode_unlock(lower_inode);
 out:
 	return rc;
 }
 
 static ssize_t
-ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
-		  size_t size)
+ecryptfs_getxattr(struct dentry *dentry, struct inode *inode,
+		  const char *name, void *value, size_t size)
 {
-	return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name,
-				       value, size);
+	return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+				       ecryptfs_inode_to_lower(inode),
+				       name, value, size);
 }
 
 static ssize_t
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 1f5865263b3e..39e4381d3a65 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -436,7 +436,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 		goto out;
 	}
 	inode_lock(lower_inode);
-	size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+	size = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
+					   ECRYPTFS_XATTR_NAME,
 					   xattr_virt, PAGE_CACHE_SIZE);
 	if (size < 0)
 		size = 8;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4b855b65d457..b618527c05c6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1759,10 +1759,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	return err;
 }
 
-static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
-			     void *value, size_t size)
+static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode,
+			     const char *name, void *value, size_t size)
 {
-	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	FUSE_ARGS(args);
 	struct fuse_getxattr_in inarg;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index bb30f9a72c65..45f516cada78 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1968,22 +1968,21 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 	return ret;
 }
 
-static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
-			     void *data, size_t size)
+static ssize_t gfs2_getxattr(struct dentry *dentry, struct inode *inode,
+			     const char *name, void *data, size_t size)
 {
-	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int ret;
 
 	/* For selinux during lookup */
 	if (gfs2_glock_is_locked_by_me(ip->i_gl))
-		return generic_getxattr(dentry, name, data, size);
+		return generic_getxattr(dentry, inode, name, data, size);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret == 0) {
-		ret = generic_getxattr(dentry, name, data, size);
+		ret = generic_getxattr(dentry, inode, name, data, size);
 		gfs2_glock_dq(&gh);
 	}
 	gfs2_holder_uninit(&gh);
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 8d931b157bbe..064f92f17efc 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -56,10 +56,9 @@ out:
 	return res;
 }
 
-ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
-			 void *value, size_t size)
+ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode,
+		     const char *name, void *value, size_t size)
 {
-	struct inode *inode = d_inode(dentry);
 	struct hfs_find_data fd;
 	hfs_cat_rec rec;
 	struct hfs_cat_file *file;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 1f1c7dcbcc2f..79daa097929a 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,8 +213,8 @@ extern void hfs_delete_inode(struct inode *);
 /* attr.c */
 extern int hfs_setxattr(struct dentry *dentry, const char *name,
 			const void *value, size_t size, int flags);
-extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
-			    void *value, size_t size);
+extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
+			    const char *name, void *value, size_t size);
 extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* mdb.c */
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index e8d717dabca3..e69e14f3777b 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -57,7 +57,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
 extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
 			int);
 extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
-extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t jfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
 extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
 extern int jfs_removexattr(struct dentry *, const char *);
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 48b15a6e5558..5becc6a3ff8c 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -933,8 +933,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
 	return size;
 }
 
-ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
-		     size_t buf_size)
+ssize_t jfs_getxattr(struct dentry *dentry, struct inode *inode,
+		     const char *name, void *data, size_t buf_size)
 {
 	int err;
 
@@ -944,7 +944,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
 	 * for it via sb->s_xattr.
 	 */
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return generic_getxattr(dentry, name, data, buf_size);
+		return generic_getxattr(dentry, inode, name, data, buf_size);
 
 	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
 		/*
@@ -959,7 +959,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
 			return -EOPNOTSUPP;
 	}
 
-	err = __jfs_getxattr(d_inode(dentry), name, data, buf_size);
+	err = __jfs_getxattr(inode, name, data, buf_size);
 
 	return err;
 }
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 16405ae88d2d..b5247226732b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -208,10 +208,10 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
 	return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
 }
 
-ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
-			    size_t size)
+ssize_t kernfs_iop_getxattr(struct dentry *unused, struct inode *inode,
+			    const char *name, void *buf, size_t size)
 {
-	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_iattrs *attrs;
 
 	attrs = kernfs_iattrs(kn);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 6762bfbd8207..45c9192c276e 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -84,8 +84,8 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
 int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
 			size_t size, int flags);
 int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
-ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
-			    size_t size);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
+			    const char *name, void *buf, size_t size);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
 
 /*
diff --git a/fs/libfs.c b/fs/libfs.c
index 0ca80b2af420..03332f4bdedf 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1127,8 +1127,8 @@ static int empty_dir_setxattr(struct dentry *dentry, const char *name,
 	return -EOPNOTSUPP;
 }
 
-static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
-				  void *value, size_t size)
+static ssize_t empty_dir_getxattr(struct dentry *dentry, struct inode *inode,
+				  const char *name, void *value, size_t size)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a4ff5d0d7db9..c7b31a03dc9c 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -246,8 +246,8 @@ static bool ovl_need_xattr_filter(struct dentry *dentry,
 		return false;
 }
 
-ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
-		     void *value, size_t size)
+ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+		     const char *name, void *value, size_t size)
 {
 	struct path realpath;
 	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6a7090f4a441..99ec4b035237 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -173,8 +173,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr);
 int ovl_permission(struct inode *inode, int mask);
 int ovl_setxattr(struct dentry *dentry, const char *name,
 		 const void *value, size_t size, int flags);
-ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
-		     void *value, size_t size);
+ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+		     const char *name, void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
 int ovl_removexattr(struct dentry *dentry, const char *name);
 struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ef64984c9bbc..14cab381cece 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -274,7 +274,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
 	if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
 		return false;
 
-	res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
+	res = inode->i_op->getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1);
 	if (res == 1 && val == 'y')
 		return true;
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c2a57e193a81..536fb495f2f1 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1734,8 +1734,8 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 /* xattr.c */
 int ubifs_setxattr(struct dentry *dentry, const char *name,
 		   const void *value, size_t size, int flags);
-ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
-		       size_t size);
+ssize_t ubifs_getxattr(struct dentry *dentry, struct inode *host,
+		       const char *name, void *buf, size_t size);
 ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int ubifs_removexattr(struct dentry *dentry, const char *name);
 int ubifs_init_security(struct inode *dentry, struct inode *inode,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index b043e044121d..413d650c9476 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -372,10 +372,10 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
 	return setxattr(d_inode(dentry), name, value, size, flags);
 }
 
-ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
-		       size_t size)
+ssize_t ubifs_getxattr(struct dentry *dentry, struct inode *host,
+		       const char *name, void *buf, size_t size)
 {
-	struct inode *inode, *host = d_inode(dentry);
+	struct inode *inode;
 	struct ubifs_info *c = host->i_sb->s_fs_info;
 	struct qstr nm = QSTR_INIT(name, strlen(name));
 	struct ubifs_inode *ui;
diff --git a/fs/xattr.c b/fs/xattr.c
index 461ba45b7da9..b11945e15fde 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -192,7 +192,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 	if (!inode->i_op->getxattr)
 		return -EOPNOTSUPP;
 
-	error = inode->i_op->getxattr(dentry, name, NULL, 0);
+	error = inode->i_op->getxattr(dentry, inode, name, NULL, 0);
 	if (error < 0)
 		return error;
 
@@ -203,7 +203,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 		memset(value, 0, error + 1);
 	}
 
-	error = inode->i_op->getxattr(dentry, name, value, error);
+	error = inode->i_op->getxattr(dentry, inode, name, value, error);
 	*xattr_value = value;
 	return error;
 }
@@ -236,7 +236,7 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
 	}
 nolsm:
 	if (inode->i_op->getxattr)
-		error = inode->i_op->getxattr(dentry, name, value, size);
+		error = inode->i_op->getxattr(dentry, inode, name, value, size);
 	else
 		error = -EOPNOTSUPP;
 
@@ -691,14 +691,15 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
  * Find the handler for the prefix and dispatch its get() operation.
  */
 ssize_t
-generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
+generic_getxattr(struct dentry *dentry, struct inode *inode,
+		 const char *name, void *buffer, size_t size)
 {
 	const struct xattr_handler *handler;
 
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->get(handler, dentry, d_inode(dentry),
+	return handler->get(handler, dentry, inode,
 			    name, buffer, size);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 329ed372d708..1b5fcaeea827 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1702,7 +1702,8 @@ struct inode_operations {
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
-	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+	ssize_t (*getxattr) (struct dentry *, struct inode *,
+			     const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index c11c022298b9..1cc4c578deb9 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -52,7 +52,7 @@ int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, i
 int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
 int vfs_removexattr(struct dentry *, const char *);
 
-ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size);
+ssize_t generic_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size);
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
 int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags);
 int generic_removexattr(struct dentry *dentry, const char *name);
diff --git a/net/socket.c b/net/socket.c
index 5f77a8e93830..35e4523edada 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -466,7 +466,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
 #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
 #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
-static ssize_t sockfs_getxattr(struct dentry *dentry,
+static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode,
 			       const char *name, void *value, size_t size)
 {
 	const char *proto_name;
diff --git a/security/commoncap.c b/security/commoncap.c
index 48071ed7c445..a042077312a5 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -313,7 +313,7 @@ int cap_inode_need_killpriv(struct dentry *dentry)
 	if (!inode->i_op->getxattr)
 	       return 0;
 
-	error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
+	error = inode->i_op->getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
 	if (error <= 0)
 		return 0;
 	return 1;
@@ -397,8 +397,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 	if (!inode || !inode->i_op->getxattr)
 		return -ENODATA;
 
-	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps,
-				   XATTR_CAPS_SZ);
+	size = inode->i_op->getxattr((struct dentry *)dentry, inode,
+				     XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
 	if (size == -ENODATA || size == -EOPNOTSUPP)
 		/* no data, that's ok */
 		return -ENODATA;
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 84c6d11fc096..b9e26288d30c 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -82,7 +82,7 @@ static int evm_find_protected_xattrs(struct dentry *dentry)
 		return -EOPNOTSUPP;
 
 	for (xattr = evm_config_xattrnames; *xattr != NULL; xattr++) {
-		error = inode->i_op->getxattr(dentry, *xattr, NULL, 0);
+		error = inode->i_op->getxattr(dentry, inode, *xattr, NULL, 0);
 		if (error < 0) {
 			if (error == -ENODATA)
 				continue;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 889cd59ca5a7..469f5c75bd4b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -506,7 +506,8 @@ static int sb_finish_set_opts(struct super_block *sb)
 			rc = -EOPNOTSUPP;
 			goto out;
 		}
-		rc = root_inode->i_op->getxattr(root, XATTR_NAME_SELINUX, NULL, 0);
+		rc = root_inode->i_op->getxattr(root, root_inode,
+						XATTR_NAME_SELINUX, NULL, 0);
 		if (rc < 0 && rc != -ENODATA) {
 			if (rc == -EOPNOTSUPP)
 				printk(KERN_WARNING "SELinux: (dev %s, type "
@@ -1412,13 +1413,13 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 			goto out_unlock;
 		}
 		context[len] = '\0';
-		rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX,
+		rc = inode->i_op->getxattr(dentry, inode, XATTR_NAME_SELINUX,
 					   context, len);
 		if (rc == -ERANGE) {
 			kfree(context);
 
 			/* Need a larger buffer.  Query for the right size. */
-			rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX,
+			rc = inode->i_op->getxattr(dentry, inode, XATTR_NAME_SELINUX,
 						   NULL, 0);
 			if (rc < 0) {
 				dput(dentry);
@@ -1432,7 +1433,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 				goto out_unlock;
 			}
 			context[len] = '\0';
-			rc = inode->i_op->getxattr(dentry,
+			rc = inode->i_op->getxattr(dentry, inode,
 						   XATTR_NAME_SELINUX,
 						   context, len);
 		}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 50bcca26c0b7..ff2b8c3cf7a9 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -272,7 +272,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip,
 	if (buffer == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	rc = ip->i_op->getxattr(dp, name, buffer, SMK_LONGLABEL);
+	rc = ip->i_op->getxattr(dp, ip, name, buffer, SMK_LONGLABEL);
 	if (rc < 0)
 		skp = ERR_PTR(rc);
 	else if (rc == 0)
@@ -3519,7 +3519,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 					TRANS_TRUE, TRANS_TRUE_SIZE,
 					0);
 			} else {
-				rc = inode->i_op->getxattr(dp,
+				rc = inode->i_op->getxattr(dp, inode,
 					XATTR_NAME_SMACKTRANSMUTE, trattr,
 					TRANS_TRUE_SIZE);
 				if (rc >= 0 && strncmp(trattr, TRANS_TRUE,
-- 
cgit v1.2.3


From adae28c59a6a71a971ffed713550911546df0e20 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 23 Mar 2016 11:36:53 +0100
Subject: mfd: syscon: Include errno.h from header

The syscon header uses the ENOTSUPP error constant, but doesn't
include the header that defines it. This causes a build error
after the imx pinctrl driver started using syscon:

include/linux/mfd/syscon.h: In function 'syscon_node_to_regmap':
include/linux/mfd/syscon.h:32:18: error: 'ENOTSUPP' undeclared (first use in this function)
  return ERR_PTR(-ENOTSUPP);
                  ^~~~~~~~

This adds the missing #include.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 8626ada871f1 ("pinctrl: imx: attach iomuxc device to gpr syscon")
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/syscon.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h
index 1088149be0c9..40a76b97b7ab 100644
--- a/include/linux/mfd/syscon.h
+++ b/include/linux/mfd/syscon.h
@@ -16,6 +16,7 @@
 #define __LINUX_MFD_SYSCON_H__
 
 #include <linux/err.h>
+#include <linux/errno.h>
 
 struct device_node;
 
-- 
cgit v1.2.3


From a16d6ebca6efb73f6402f36e5aebf84f61721856 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Sun, 3 Apr 2016 20:44:45 +0200
Subject: i2c: introduce helper function to get 8 bit address from a message

Drivers do this in various ways, let's use one standard way of doing it.
Note: I2C_M_RD is bit 0, so the code could be simplified. To be extremly
robust and to advertise good coding practices, I still use the ternary
operator and let the compilers do the optimizing job.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 200cf13b00f6..c30833b7b073 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -654,6 +654,11 @@ static inline int i2c_adapter_id(struct i2c_adapter *adap)
 	return adap->nr;
 }
 
+static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg)
+{
+	return (msg->addr << 1) | (msg->flags & I2C_M_RD ? 1 : 0);
+}
+
 /**
  * module_i2c_driver() - Helper macro for registering a modular I2C driver
  * @__i2c_driver: i2c_driver struct
-- 
cgit v1.2.3


From f0af9593372abfde34460aa1250e670cc535a7d8 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 24 Feb 2016 13:43:45 -0600
Subject: PCI: Add pci_add_dma_alias() to abstract implementation

Add a pci_add_dma_alias() interface to encapsulate the details of adding an
alias.  No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/pci/pci.c    | 14 ++++++++++++++
 drivers/pci/quirks.c | 19 +++++++------------
 include/linux/pci.h  |  1 +
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 25e0327d4429..1162118d1093 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4578,6 +4578,20 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
 	return 0;
 }
 
+/**
+ * pci_add_dma_alias - Add a DMA devfn alias for a device
+ * @dev: the PCI device for which alias is added
+ * @devfn: alias slot and function
+ *
+ * This helper encodes 8-bit devfn as bit number in dma_alias_mask.
+ * It should be called early, preferably as PCI fixup header quirk.
+ */
+void pci_add_dma_alias(struct pci_dev *dev, u8 devfn)
+{
+	dev->dma_alias_devfn = devfn;
+	dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
+}
+
 bool pci_device_is_present(struct pci_dev *pdev)
 {
 	u32 v;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 8e678027b900..e45a7a8338bb 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3610,10 +3610,8 @@ int pci_dev_specific_reset(struct pci_dev *dev, int probe)
 
 static void quirk_dma_func0_alias(struct pci_dev *dev)
 {
-	if (PCI_FUNC(dev->devfn) != 0) {
-		dev->dma_alias_devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
-		dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
-	}
+	if (PCI_FUNC(dev->devfn) != 0)
+		pci_add_dma_alias(dev, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
 }
 
 /*
@@ -3626,10 +3624,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_RICOH, 0xe476, quirk_dma_func0_alias);
 
 static void quirk_dma_func1_alias(struct pci_dev *dev)
 {
-	if (PCI_FUNC(dev->devfn) != 1) {
-		dev->dma_alias_devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 1);
-		dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
-	}
+	if (PCI_FUNC(dev->devfn) != 1)
+		pci_add_dma_alias(dev, PCI_DEVFN(PCI_SLOT(dev->devfn), 1));
 }
 
 /*
@@ -3696,11 +3692,10 @@ static void quirk_fixed_dma_alias(struct pci_dev *dev)
 
 	id = pci_match_id(fixed_dma_alias_tbl, dev);
 	if (id) {
-		dev->dma_alias_devfn = id->driver_data;
-		dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
+		pci_add_dma_alias(dev, id->driver_data);
 		dev_info(&dev->dev, "Enabling fixed DMA alias to %02x.%d\n",
-			 PCI_SLOT(dev->dma_alias_devfn),
-			 PCI_FUNC(dev->dma_alias_devfn));
+			 PCI_SLOT(id->driver_data),
+			 PCI_FUNC(id->driver_data));
 	}
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 004b8133417d..7e7019064437 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1988,6 +1988,7 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
 }
 #endif
 
+void pci_add_dma_alias(struct pci_dev *dev, u8 devfn);
 int pci_for_each_dma_alias(struct pci_dev *pdev,
 			   int (*fn)(struct pci_dev *pdev,
 				     u16 alias, void *data), void *data);
-- 
cgit v1.2.3


From 338c3149a221527e202ee26b1e35f76c965bb6c0 Mon Sep 17 00:00:00 2001
From: Jacek Lawrynowicz <jacek.lawrynowicz@intel.com>
Date: Thu, 3 Mar 2016 15:38:02 +0100
Subject: PCI: Add support for multiple DMA aliases

Solve IOMMU support issues with PCIe non-transparent bridges that use
Requester ID look-up tables (RID-LUT), e.g., the PEX8733.

The NTB connects devices in two independent PCI domains.  Devices separated
by the NTB are not able to discover each other.  A PCI packet being
forwared from one domain to another has to have its RID modified so it
appears on correct bus and completions are forwarded back to the original
domain through the NTB.  The RID is translated using a preprogrammed table
(LUT) and the PCI packet propagates upstream away from the NTB.  If the
destination system has IOMMU enabled, the packet will be discarded because
the new RID is unknown to the IOMMU.  Adding a DMA alias for the new RID
allows IOMMU to properly recognize the packet.

Each device behind the NTB has a unique RID assigned in the RID-LUT.  The
current DMA alias implementation supports only a single alias, so it's not
possible to support mutiple devices behind the NTB when IOMMU is enabled.

Enable all possible aliases on a given bus (256) that are stored in a
bitset.  Alias devfn is directly translated to a bit number.  The bitset is
not allocated for devices that have no need for DMA aliases.

More details can be found in the following article:
http://www.plxtech.com/files/pdf/technical/expresslane/RTC_Enabling%20MulitHostSystemDesigns.pdf

Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Acked-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 10 +++-------
 drivers/pci/pci.c     | 19 +++++++++++++++++--
 drivers/pci/probe.c   |  1 +
 drivers/pci/search.c  | 14 +++++++++-----
 include/linux/pci.h   |  5 ++---
 5 files changed, 32 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index bfd4f7c3b1d8..1b49e940a318 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -660,8 +660,8 @@ static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev,
 }
 
 /*
- * Look for aliases to or from the given device for exisiting groups.  The
- * dma_alias_devfn only supports aliases on the same bus, therefore the search
+ * Look for aliases to or from the given device for existing groups. DMA
+ * aliases are only supported on the same bus, therefore the search
  * space is quite small (especially since we're really only looking at pcie
  * device, and therefore only expect multiple slots on the root complex or
  * downstream switch ports).  It's conceivable though that a pair of
@@ -686,11 +686,7 @@ static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
 			continue;
 
 		/* We alias them or they alias us */
-		if (((pdev->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN) &&
-		     pdev->dma_alias_devfn == tmp->devfn) ||
-		    ((tmp->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN) &&
-		     tmp->dma_alias_devfn == pdev->devfn)) {
-
+		if (pci_devs_are_dma_aliases(pdev, tmp)) {
 			group = get_pci_alias_group(tmp, devfns);
 			if (group) {
 				pci_dev_put(tmp);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c82ebd0f6982..0b90c2186f1c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4588,12 +4588,27 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
  */
 void pci_add_dma_alias(struct pci_dev *dev, u8 devfn)
 {
-	dev->dma_alias_devfn = devfn;
-	dev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
+	if (!dev->dma_alias_mask)
+		dev->dma_alias_mask = kcalloc(BITS_TO_LONGS(U8_MAX),
+					      sizeof(long), GFP_KERNEL);
+	if (!dev->dma_alias_mask) {
+		dev_warn(&dev->dev, "Unable to allocate DMA alias mask\n");
+		return;
+	}
+
+	set_bit(devfn, dev->dma_alias_mask);
 	dev_info(&dev->dev, "Enabling fixed DMA alias to %02x.%d\n",
 		 PCI_SLOT(devfn), PCI_FUNC(devfn));
 }
 
+bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2)
+{
+	return (dev1->dma_alias_mask &&
+		test_bit(dev2->devfn, dev1->dma_alias_mask)) ||
+	       (dev2->dma_alias_mask &&
+		test_bit(dev1->devfn, dev2->dma_alias_mask));
+}
+
 bool pci_device_is_present(struct pci_dev *pdev)
 {
 	u32 v;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 8004f67c57ec..ae7daeb83e21 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1537,6 +1537,7 @@ static void pci_release_dev(struct device *dev)
 	pcibios_release_device(pci_dev);
 	pci_bus_put(pci_dev->bus);
 	kfree(pci_dev->driver_override);
+	kfree(pci_dev->dma_alias_mask);
 	kfree(pci_dev);
 }
 
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index a20ce7d5e2a7..33e0f033a48e 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -40,11 +40,15 @@ int pci_for_each_dma_alias(struct pci_dev *pdev,
 	 * If the device is broken and uses an alias requester ID for
 	 * DMA, iterate over that too.
 	 */
-	if (unlikely(pdev->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN)) {
-		ret = fn(pdev, PCI_DEVID(pdev->bus->number,
-					 pdev->dma_alias_devfn), data);
-		if (ret)
-			return ret;
+	if (unlikely(pdev->dma_alias_mask)) {
+		u8 devfn;
+
+		for_each_set_bit(devfn, pdev->dma_alias_mask, U8_MAX) {
+			ret = fn(pdev, PCI_DEVID(pdev->bus->number, devfn),
+				 data);
+			if (ret)
+				return ret;
+		}
 	}
 
 	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7e7019064437..5581d05f7833 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -166,8 +166,6 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) (1 << 2),
 	/* Flag for quirk use to store if quirk-specific ACS is enabled */
 	PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) (1 << 3),
-	/* Flag to indicate the device uses dma_alias_devfn */
-	PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4),
 	/* Use a PCIe-to-PCI bridge alias even if !pci_is_pcie */
 	PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5),
 	/* Do not use bus resets for device */
@@ -273,7 +271,7 @@ struct pci_dev {
 	u8		rom_base_reg;	/* which config register controls the ROM */
 	u8		pin;		/* which interrupt pin this device uses */
 	u16		pcie_flags_reg;	/* cached PCIe Capabilities Register */
-	u8		dma_alias_devfn;/* devfn of DMA alias, if any */
+	unsigned long	*dma_alias_mask;/* mask of enabled devfn aliases */
 
 	struct pci_driver *driver;	/* which driver has allocated this device */
 	u64		dma_mask;	/* Mask of the bits of bus address this
@@ -1989,6 +1987,7 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
 #endif
 
 void pci_add_dma_alias(struct pci_dev *dev, u8 devfn);
+bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2);
 int pci_for_each_dma_alias(struct pci_dev *pdev,
 			   int (*fn)(struct pci_dev *pdev,
 				     u16 alias, void *data), void *data);
-- 
cgit v1.2.3


From 5ac7eace2d00eab5ae0e9fdee63e38aee6001f7c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: KEYS: Add a facility to restrict new links into a keyring

Add a facility whereby proposed new links to be added to a keyring can be
vetted, permitting them to be rejected if necessary.  This can be used to
block public keys from which the signature cannot be verified or for which
the signature verification fails.  It could also be used to provide
blacklisting.

This affects operations like add_key(), KEYCTL_LINK and KEYCTL_INSTANTIATE.

To this end:

 (1) A function pointer is added to the key struct that, if set, points to
     the vetting function.  This is called as:

	int (*restrict_link)(struct key *keyring,
			     const struct key_type *key_type,
			     unsigned long key_flags,
			     const union key_payload *key_payload),

     where 'keyring' will be the keyring being added to, key_type and
     key_payload will describe the key being added and key_flags[*] can be
     AND'ed with KEY_FLAG_TRUSTED.

     [*] This parameter will be removed in a later patch when
     	 KEY_FLAG_TRUSTED is removed.

     The function should return 0 to allow the link to take place or an
     error (typically -ENOKEY, -ENOPKG or -EKEYREJECTED) to reject the
     link.

     The pointer should not be set directly, but rather should be set
     through keyring_alloc().

     Note that if called during add_key(), preparse is called before this
     method, but a key isn't actually allocated until after this function
     is called.

 (2) KEY_ALLOC_BYPASS_RESTRICTION is added.  This can be passed to
     key_create_or_update() or key_instantiate_and_link() to bypass the
     restriction check.

 (3) KEY_FLAG_TRUSTED_ONLY is removed.  The entire contents of a keyring
     with this restriction emplaced can be considered 'trustworthy' by
     virtue of being in the keyring when that keyring is consulted.

 (4) key_alloc() and keyring_alloc() take an extra argument that will be
     used to set restrict_link in the new key.  This ensures that the
     pointer is set before the key is published, thus preventing a window
     of unrestrictedness.  Normally this argument will be NULL.

 (5) As a temporary affair, keyring_restrict_trusted_only() is added.  It
     should be passed to keyring_alloc() as the extra argument instead of
     setting KEY_FLAG_TRUSTED_ONLY on a keyring.  This will be replaced in
     a later patch with functions that look in the appropriate places for
     authoritative keys.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 Documentation/security/keys.txt  | 22 ++++++++++++
 certs/system_keyring.c           |  8 ++---
 fs/cifs/cifsacl.c                |  2 +-
 fs/nfs/nfs4idmap.c               |  2 +-
 include/linux/key.h              | 53 +++++++++++++++++++++++------
 net/dns_resolver/dns_key.c       |  2 +-
 net/rxrpc/ar-key.c               |  4 +--
 security/integrity/digsig.c      |  7 ++--
 security/integrity/ima/ima_mok.c |  8 ++---
 security/keys/key.c              | 43 +++++++++++++++++++----
 security/keys/keyring.c          | 73 ++++++++++++++++++++++++++++++++++++----
 security/keys/persistent.c       |  4 +--
 security/keys/process_keys.c     | 16 +++++----
 security/keys/request_key.c      |  4 +--
 security/keys/request_key_auth.c |  2 +-
 15 files changed, 198 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 8c183873b2b7..a6a50b359025 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -999,6 +999,10 @@ payload contents" for more information.
 	struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 				  const struct cred *cred,
 				  key_perm_t perm,
+				  int (*restrict_link)(struct key *,
+						       const struct key_type *,
+						       unsigned long,
+						       const union key_payload *),
 				  unsigned long flags,
 				  struct key *dest);
 
@@ -1010,6 +1014,24 @@ payload contents" for more information.
     KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted
     towards the user's quota).  Error ENOMEM can also be returned.
 
+    If restrict_link not NULL, it should point to a function that will be
+    called each time an attempt is made to link a key into the new keyring.
+    This function is called to check whether a key may be added into the keying
+    or not.  Callers of key_create_or_update() within the kernel can pass
+    KEY_ALLOC_BYPASS_RESTRICTION to suppress the check.  An example of using
+    this is to manage rings of cryptographic keys that are set up when the
+    kernel boots where userspace is also permitted to add keys - provided they
+    can be verified by a key the kernel already has.
+
+    When called, the restriction function will be passed the keyring being
+    added to, the key flags value and the type and payload of the key being
+    added.  Note that when a new key is being created, this is called between
+    payload preparsing and actual key creation.  The function should return 0
+    to allow the link or an error to reject it.
+
+    A convenience function, restrict_link_reject, exists to always return
+    -EPERM to in this case.
+
 
 (*) To check the validity of a key, this function can be called:
 
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index dc18869ff680..417d65882870 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -36,11 +36,10 @@ static __init int system_trusted_keyring_init(void)
 			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
 			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
-			      KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			      KEY_ALLOC_NOT_IN_QUOTA,
+			      keyring_restrict_trusted_only, NULL);
 	if (IS_ERR(system_trusted_keyring))
 		panic("Can't allocate system trusted keyring\n");
-
-	set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
 	return 0;
 }
 
@@ -85,7 +84,8 @@ static __init int load_system_certificate_list(void)
 					   KEY_USR_VIEW | KEY_USR_READ),
 					   KEY_ALLOC_NOT_IN_QUOTA |
 					   KEY_ALLOC_TRUSTED |
-					   KEY_ALLOC_BUILT_IN);
+					   KEY_ALLOC_BUILT_IN |
+					   KEY_ALLOC_BYPASS_RESTRICTION);
 		if (IS_ERR(key)) {
 			pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
 			       PTR_ERR(key));
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 3f93125916bf..71e8a56e9479 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -360,7 +360,7 @@ init_cifs_idmap(void)
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto failed_put_cred;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 5ba22c6b0ffa..c444285bb1b1 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -201,7 +201,7 @@ int nfs_idmap_init(void)
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto failed_put_cred;
diff --git a/include/linux/key.h b/include/linux/key.h
index 5f5b1129dc92..83b603639d2e 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -174,10 +174,9 @@ struct key {
 #define KEY_FLAG_ROOT_CAN_CLEAR	6	/* set if key can be cleared by root without permission */
 #define KEY_FLAG_INVALIDATED	7	/* set if key has been invalidated */
 #define KEY_FLAG_TRUSTED	8	/* set if key is trusted */
-#define KEY_FLAG_TRUSTED_ONLY	9	/* set if keyring only accepts links to trusted keys */
-#define KEY_FLAG_BUILTIN	10	/* set if key is builtin */
-#define KEY_FLAG_ROOT_CAN_INVAL	11	/* set if key can be invalidated by root without permission */
-#define KEY_FLAG_KEEP		12	/* set if key should not be removed */
+#define KEY_FLAG_BUILTIN	9	/* set if key is built in to the kernel */
+#define KEY_FLAG_ROOT_CAN_INVAL	10	/* set if key can be invalidated by root without permission */
+#define KEY_FLAG_KEEP		11	/* set if key should not be removed */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -205,6 +204,21 @@ struct key {
 		};
 		int reject_error;
 	};
+
+	/* This is set on a keyring to restrict the addition of a link to a key
+	 * to it.  If this method isn't provided then it is assumed that the
+	 * keyring is open to any addition.  It is ignored for non-keyring
+	 * keys.
+	 *
+	 * This is intended for use with rings of trusted keys whereby addition
+	 * to the keyring needs to be controlled.  KEY_ALLOC_BYPASS_RESTRICTION
+	 * overrides this, allowing the kernel to add extra keys without
+	 * restriction.
+	 */
+	int (*restrict_link)(struct key *keyring,
+			     const struct key_type *type,
+			     unsigned long flags,
+			     const union key_payload *payload);
 };
 
 extern struct key *key_alloc(struct key_type *type,
@@ -212,14 +226,19 @@ extern struct key *key_alloc(struct key_type *type,
 			     kuid_t uid, kgid_t gid,
 			     const struct cred *cred,
 			     key_perm_t perm,
-			     unsigned long flags);
+			     unsigned long flags,
+			     int (*restrict_link)(struct key *,
+						  const struct key_type *,
+						  unsigned long,
+						  const union key_payload *));
 
 
-#define KEY_ALLOC_IN_QUOTA	0x0000	/* add to quota, reject if would overrun */
-#define KEY_ALLOC_QUOTA_OVERRUN	0x0001	/* add to quota, permit even if overrun */
-#define KEY_ALLOC_NOT_IN_QUOTA	0x0002	/* not in quota */
-#define KEY_ALLOC_TRUSTED	0x0004	/* Key should be flagged as trusted */
-#define KEY_ALLOC_BUILT_IN	0x0008	/* Key is built into kernel */
+#define KEY_ALLOC_IN_QUOTA		0x0000	/* add to quota, reject if would overrun */
+#define KEY_ALLOC_QUOTA_OVERRUN		0x0001	/* add to quota, permit even if overrun */
+#define KEY_ALLOC_NOT_IN_QUOTA		0x0002	/* not in quota */
+#define KEY_ALLOC_TRUSTED		0x0004	/* Key should be flagged as trusted */
+#define KEY_ALLOC_BUILT_IN		0x0008	/* Key is built into kernel */
+#define KEY_ALLOC_BYPASS_RESTRICTION	0x0010	/* Override the check on restricted keyrings */
 
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
@@ -288,8 +307,22 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid
 				 const struct cred *cred,
 				 key_perm_t perm,
 				 unsigned long flags,
+				 int (*restrict_link)(struct key *,
+						      const struct key_type *,
+						      unsigned long,
+						      const union key_payload *),
 				 struct key *dest);
 
+extern int keyring_restrict_trusted_only(struct key *keyring,
+					 const struct key_type *type,
+					 unsigned long,
+					 const union key_payload *payload);
+
+extern int restrict_link_reject(struct key *keyring,
+				const struct key_type *type,
+				unsigned long flags,
+				const union key_payload *payload);
+
 extern int keyring_clear(struct key *keyring);
 
 extern key_ref_t keyring_search(key_ref_t keyring,
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index c79b85eb4d4c..8737412c7b27 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -281,7 +281,7 @@ static int __init init_dns_resolver(void)
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto failed_put_cred;
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 3fb492eedeb9..1021b4c0bdd2 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 
 	key = key_alloc(&key_type_rxrpc, "x",
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
-			KEY_ALLOC_NOT_IN_QUOTA);
+			KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
 		return -ENOMEM;
@@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
 
 	key = key_alloc(&key_type_rxrpc, keyname,
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key))
 		return key;
 
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index 8ef15118cc78..659566c2200b 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -83,10 +83,9 @@ int __init integrity_init_keyring(const unsigned int id)
 				    ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				     KEY_USR_VIEW | KEY_USR_READ |
 				     KEY_USR_WRITE | KEY_USR_SEARCH),
-				    KEY_ALLOC_NOT_IN_QUOTA, NULL);
-	if (!IS_ERR(keyring[id]))
-		set_bit(KEY_FLAG_TRUSTED_ONLY, &keyring[id]->flags);
-	else {
+				    KEY_ALLOC_NOT_IN_QUOTA,
+				    NULL, NULL);
+	if (IS_ERR(keyring[id])) {
 		err = PTR_ERR(keyring[id]);
 		pr_info("Can't allocate %s keyring (%d)\n",
 			keyring_name[id], err);
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index 676885e4320e..ef91248cb934 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -35,20 +35,20 @@ __init int ima_mok_init(void)
 			      (KEY_POS_ALL & ~KEY_POS_SETATTR) |
 			      KEY_USR_VIEW | KEY_USR_READ |
 			      KEY_USR_WRITE | KEY_USR_SEARCH,
-			      KEY_ALLOC_NOT_IN_QUOTA, NULL);
+			      KEY_ALLOC_NOT_IN_QUOTA,
+			      keyring_restrict_trusted_only, NULL);
 
 	ima_blacklist_keyring = keyring_alloc(".ima_blacklist",
 				KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ |
 				KEY_USR_WRITE | KEY_USR_SEARCH,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+				KEY_ALLOC_NOT_IN_QUOTA,
+				keyring_restrict_trusted_only, NULL);
 
 	if (IS_ERR(ima_mok_keyring) || IS_ERR(ima_blacklist_keyring))
 		panic("Can't allocate IMA MOK or blacklist keyrings.");
-	set_bit(KEY_FLAG_TRUSTED_ONLY, &ima_mok_keyring->flags);
 
-	set_bit(KEY_FLAG_TRUSTED_ONLY, &ima_blacklist_keyring->flags);
 	set_bit(KEY_FLAG_KEEP, &ima_blacklist_keyring->flags);
 	return 0;
 }
diff --git a/security/keys/key.c b/security/keys/key.c
index b28755131687..deb881754e03 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -201,6 +201,7 @@ serial_exists:
  * @cred: The credentials specifying UID namespace.
  * @perm: The permissions mask of the new key.
  * @flags: Flags specifying quota properties.
+ * @restrict_link: Optional link restriction method for new keyrings.
  *
  * Allocate a key of the specified type with the attributes given.  The key is
  * returned in an uninstantiated state and the caller needs to instantiate the
@@ -223,7 +224,11 @@ serial_exists:
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
 		      kuid_t uid, kgid_t gid, const struct cred *cred,
-		      key_perm_t perm, unsigned long flags)
+		      key_perm_t perm, unsigned long flags,
+		      int (*restrict_link)(struct key *,
+					   const struct key_type *,
+					   unsigned long,
+					   const union key_payload *))
 {
 	struct key_user *user = NULL;
 	struct key *key;
@@ -291,6 +296,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->uid = uid;
 	key->gid = gid;
 	key->perm = perm;
+	key->restrict_link = restrict_link;
 
 	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA))
 		key->flags |= 1 << KEY_FLAG_IN_QUOTA;
@@ -496,6 +502,12 @@ int key_instantiate_and_link(struct key *key,
 	}
 
 	if (keyring) {
+		if (keyring->restrict_link) {
+			ret = keyring->restrict_link(keyring, key->type,
+						     key->flags, &prep.payload);
+			if (ret < 0)
+				goto error;
+		}
 		ret = __key_link_begin(keyring, &key->index_key, &edit);
 		if (ret < 0)
 			goto error;
@@ -551,8 +563,12 @@ int key_reject_and_link(struct key *key,
 	awaken = 0;
 	ret = -EBUSY;
 
-	if (keyring)
+	if (keyring) {
+		if (keyring->restrict_link)
+			return -EPERM;
+
 		link_ret = __key_link_begin(keyring, &key->index_key, &edit);
+	}
 
 	mutex_lock(&key_construction_mutex);
 
@@ -793,6 +809,10 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	struct key *keyring, *key = NULL;
 	key_ref_t key_ref;
 	int ret;
+	int (*restrict_link)(struct key *,
+			     const struct key_type *,
+			     unsigned long,
+			     const union key_payload *) = NULL;
 
 	/* look up the key type to see if it's one of the registered kernel
 	 * types */
@@ -811,6 +831,10 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 
 	key_check(keyring);
 
+	key_ref = ERR_PTR(-EPERM);
+	if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION))
+		restrict_link = keyring->restrict_link;
+
 	key_ref = ERR_PTR(-ENOTDIR);
 	if (keyring->type != &key_type_keyring)
 		goto error_put_type;
@@ -835,10 +859,15 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	}
 	index_key.desc_len = strlen(index_key.description);
 
-	key_ref = ERR_PTR(-EPERM);
-	if (!prep.trusted && test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags))
-		goto error_free_prep;
-	flags |= prep.trusted ? KEY_ALLOC_TRUSTED : 0;
+	if (restrict_link) {
+		unsigned long kflags = prep.trusted ? KEY_FLAG_TRUSTED : 0;
+		ret = restrict_link(keyring,
+				    index_key.type, kflags, &prep.payload);
+		if (ret < 0) {
+			key_ref = ERR_PTR(ret);
+			goto error_free_prep;
+		}
+	}
 
 	ret = __key_link_begin(keyring, &index_key, &edit);
 	if (ret < 0) {
@@ -879,7 +908,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 
 	/* allocate a new key */
 	key = key_alloc(index_key.type, index_key.description,
-			cred->fsuid, cred->fsgid, cred, perm, flags);
+			cred->fsuid, cred->fsgid, cred, perm, flags, NULL);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
 		goto error_link_end;
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index f931ccfeefb0..d2d1f3378008 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -491,13 +491,18 @@ static long keyring_read(const struct key *keyring,
  */
 struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 			  const struct cred *cred, key_perm_t perm,
-			  unsigned long flags, struct key *dest)
+			  unsigned long flags,
+			  int (*restrict_link)(struct key *,
+					       const struct key_type *,
+					       unsigned long,
+					       const union key_payload *),
+			  struct key *dest)
 {
 	struct key *keyring;
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid, cred, perm, flags);
+			    uid, gid, cred, perm, flags, restrict_link);
 	if (!IS_ERR(keyring)) {
 		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
 		if (ret < 0) {
@@ -510,6 +515,51 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 }
 EXPORT_SYMBOL(keyring_alloc);
 
+/**
+ * keyring_restrict_trusted_only - Restrict additions to a keyring to trusted keys only
+ * @keyring: The keyring being added to.
+ * @type: The type of key being added.
+ * @flags: The key flags.
+ * @payload: The payload of the key intended to be added.
+ *
+ * Reject the addition of any links to a keyring that point to keys that aren't
+ * marked as being trusted.  It can be overridden by passing
+ * KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when adding a key
+ * to a keyring.
+ *
+ * This is meant to be passed as the restrict_link parameter to
+ * keyring_alloc().
+ */
+int keyring_restrict_trusted_only(struct key *keyring,
+				  const struct key_type *type,
+				  unsigned long flags,
+				  const union key_payload *payload)
+{
+	return flags & KEY_FLAG_TRUSTED ? 0 : -EPERM;
+}
+
+/**
+ * restrict_link_reject - Give -EPERM to restrict link
+ * @keyring: The keyring being added to.
+ * @type: The type of key being added.
+ * @flags: The key flags.
+ * @payload: The payload of the key intended to be added.
+ *
+ * Reject the addition of any links to a keyring.  It can be overridden by
+ * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when
+ * adding a key to a keyring.
+ *
+ * This is meant to be passed as the restrict_link parameter to
+ * keyring_alloc().
+ */
+int restrict_link_reject(struct key *keyring,
+			 const struct key_type *type,
+			 unsigned long flags,
+			 const union key_payload *payload)
+{
+	return -EPERM;
+}
+
 /*
  * By default, we keys found by getting an exact match on their descriptions.
  */
@@ -1191,6 +1241,17 @@ void __key_link_end(struct key *keyring,
 	up_write(&keyring->sem);
 }
 
+/*
+ * Check addition of keys to restricted keyrings.
+ */
+static int __key_link_check_restriction(struct key *keyring, struct key *key)
+{
+	if (!keyring->restrict_link)
+		return 0;
+	return keyring->restrict_link(keyring,
+				      key->type, key->flags, &key->payload);
+}
+
 /**
  * key_link - Link a key to a keyring
  * @keyring: The keyring to make the link in.
@@ -1221,14 +1282,12 @@ int key_link(struct key *keyring, struct key *key)
 	key_check(keyring);
 	key_check(key);
 
-	if (test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags) &&
-	    !test_bit(KEY_FLAG_TRUSTED, &key->flags))
-		return -EPERM;
-
 	ret = __key_link_begin(keyring, &key->index_key, &edit);
 	if (ret == 0) {
 		kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage));
-		ret = __key_link_check_live_key(keyring, key);
+		ret = __key_link_check_restriction(keyring, key);
+		if (ret == 0)
+			ret = __key_link_check_live_key(keyring, key);
 		if (ret == 0)
 			__key_link(key, &edit);
 		__key_link_end(keyring, &key->index_key, edit);
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index c9fae5ea89fe..2ef45b319dd9 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -26,7 +26,7 @@ static int key_create_persistent_register(struct user_namespace *ns)
 					current_cred(),
 					((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 					 KEY_USR_VIEW | KEY_USR_READ),
-					KEY_ALLOC_NOT_IN_QUOTA, NULL);
+					KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
 
@@ -60,7 +60,7 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid,
 				   uid, INVALID_GID, current_cred(),
 				   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				    KEY_USR_VIEW | KEY_USR_READ),
-				   KEY_ALLOC_NOT_IN_QUOTA,
+				   KEY_ALLOC_NOT_IN_QUOTA, NULL,
 				   ns->persistent_keyring_register);
 	if (IS_ERR(persistent))
 		return ERR_CAST(persistent);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index e6d50172872f..40a885239782 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -76,7 +76,8 @@ int install_user_keyrings(void)
 		if (IS_ERR(uid_keyring)) {
 			uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID,
 						    cred, user_keyring_perm,
-						    KEY_ALLOC_IN_QUOTA, NULL);
+						    KEY_ALLOC_IN_QUOTA,
+						    NULL, NULL);
 			if (IS_ERR(uid_keyring)) {
 				ret = PTR_ERR(uid_keyring);
 				goto error;
@@ -92,7 +93,8 @@ int install_user_keyrings(void)
 			session_keyring =
 				keyring_alloc(buf, user->uid, INVALID_GID,
 					      cred, user_keyring_perm,
-					      KEY_ALLOC_IN_QUOTA, NULL);
+					      KEY_ALLOC_IN_QUOTA,
+					      NULL, NULL);
 			if (IS_ERR(session_keyring)) {
 				ret = PTR_ERR(session_keyring);
 				goto error_release;
@@ -134,7 +136,8 @@ int install_thread_keyring_to_cred(struct cred *new)
 
 	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
 				KEY_POS_ALL | KEY_USR_VIEW,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+				KEY_ALLOC_QUOTA_OVERRUN,
+				NULL, NULL);
 	if (IS_ERR(keyring))
 		return PTR_ERR(keyring);
 
@@ -180,7 +183,8 @@ int install_process_keyring_to_cred(struct cred *new)
 
 	keyring = keyring_alloc("_pid", new->uid, new->gid, new,
 				KEY_POS_ALL | KEY_USR_VIEW,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+				KEY_ALLOC_QUOTA_OVERRUN,
+				NULL, NULL);
 	if (IS_ERR(keyring))
 		return PTR_ERR(keyring);
 
@@ -231,7 +235,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 
 		keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred,
 					KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
-					flags, NULL);
+					flags, NULL, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	} else {
@@ -785,7 +789,7 @@ long join_session_keyring(const char *name)
 		keyring = keyring_alloc(
 			name, old->uid, old->gid, old,
 			KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK,
-			KEY_ALLOC_IN_QUOTA, NULL);
+			KEY_ALLOC_IN_QUOTA, NULL, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index c7a117c9a8f3..a29e3554751e 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -116,7 +116,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 	cred = get_current_cred();
 	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
 				KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+				KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
 	put_cred(cred);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -355,7 +355,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx,
 
 	key = key_alloc(ctx->index_key.type, ctx->index_key.description,
 			ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred,
-			perm, flags);
+			perm, flags, NULL);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 4f0f112fe276..9db8b4a82787 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -202,7 +202,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 	authkey = key_alloc(&key_type_request_key_auth, desc,
 			    cred->fsuid, cred->fsgid, cred,
 			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
-			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA);
+			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(authkey)) {
 		ret = PTR_ERR(authkey);
 		goto error_alloc;
-- 
cgit v1.2.3


From 77f68bac9481ad440f4f34dda3d28c2dce6eb87b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:26 +0100
Subject: KEYS: Remove KEY_FLAG_TRUSTED and KEY_ALLOC_TRUSTED

Remove KEY_FLAG_TRUSTED and KEY_ALLOC_TRUSTED as they're no longer
meaningful.  Also we can drop the trusted flag from the preparse structure.

Given this, we no longer need to pass the key flags through to
restrict_link().

Further, we can now get rid of keyring_restrict_trusted_only() also.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 certs/system_keyring.c        |  2 --
 include/keys/system_keyring.h |  1 -
 include/linux/key-type.h      |  1 -
 include/linux/key.h           | 21 +++++----------------
 security/integrity/digsig.c   |  3 +--
 security/keys/key.c           | 11 ++---------
 security/keys/keyring.c       | 29 +----------------------------
 7 files changed, 9 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 4e2fa8ab01d6..e460d00a7781 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -31,7 +31,6 @@ extern __initconst const unsigned long system_certificate_list_size;
  */
 int restrict_link_by_builtin_trusted(struct key *keyring,
 				     const struct key_type *type,
-				     unsigned long flags,
 				     const union key_payload *payload)
 {
 	return restrict_link_by_signature(system_trusted_keyring,
@@ -97,7 +96,6 @@ static __init int load_system_certificate_list(void)
 					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 					   KEY_USR_VIEW | KEY_USR_READ),
 					   KEY_ALLOC_NOT_IN_QUOTA |
-					   KEY_ALLOC_TRUSTED |
 					   KEY_ALLOC_BUILT_IN |
 					   KEY_ALLOC_BYPASS_RESTRICTION);
 		if (IS_ERR(key)) {
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 93715913a0b1..c72330ae76df 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -18,7 +18,6 @@
 
 extern int restrict_link_by_builtin_trusted(struct key *keyring,
 					    const struct key_type *type,
-					    unsigned long flags,
 					    const union key_payload *payload);
 
 #else
diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index 7463355a198b..eaee981c5558 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -45,7 +45,6 @@ struct key_preparsed_payload {
 	size_t		datalen;	/* Raw datalen */
 	size_t		quotalen;	/* Quota length for proposed payload */
 	time_t		expiry;		/* Expiry time of key */
-	bool		trusted;	/* True if key is trusted */
 };
 
 typedef int (*request_key_actor_t)(struct key_construction *key,
diff --git a/include/linux/key.h b/include/linux/key.h
index 83b603639d2e..722914798f37 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -173,10 +173,9 @@ struct key {
 #define KEY_FLAG_NEGATIVE	5	/* set if key is negative */
 #define KEY_FLAG_ROOT_CAN_CLEAR	6	/* set if key can be cleared by root without permission */
 #define KEY_FLAG_INVALIDATED	7	/* set if key has been invalidated */
-#define KEY_FLAG_TRUSTED	8	/* set if key is trusted */
-#define KEY_FLAG_BUILTIN	9	/* set if key is built in to the kernel */
-#define KEY_FLAG_ROOT_CAN_INVAL	10	/* set if key can be invalidated by root without permission */
-#define KEY_FLAG_KEEP		11	/* set if key should not be removed */
+#define KEY_FLAG_BUILTIN	8	/* set if key is built in to the kernel */
+#define KEY_FLAG_ROOT_CAN_INVAL	9	/* set if key can be invalidated by root without permission */
+#define KEY_FLAG_KEEP		10	/* set if key should not be removed */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -217,7 +216,6 @@ struct key {
 	 */
 	int (*restrict_link)(struct key *keyring,
 			     const struct key_type *type,
-			     unsigned long flags,
 			     const union key_payload *payload);
 };
 
@@ -229,16 +227,14 @@ extern struct key *key_alloc(struct key_type *type,
 			     unsigned long flags,
 			     int (*restrict_link)(struct key *,
 						  const struct key_type *,
-						  unsigned long,
 						  const union key_payload *));
 
 
 #define KEY_ALLOC_IN_QUOTA		0x0000	/* add to quota, reject if would overrun */
 #define KEY_ALLOC_QUOTA_OVERRUN		0x0001	/* add to quota, permit even if overrun */
 #define KEY_ALLOC_NOT_IN_QUOTA		0x0002	/* not in quota */
-#define KEY_ALLOC_TRUSTED		0x0004	/* Key should be flagged as trusted */
-#define KEY_ALLOC_BUILT_IN		0x0008	/* Key is built into kernel */
-#define KEY_ALLOC_BYPASS_RESTRICTION	0x0010	/* Override the check on restricted keyrings */
+#define KEY_ALLOC_BUILT_IN		0x0004	/* Key is built into kernel */
+#define KEY_ALLOC_BYPASS_RESTRICTION	0x0008	/* Override the check on restricted keyrings */
 
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
@@ -309,18 +305,11 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid
 				 unsigned long flags,
 				 int (*restrict_link)(struct key *,
 						      const struct key_type *,
-						      unsigned long,
 						      const union key_payload *),
 				 struct key *dest);
 
-extern int keyring_restrict_trusted_only(struct key *keyring,
-					 const struct key_type *type,
-					 unsigned long,
-					 const union key_payload *payload);
-
 extern int restrict_link_reject(struct key *keyring,
 				const struct key_type *type,
-				unsigned long flags,
 				const union key_payload *payload);
 
 extern int keyring_clear(struct key *keyring);
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index d647178c6bbd..98ee4c752cf5 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -51,12 +51,11 @@ static bool init_keyring __initdata;
  */
 static int restrict_link_by_ima_mok(struct key *keyring,
 				    const struct key_type *type,
-				    unsigned long flags,
 				    const union key_payload *payload)
 {
 	int ret;
 
-	ret = restrict_link_by_builtin_trusted(keyring, type, flags, payload);
+	ret = restrict_link_by_builtin_trusted(keyring, type, payload);
 	if (ret != -ENOKEY)
 		return ret;
 
diff --git a/security/keys/key.c b/security/keys/key.c
index deb881754e03..bd5a272f28a6 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -227,7 +227,6 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 		      key_perm_t perm, unsigned long flags,
 		      int (*restrict_link)(struct key *,
 					   const struct key_type *,
-					   unsigned long,
 					   const union key_payload *))
 {
 	struct key_user *user = NULL;
@@ -300,8 +299,6 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 
 	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA))
 		key->flags |= 1 << KEY_FLAG_IN_QUOTA;
-	if (flags & KEY_ALLOC_TRUSTED)
-		key->flags |= 1 << KEY_FLAG_TRUSTED;
 	if (flags & KEY_ALLOC_BUILT_IN)
 		key->flags |= 1 << KEY_FLAG_BUILTIN;
 
@@ -504,7 +501,7 @@ int key_instantiate_and_link(struct key *key,
 	if (keyring) {
 		if (keyring->restrict_link) {
 			ret = keyring->restrict_link(keyring, key->type,
-						     key->flags, &prep.payload);
+						     &prep.payload);
 			if (ret < 0)
 				goto error;
 		}
@@ -811,7 +808,6 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	int ret;
 	int (*restrict_link)(struct key *,
 			     const struct key_type *,
-			     unsigned long,
 			     const union key_payload *) = NULL;
 
 	/* look up the key type to see if it's one of the registered kernel
@@ -843,7 +839,6 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	prep.data = payload;
 	prep.datalen = plen;
 	prep.quotalen = index_key.type->def_datalen;
-	prep.trusted = flags & KEY_ALLOC_TRUSTED;
 	prep.expiry = TIME_T_MAX;
 	if (index_key.type->preparse) {
 		ret = index_key.type->preparse(&prep);
@@ -860,9 +855,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	index_key.desc_len = strlen(index_key.description);
 
 	if (restrict_link) {
-		unsigned long kflags = prep.trusted ? KEY_FLAG_TRUSTED : 0;
-		ret = restrict_link(keyring,
-				    index_key.type, kflags, &prep.payload);
+		ret = restrict_link(keyring, index_key.type, &prep.payload);
 		if (ret < 0) {
 			key_ref = ERR_PTR(ret);
 			goto error_free_prep;
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index d2d1f3378008..c91e4e0cea08 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -494,7 +494,6 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 			  unsigned long flags,
 			  int (*restrict_link)(struct key *,
 					       const struct key_type *,
-					       unsigned long,
 					       const union key_payload *),
 			  struct key *dest)
 {
@@ -515,34 +514,10 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 }
 EXPORT_SYMBOL(keyring_alloc);
 
-/**
- * keyring_restrict_trusted_only - Restrict additions to a keyring to trusted keys only
- * @keyring: The keyring being added to.
- * @type: The type of key being added.
- * @flags: The key flags.
- * @payload: The payload of the key intended to be added.
- *
- * Reject the addition of any links to a keyring that point to keys that aren't
- * marked as being trusted.  It can be overridden by passing
- * KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when adding a key
- * to a keyring.
- *
- * This is meant to be passed as the restrict_link parameter to
- * keyring_alloc().
- */
-int keyring_restrict_trusted_only(struct key *keyring,
-				  const struct key_type *type,
-				  unsigned long flags,
-				  const union key_payload *payload)
-{
-	return flags & KEY_FLAG_TRUSTED ? 0 : -EPERM;
-}
-
 /**
  * restrict_link_reject - Give -EPERM to restrict link
  * @keyring: The keyring being added to.
  * @type: The type of key being added.
- * @flags: The key flags.
  * @payload: The payload of the key intended to be added.
  *
  * Reject the addition of any links to a keyring.  It can be overridden by
@@ -554,7 +529,6 @@ int keyring_restrict_trusted_only(struct key *keyring,
  */
 int restrict_link_reject(struct key *keyring,
 			 const struct key_type *type,
-			 unsigned long flags,
 			 const union key_payload *payload)
 {
 	return -EPERM;
@@ -1248,8 +1222,7 @@ static int __key_link_check_restriction(struct key *keyring, struct key *key)
 {
 	if (!keyring->restrict_link)
 		return 0;
-	return keyring->restrict_link(keyring,
-				      key->type, key->flags, &key->payload);
+	return keyring->restrict_link(keyring, key->type, &key->payload);
 }
 
 /**
-- 
cgit v1.2.3


From 9a6f2b0113c8fce815db7c9d23754bdea4b428a0 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 11 Apr 2016 21:40:05 +0200
Subject: net: mdio: Fix lockdep falls positive splat

MDIO devices can be stacked upon each other. The current code supports
two levels, which until recently has been enough for a DSA mdio bus on
top of another bus. Now we have hardware which has an MDIO mux in the
middle.

Define an MDIO MUTEX class with three levels.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-mux.c | 10 ++--------
 drivers/net/phy/mdio_bus.c |  4 ++--
 include/linux/mdio.h       | 11 +++++++++++
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c
index 308ade0eb1b6..5c81d6faf304 100644
--- a/drivers/net/phy/mdio-mux.c
+++ b/drivers/net/phy/mdio-mux.c
@@ -45,13 +45,7 @@ static int mdio_mux_read(struct mii_bus *bus, int phy_id, int regnum)
 	struct mdio_mux_parent_bus *pb = cb->parent;
 	int r;
 
-	/* In theory multiple mdio_mux could be stacked, thus creating
-	 * more than a single level of nesting.  But in practice,
-	 * SINGLE_DEPTH_NESTING will cover the vast majority of use
-	 * cases.  We use it, instead of trying to handle the general
-	 * case.
-	 */
-	mutex_lock_nested(&pb->mii_bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&pb->mii_bus->mdio_lock, MDIO_MUTEX_MUX);
 	r = pb->switch_fn(pb->current_child, cb->bus_number, pb->switch_data);
 	if (r)
 		goto out;
@@ -76,7 +70,7 @@ static int mdio_mux_write(struct mii_bus *bus, int phy_id,
 
 	int r;
 
-	mutex_lock_nested(&pb->mii_bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&pb->mii_bus->mdio_lock, MDIO_MUTEX_MUX);
 	r = pb->switch_fn(pb->current_child, cb->bus_number, pb->switch_data);
 	if (r)
 		goto out;
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 0cba64f1ecf4..751202a285a6 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -457,7 +457,7 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum)
 
 	BUG_ON(in_interrupt());
 
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 	retval = bus->read(bus, addr, regnum);
 	mutex_unlock(&bus->mdio_lock);
 
@@ -509,7 +509,7 @@ int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val)
 
 	BUG_ON(in_interrupt());
 
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&bus->mdio_lock, MDIO_MUTEX_NESTED);
 	err = bus->write(bus, addr, regnum, val);
 	mutex_unlock(&bus->mdio_lock);
 
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 5bfd99d1a40a..bf9d1d750693 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -13,6 +13,17 @@
 
 struct mii_bus;
 
+/* Multiple levels of nesting are possible. However typically this is
+ * limited to nested DSA like layer, a MUX layer, and the normal
+ * user. Instead of trying to handle the general case, just define
+ * these cases.
+ */
+enum mdio_mutex_lock_class {
+	MDIO_MUTEX_NORMAL,
+	MDIO_MUTEX_MUX,
+	MDIO_MUTEX_NESTED,
+};
+
 struct mdio_device {
 	struct device dev;
 
-- 
cgit v1.2.3


From 757d12e5849be549076901b0d33c60d5f360269c Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Tue, 12 Apr 2016 21:07:06 +0530
Subject: dmaengine: ensure dmaengine helpers check valid callback

dmaengine has various device callbacks and exposes helper
functions to invoke these. These helpers should check if channel,
device and callback is valid or not before invoking them.

Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 017433712833..30de0197263a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -804,6 +804,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single(
 	sg_dma_address(&sg) = buf;
 	sg_dma_len(&sg) = len;
 
+	if (!chan || !chan->device || !chan->device->device_prep_slave_sg)
+		return NULL;
+
 	return chan->device->device_prep_slave_sg(chan, &sg, 1,
 						  dir, flags, NULL);
 }
@@ -812,6 +815,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_sg(
 	struct dma_chan *chan, struct scatterlist *sgl,	unsigned int sg_len,
 	enum dma_transfer_direction dir, unsigned long flags)
 {
+	if (!chan || !chan->device || !chan->device->device_prep_slave_sg)
+		return NULL;
+
 	return chan->device->device_prep_slave_sg(chan, sgl, sg_len,
 						  dir, flags, NULL);
 }
@@ -823,6 +829,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_rio_sg(
 	enum dma_transfer_direction dir, unsigned long flags,
 	struct rio_dma_ext *rio_ext)
 {
+	if (!chan || !chan->device || !chan->device->device_prep_slave_sg)
+		return NULL;
+
 	return chan->device->device_prep_slave_sg(chan, sgl, sg_len,
 						  dir, flags, rio_ext);
 }
@@ -833,6 +842,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
 		size_t period_len, enum dma_transfer_direction dir,
 		unsigned long flags)
 {
+	if (!chan || !chan->device || !chan->device->device_prep_dma_cyclic)
+		return NULL;
+
 	return chan->device->device_prep_dma_cyclic(chan, buf_addr, buf_len,
 						period_len, dir, flags);
 }
@@ -841,6 +853,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
 		struct dma_chan *chan, struct dma_interleaved_template *xt,
 		unsigned long flags)
 {
+	if (!chan || !chan->device || !chan->device->device_prep_interleaved_dma)
+		return NULL;
+
 	return chan->device->device_prep_interleaved_dma(chan, xt, flags);
 }
 
@@ -848,7 +863,7 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags)
 {
-	if (!chan || !chan->device)
+	if (!chan || !chan->device || !chan->device->device_prep_dma_memset)
 		return NULL;
 
 	return chan->device->device_prep_dma_memset(chan, dest, value,
@@ -861,6 +876,9 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
 		struct scatterlist *src_sg, unsigned int src_nents,
 		unsigned long flags)
 {
+	if (!chan || !chan->device || !chan->device->device_prep_dma_sg)
+		return NULL;
+
 	return chan->device->device_prep_dma_sg(chan, dst_sg, dst_nents,
 			src_sg, src_nents, flags);
 }
-- 
cgit v1.2.3


From 37e58237a16b94fcd2c2d1b7e9c6e1ca661c231b Mon Sep 17 00:00:00 2001
From: Ming Lin <ming.l@ssi.samsung.com>
Date: Tue, 22 Mar 2016 00:24:44 -0700
Subject: block: add offset in blk_add_request_payload()

We could kmalloc() the payload, so need the offset in page.

Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c         | 5 +++--
 drivers/block/skd_main.c | 2 +-
 drivers/scsi/sd.c        | 2 +-
 include/linux/blkdev.h   | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index b60537b2c35b..c50227796a26 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1523,6 +1523,7 @@ EXPORT_SYMBOL(blk_put_request);
  * blk_add_request_payload - add a payload to a request
  * @rq: request to update
  * @page: page backing the payload
+ * @offset: offset in page
  * @len: length of the payload.
  *
  * This allows to later add a payload to an already submitted request by
@@ -1533,12 +1534,12 @@ EXPORT_SYMBOL(blk_put_request);
  * discard requests should ever use it.
  */
 void blk_add_request_payload(struct request *rq, struct page *page,
-		unsigned int len)
+		int offset, unsigned int len)
 {
 	struct bio *bio = rq->bio;
 
 	bio->bi_io_vec->bv_page = page;
-	bio->bi_io_vec->bv_offset = 0;
+	bio->bi_io_vec->bv_offset = offset;
 	bio->bi_io_vec->bv_len = len;
 
 	bio->bi_iter.bi_size = len;
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 586f9168ffa4..9a9ec212fab8 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -562,7 +562,7 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
 	put_unaligned_be32(count, &buf[16]);
 
 	req = skreq->req;
-	blk_add_request_payload(req, page, len);
+	blk_add_request_payload(req, page, 0, len);
 }
 
 static void skd_request_fn_not_online(struct request_queue *q);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index f52b74cf8d1e..69b0a4a7a15f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -779,7 +779,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 	 * discarded on disk. This allows us to report completion on the full
 	 * amount of blocks described by the request.
 	 */
-	blk_add_request_payload(rq, page, len);
+	blk_add_request_payload(rq, page, 0, len);
 	ret = scsi_init_io(cmd);
 	rq->__data_len = nr_bytes;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 669e419d6234..bbaa76757018 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -779,7 +779,7 @@ extern struct request *blk_make_request(struct request_queue *, struct bio *,
 extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern void blk_add_request_payload(struct request *rq, struct page *page,
-		unsigned int len);
+		int offset, unsigned int len);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
-- 
cgit v1.2.3


From e0489487ec9cd79ee1fa0dc5d3789c08b0e51a2c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 10 Mar 2016 13:58:46 +0200
Subject: blk-mq: Export tagset iter function

Its useful to iterate on all the active tags in cases
where we will need to fail all the queues IO.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
[hch: carefully check for valid tagsets]
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 12 ++++++++++++
 include/linux/blk-mq.h |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index abdbb47405cb..2fd04286f103 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -474,6 +474,18 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 }
 EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
 
+void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
+		busy_tag_iter_fn *fn, void *priv)
+{
+	int i;
+
+	for (i = 0; i < tagset->nr_hw_queues; i++) {
+		if (tagset->tags && tagset->tags[i])
+			blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+	}
+}
+EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9ac9799b702b..c808fec1ce44 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -240,6 +240,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv);
+void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
+		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
-- 
cgit v1.2.3


From e8f1e1630b0a98685d1a3521e8aba0dc7e68082c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 10 Mar 2016 13:58:49 +0200
Subject: blk-mq: Make blk_mq_all_tag_busy_iter static

No caller outside the blk-mq code so we can settle
with it static.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 5 ++---
 include/linux/blk-mq.h | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2fd04286f103..56a0c37a3d06 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -464,15 +464,14 @@ static void bt_tags_for_each(struct blk_mq_tags *tags,
 	}
 }
 
-void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
-		void *priv)
+static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
 		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
 	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
 			false);
 }
-EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
 
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c808fec1ce44..2498fdf3a503 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -238,8 +238,6 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
-		void *priv);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
-- 
cgit v1.2.3


From 9fd4dcece43a53e5a9e65a973df5693702ee6401 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Tue, 22 Mar 2016 14:11:13 +0100
Subject: debugfs: prevent access to possibly dead file_operations at file open

Nothing prevents a dentry found by path lookup before a return of
__debugfs_remove() to actually get opened after that return. Now, after
the return of __debugfs_remove(), there are no guarantees whatsoever
regarding the memory the corresponding inode's file_operations object
had been kept in.

Since __debugfs_remove() is seldomly invoked, usually from module exit
handlers only, the race is hard to trigger and the impact is very low.

A discussion of the problem outlined above as well as a suggested
solution can be found in the (sub-)thread rooted at

  http://lkml.kernel.org/g/20130401203445.GA20862@ZenIV.linux.org.uk
  ("Yet another pipe related oops.")

Basically, Greg KH suggests to introduce an intermediate fops and
Al Viro points out that a pointer to the original ones may be stored in
->d_fsdata.

Follow this line of reasoning:
- Add SRCU as a reverse dependency of DEBUG_FS.
- Introduce a srcu_struct object for the debugfs subsystem.
- In debugfs_create_file(), store a pointer to the original
  file_operations object in ->d_fsdata.
- Make debugfs_remove() and debugfs_remove_recursive() wait for a
  SRCU grace period after the dentry has been delete()'d and before they
  return to their callers.
- Introduce an intermediate file_operations object named
  "debugfs_open_proxy_file_operations". It's ->open() functions checks,
  under the protection of a SRCU read lock, whether the dentry is still
  alive, i.e. has not been d_delete()'d and if so, tries to acquire a
  reference on the owning module.
  On success, it sets the file object's ->f_op to the original
  file_operations and forwards the ongoing open() call to the original
  ->open().
- For clarity, rename the former debugfs_file_operations to
  debugfs_noop_file_operations -- they are in no way canonical.

The choice of SRCU over "normal" RCU is justified by the fact, that the
former may also be used to protect ->i_private data from going away
during the execution of a file's readers and writers which may (and do)
sleep.

Finally, introduce the fs/debugfs/internal.h header containing some
declarations internal to the debugfs implementation.

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 91 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/debugfs/inode.c      | 13 ++++++-
 fs/debugfs/internal.h   | 24 +++++++++++++
 include/linux/debugfs.h |  3 --
 lib/Kconfig.debug       |  1 +
 5 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 fs/debugfs/internal.h

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index d2ba12e23ed9..736ab3c988f2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -22,6 +22,9 @@
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <linux/device.h>
+#include <linux/srcu.h>
+
+#include "internal.h"
 
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -35,13 +38,99 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
 	return count;
 }
 
-const struct file_operations debugfs_file_operations = {
+const struct file_operations debugfs_noop_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
 	.open =		simple_open,
 	.llseek =	noop_llseek,
 };
 
+/**
+ * debugfs_use_file_start - mark the beginning of file data access
+ * @dentry: the dentry object whose data is being accessed.
+ * @srcu_idx: a pointer to some memory to store a SRCU index in.
+ *
+ * Up to a matching call to debugfs_use_file_finish(), any
+ * successive call into the file removing functions debugfs_remove()
+ * and debugfs_remove_recursive() will block. Since associated private
+ * file data may only get freed after a successful return of any of
+ * the removal functions, you may safely access it after a successful
+ * call to debugfs_use_file_start() without worrying about
+ * lifetime issues.
+ *
+ * If -%EIO is returned, the file has already been removed and thus,
+ * it is not safe to access any of its data. If, on the other hand,
+ * it is allowed to access the file data, zero is returned.
+ *
+ * Regardless of the return code, any call to
+ * debugfs_use_file_start() must be followed by a matching call
+ * to debugfs_use_file_finish().
+ */
+static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
+	__acquires(&debugfs_srcu)
+{
+	*srcu_idx = srcu_read_lock(&debugfs_srcu);
+	barrier();
+	if (d_unlinked(dentry))
+		return -EIO;
+	return 0;
+}
+
+/**
+ * debugfs_use_file_finish - mark the end of file data access
+ * @srcu_idx: the SRCU index "created" by a former call to
+ *            debugfs_use_file_start().
+ *
+ * Allow any ongoing concurrent call into debugfs_remove() or
+ * debugfs_remove_recursive() blocked by a former call to
+ * debugfs_use_file_start() to proceed and return to its caller.
+ */
+static void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
+{
+	srcu_read_unlock(&debugfs_srcu, srcu_idx);
+}
+
+#define F_DENTRY(filp) ((filp)->f_path.dentry)
+
+#define REAL_FOPS_DEREF(dentry)					\
+	((const struct file_operations *)(dentry)->d_fsdata)
+
+static int open_proxy_open(struct inode *inode, struct file *filp)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = NULL;
+	int srcu_idx, r;
+
+	r = debugfs_use_file_start(dentry, &srcu_idx);
+	if (r) {
+		r = -ENOENT;
+		goto out;
+	}
+
+	real_fops = REAL_FOPS_DEREF(dentry);
+	real_fops = fops_get(real_fops);
+	if (!real_fops) {
+		/* Huh? Module did not clean up after itself at exit? */
+		WARN(1, "debugfs file owner did not clean up at exit: %pd",
+			dentry);
+		r = -ENXIO;
+		goto out;
+	}
+	replace_fops(filp, real_fops);
+
+	if (real_fops->open)
+		r = real_fops->open(inode, filp);
+
+out:
+	fops_put(real_fops);
+	debugfs_use_file_finish(srcu_idx);
+	return r;
+}
+
+const struct file_operations debugfs_open_proxy_file_operations = {
+	.open = open_proxy_open,
+};
+
 static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
 					  struct dentry *parent, void *value,
 				          const struct file_operations *fops,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b1e7f35f3cd4..2905dd160575 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,9 +27,14 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/srcu.h>
+
+#include "internal.h"
 
 #define DEBUGFS_DEFAULT_MODE	0700
 
+DEFINE_SRCU(debugfs_srcu);
+
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
@@ -341,8 +346,12 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 		return failed_creating(dentry);
 
 	inode->i_mode = mode;
-	inode->i_fop = fops ? fops : &debugfs_file_operations;
 	inode->i_private = data;
+
+	inode->i_fop = fops ? &debugfs_open_proxy_file_operations
+		: &debugfs_noop_file_operations;
+	dentry->d_fsdata = (void *)fops;
+
 	d_instantiate(dentry, inode);
 	fsnotify_create(d_inode(dentry->d_parent), dentry);
 	return end_creating(dentry);
@@ -570,6 +579,7 @@ void debugfs_remove(struct dentry *dentry)
 	inode_unlock(d_inode(parent));
 	if (!ret)
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
@@ -647,6 +657,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	if (!__debugfs_remove(child, parent))
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	inode_unlock(d_inode(parent));
+	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
 
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
new file mode 100644
index 000000000000..c7aaa5cb6685
--- /dev/null
+++ b/fs/debugfs/internal.h
@@ -0,0 +1,24 @@
+/*
+ *  internal.h - declarations internal to debugfs
+ *
+ *  Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ *
+ */
+
+#ifndef _DEBUGFS_INTERNAL_H_
+#define _DEBUGFS_INTERNAL_H_
+
+struct file_operations;
+struct srcu_struct;
+
+/* declared over in file.c */
+extern const struct file_operations debugfs_noop_file_operations;
+extern const struct file_operations debugfs_open_proxy_file_operations;
+
+extern struct srcu_struct debugfs_srcu;
+
+#endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 981e53ab84e8..fcafe2d389f9 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -43,9 +43,6 @@ extern struct dentry *arch_debugfs_dir;
 
 #if defined(CONFIG_DEBUG_FS)
 
-/* declared over in file.c */
-extern const struct file_operations debugfs_file_operations;
-
 struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1e9a607534ca..ddb0e8337aae 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -257,6 +257,7 @@ config PAGE_OWNER
 
 config DEBUG_FS
 	bool "Debug Filesystem"
+	select SRCU
 	help
 	  debugfs is a virtual file system that kernel developers use to put
 	  debugging files into.  Enable this option to be able to read and
-- 
cgit v1.2.3


From 49d200deaa680501f19a247b1fffb29301e51d2b Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Tue, 22 Mar 2016 14:11:14 +0100
Subject: debugfs: prevent access to removed files' private data

Upon return of debugfs_remove()/debugfs_remove_recursive(), it might
still be attempted to access associated private file data through
previously opened struct file objects. If that data has been freed by
the caller of debugfs_remove*() in the meanwhile, the reading/writing
process would either encounter a fault or, if the memory address in
question has been reassigned again, unrelated data structures could get
overwritten.

However, since debugfs files are seldomly removed, usually from module
exit handlers only, the impact is very low.

Currently, there are ~1000 call sites of debugfs_create_file() spread
throughout the whole tree and touching all of those struct file_operations
in order to make them file removal aware by means of checking the result of
debugfs_use_file_start() from within their methods is unfeasible.

Instead, wrap the struct file_operations by a lifetime managing proxy at
file open:
- In debugfs_create_file(), the original fops handed in has got stashed
  away in ->d_fsdata already.
- In debugfs_create_file(), install a proxy file_operations factory,
  debugfs_full_proxy_file_operations, at ->i_fop.

This proxy factory has got an ->open() method only. It carries out some
lifetime checks and if successful, dynamically allocates and sets up a new
struct file_operations proxy at ->f_op. Afterwards, it forwards to the
->open() of the original struct file_operations in ->d_fsdata, if any.

The dynamically set up proxy at ->f_op has got a lifetime managing wrapper
set for each of the methods defined in the original struct file_operations
in ->d_fsdata.

Its ->release()er frees the proxy again and forwards to the original
->release(), if any.

In order not to mislead the VFS layer, it is strictly necessary to leave
those fields blank in the proxy that have been NULL in the original
struct file_operations also, i.e. aren't supported. This is why there is a
need for dynamically allocated proxies. The choice made not to allocate a
proxy instance for every dentry at file creation, but for every
struct file object instantiated thereof is justified by the expected usage
pattern of debugfs, namely that in general very few files get opened more
than once at a time.

The wrapper methods set in the struct file_operations implement lifetime
managing by means of the SRCU protection facilities already in place for
debugfs:
They set up a SRCU read side critical section and check whether the dentry
is still alive by means of debugfs_use_file_start(). If so, they forward
the call to the original struct file_operation stored in ->d_fsdata, still
under the protection of the SRCU read side critical section.
This SRCU read side critical section prevents any pending debugfs_remove()
and friends to return to their callers. Since a file's private data must
only be freed after the return of debugfs_remove(), the ongoing proxied
call is guarded against any file removal race.

If, on the other hand, the initial call to debugfs_use_file_start() detects
that the dentry is dead, the wrapper simply returns -EIO and does not
forward the call. Note that the ->poll() wrapper is special in that its
signature does not allow for the return of arbitrary -EXXX values and thus,
POLLHUP is returned here.

In order not to pollute debugfs with wrapper definitions that aren't ever
needed, I chose not to define a wrapper for every struct file_operations
method possible. Instead, a wrapper is defined only for the subset of
methods which are actually set by any debugfs users.
Currently, these are:

  ->llseek()
  ->read()
  ->write()
  ->unlocked_ioctl()
  ->poll()

The ->release() wrapper is special in that it does not protect the original
->release() in any way from dead files in order not to leak resources.
Thus, any ->release() handed to debugfs must implement file lifetime
management manually, if needed.
For only 33 out of a total of 434 releasers handed in to debugfs, it could
not be verified immediately whether they access data structures that might
have been freed upon a debugfs_remove() return in the meanwhile.

Export debugfs_use_file_start() and debugfs_use_file_finish() in order to
allow any ->release() to manually implement file lifetime management.

For a set of common cases of struct file_operations implemented by the
debugfs_core itself, future patches will incorporate file lifetime
management directly within those in order to allow for their unproxied
operation. Rename the original, non-proxying "debugfs_create_file()" to
"debugfs_create_file_unsafe()" and keep it for future internal use by
debugfs itself. Factor out code common to both into the new
__debugfs_create_file().

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 157 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/debugfs/inode.c      |  70 ++++++++++++++-------
 fs/debugfs/internal.h   |   6 +-
 include/linux/debugfs.h |  20 ++++++
 4 files changed, 226 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 736ab3c988f2..6eb58a8ed03c 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -23,9 +23,12 @@
 #include <linux/atomic.h>
 #include <linux/device.h>
 #include <linux/srcu.h>
+#include <asm/poll.h>
 
 #include "internal.h"
 
+struct poll_table_struct;
+
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
 {
@@ -66,7 +69,7 @@ const struct file_operations debugfs_noop_file_operations = {
  * debugfs_use_file_start() must be followed by a matching call
  * to debugfs_use_file_finish().
  */
-static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
+int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
 	__acquires(&debugfs_srcu)
 {
 	*srcu_idx = srcu_read_lock(&debugfs_srcu);
@@ -75,6 +78,7 @@ static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
 		return -EIO;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(debugfs_use_file_start);
 
 /**
  * debugfs_use_file_finish - mark the end of file data access
@@ -85,10 +89,11 @@ static int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
  * debugfs_remove_recursive() blocked by a former call to
  * debugfs_use_file_start() to proceed and return to its caller.
  */
-static void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
+void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
 {
 	srcu_read_unlock(&debugfs_srcu, srcu_idx);
 }
+EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
 
 #define F_DENTRY(filp) ((filp)->f_path.dentry)
 
@@ -131,6 +136,154 @@ const struct file_operations debugfs_open_proxy_file_operations = {
 	.open = open_proxy_open,
 };
 
+#define PROTO(args...) args
+#define ARGS(args...) args
+
+#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args)		\
+static ret_type full_proxy_ ## name(proto)				\
+{									\
+	const struct dentry *dentry = F_DENTRY(filp);			\
+	const struct file_operations *real_fops =			\
+		REAL_FOPS_DEREF(dentry);				\
+	int srcu_idx;							\
+	ret_type r;							\
+									\
+	r = debugfs_use_file_start(dentry, &srcu_idx);			\
+	if (likely(!r))						\
+		r = real_fops->name(args);				\
+	debugfs_use_file_finish(srcu_idx);				\
+	return r;							\
+}
+
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+		PROTO(struct file *filp, loff_t offset, int whence),
+		ARGS(filp, offset, whence));
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+		PROTO(struct file *filp, char __user *buf, size_t size,
+			loff_t *ppos),
+		ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+		PROTO(struct file *filp, const char __user *buf, size_t size,
+			loff_t *ppos),
+		ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
+		PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
+		ARGS(filp, cmd, arg));
+
+static unsigned int full_proxy_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	int srcu_idx;
+	unsigned int r = 0;
+
+	if (debugfs_use_file_start(dentry, &srcu_idx)) {
+		debugfs_use_file_finish(srcu_idx);
+		return POLLHUP;
+	}
+
+	r = real_fops->poll(filp, wait);
+	debugfs_use_file_finish(srcu_idx);
+	return r;
+}
+
+static int full_proxy_release(struct inode *inode, struct file *filp)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	const struct file_operations *proxy_fops = filp->f_op;
+	int r = 0;
+
+	/*
+	 * We must not protect this against removal races here: the
+	 * original releaser should be called unconditionally in order
+	 * not to leak any resources. Releasers must not assume that
+	 * ->i_private is still being meaningful here.
+	 */
+	if (real_fops->release)
+		r = real_fops->release(inode, filp);
+
+	replace_fops(filp, d_inode(dentry)->i_fop);
+	kfree((void *)proxy_fops);
+	fops_put(real_fops);
+	return 0;
+}
+
+static void __full_proxy_fops_init(struct file_operations *proxy_fops,
+				const struct file_operations *real_fops)
+{
+	proxy_fops->release = full_proxy_release;
+	if (real_fops->llseek)
+		proxy_fops->llseek = full_proxy_llseek;
+	if (real_fops->read)
+		proxy_fops->read = full_proxy_read;
+	if (real_fops->write)
+		proxy_fops->write = full_proxy_write;
+	if (real_fops->poll)
+		proxy_fops->poll = full_proxy_poll;
+	if (real_fops->unlocked_ioctl)
+		proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
+}
+
+static int full_proxy_open(struct inode *inode, struct file *filp)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = NULL;
+	struct file_operations *proxy_fops = NULL;
+	int srcu_idx, r;
+
+	r = debugfs_use_file_start(dentry, &srcu_idx);
+	if (r) {
+		r = -ENOENT;
+		goto out;
+	}
+
+	real_fops = REAL_FOPS_DEREF(dentry);
+	real_fops = fops_get(real_fops);
+	if (!real_fops) {
+		/* Huh? Module did not cleanup after itself at exit? */
+		WARN(1, "debugfs file owner did not clean up at exit: %pd",
+			dentry);
+		r = -ENXIO;
+		goto out;
+	}
+
+	proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL);
+	if (!proxy_fops) {
+		r = -ENOMEM;
+		goto free_proxy;
+	}
+	__full_proxy_fops_init(proxy_fops, real_fops);
+	replace_fops(filp, proxy_fops);
+
+	if (real_fops->open) {
+		r = real_fops->open(inode, filp);
+
+		if (filp->f_op != proxy_fops) {
+			/* No protection against file removal anymore. */
+			WARN(1, "debugfs file owner replaced proxy fops: %pd",
+				dentry);
+			goto free_proxy;
+		}
+	}
+
+	goto out;
+free_proxy:
+	kfree(proxy_fops);
+	fops_put(real_fops);
+out:
+	debugfs_use_file_finish(srcu_idx);
+	return r;
+}
+
+const struct file_operations debugfs_full_proxy_file_operations = {
+	.open = full_proxy_open,
+};
+
 static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
 					  struct dentry *parent, void *value,
 				          const struct file_operations *fops,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 2905dd160575..136f269f01de 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -300,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry)
 	return dentry;
 }
 
+static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
+				struct dentry *parent, void *data,
+				const struct file_operations *proxy_fops,
+				const struct file_operations *real_fops)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_private = data;
+
+	inode->i_fop = proxy_fops;
+	dentry->d_fsdata = (void *)real_fops;
+
+	d_instantiate(dentry, inode);
+	fsnotify_create(d_inode(dentry->d_parent), dentry);
+	return end_creating(dentry);
+}
+
 /**
  * debugfs_create_file - create a file in the debugfs filesystem
  * @name: a pointer to a string containing the name of the file to create.
@@ -330,33 +361,24 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-
-	if (!(mode & S_IFMT))
-		mode |= S_IFREG;
-	BUG_ON(!S_ISREG(mode));
-	dentry = start_creating(name, parent);
-
-	if (IS_ERR(dentry))
-		return NULL;
-
-	inode = debugfs_get_inode(dentry->d_sb);
-	if (unlikely(!inode))
-		return failed_creating(dentry);
 
-	inode->i_mode = mode;
-	inode->i_private = data;
+	return __debugfs_create_file(name, mode, parent, data,
+				fops ? &debugfs_full_proxy_file_operations :
+					&debugfs_noop_file_operations,
+				fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file);
 
-	inode->i_fop = fops ? &debugfs_open_proxy_file_operations
-		: &debugfs_noop_file_operations;
-	dentry->d_fsdata = (void *)fops;
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops)
+{
 
-	d_instantiate(dentry, inode);
-	fsnotify_create(d_inode(dentry->d_parent), dentry);
-	return end_creating(dentry);
+	return __debugfs_create_file(name, mode, parent, data,
+				fops ? &debugfs_open_proxy_file_operations :
+					&debugfs_noop_file_operations,
+				fops);
 }
-EXPORT_SYMBOL_GPL(debugfs_create_file);
 
 /**
  * debugfs_create_file_size - create a file in the debugfs filesystem
@@ -579,6 +601,7 @@ void debugfs_remove(struct dentry *dentry)
 	inode_unlock(d_inode(parent));
 	if (!ret)
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+
 	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
@@ -657,6 +680,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	if (!__debugfs_remove(child, parent))
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	inode_unlock(d_inode(parent));
+
 	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index c7aaa5cb6685..bba52634b995 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -13,12 +13,14 @@
 #define _DEBUGFS_INTERNAL_H_
 
 struct file_operations;
-struct srcu_struct;
 
 /* declared over in file.c */
 extern const struct file_operations debugfs_noop_file_operations;
 extern const struct file_operations debugfs_open_proxy_file_operations;
+extern const struct file_operations debugfs_full_proxy_file_operations;
 
-extern struct srcu_struct debugfs_srcu;
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops);
 
 #endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index fcafe2d389f9..a63e6ea3321c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -19,9 +19,11 @@
 #include <linux/seq_file.h>
 
 #include <linux/types.h>
+#include <linux/compiler.h>
 
 struct device;
 struct file_operations;
+struct srcu_struct;
 
 struct debugfs_blob_wrapper {
 	void *data;
@@ -41,6 +43,8 @@ struct debugfs_regset32 {
 
 extern struct dentry *arch_debugfs_dir;
 
+extern struct srcu_struct debugfs_srcu;
+
 #if defined(CONFIG_DEBUG_FS)
 
 struct dentry *debugfs_create_file(const char *name, umode_t mode,
@@ -65,6 +69,11 @@ struct dentry *debugfs_create_automount(const char *name,
 void debugfs_remove(struct dentry *dentry);
 void debugfs_remove_recursive(struct dentry *dentry);
 
+int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
+	__acquires(&debugfs_srcu);
+
+void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu);
+
 struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, const char *new_name);
 
@@ -173,6 +182,17 @@ static inline void debugfs_remove(struct dentry *dentry)
 static inline void debugfs_remove_recursive(struct dentry *dentry)
 { }
 
+static inline int debugfs_use_file_start(const struct dentry *dentry,
+					int *srcu_idx)
+	__acquires(&debugfs_srcu)
+{
+	return 0;
+}
+
+static inline void debugfs_use_file_finish(int srcu_idx)
+	__releases(&debugfs_srcu)
+{ }
+
 static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, char *new_name)
 {
-- 
cgit v1.2.3


From c64688081490321f2d23a292ef24e60bb321f3f1 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Tue, 22 Mar 2016 14:11:15 +0100
Subject: debugfs: add support for self-protecting attribute file fops

In order to protect them against file removal issues, debugfs_create_file()
creates a lifetime managing proxy around each struct file_operations
handed in.

In cases where this struct file_operations is able to manage file lifetime
by itself already, the proxy created by debugfs is a waste of resources.

The most common class of struct file_operations given to debugfs are those
defined by means of the DEFINE_SIMPLE_ATTRIBUTE() macro.

Introduce a DEFINE_DEBUGFS_ATTRIBUTE() macro to allow any
struct file_operations of this class to be easily made file lifetime aware
and thus, to be operated unproxied.

Specifically, introduce debugfs_attr_read() and debugfs_attr_write()
which wrap simple_attr_read() and simple_attr_write() under the protection
of a debugfs_use_file_start()/debugfs_use_file_finish() pair.

Make DEFINE_DEBUGFS_ATTRIBUTE() set the defined struct file_operations'
->read() and ->write() members to these wrappers.

Export debugfs_create_file_unsafe() in order to allow debugfs users to
create their files in non-proxying operation mode.

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 28 ++++++++++++++++++++++++++++
 fs/debugfs/inode.c      | 28 ++++++++++++++++++++++++++++
 include/linux/debugfs.h | 26 ++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6eb58a8ed03c..8ef56d9499a4 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -284,6 +284,34 @@ const struct file_operations debugfs_full_proxy_file_operations = {
 	.open = full_proxy_open,
 };
 
+ssize_t debugfs_attr_read(struct file *file, char __user *buf,
+			size_t len, loff_t *ppos)
+{
+	ssize_t ret;
+	int srcu_idx;
+
+	ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+	if (likely(!ret))
+		ret = simple_attr_read(file, buf, len, ppos);
+	debugfs_use_file_finish(srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_read);
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+			 size_t len, loff_t *ppos)
+{
+	ssize_t ret;
+	int srcu_idx;
+
+	ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+	if (likely(!ret))
+		ret = simple_attr_write(file, buf, len, ppos);
+	debugfs_use_file_finish(srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write);
+
 static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
 					  struct dentry *parent, void *value,
 				          const struct file_operations *fops,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 136f269f01de..41e079a8da26 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -369,6 +369,33 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_file);
 
+/**
+ * debugfs_create_file_unsafe - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  The inode.i_private pointer will point to this value on
+ *        the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.
+ *
+ * debugfs_create_file_unsafe() is completely analogous to
+ * debugfs_create_file(), the only difference being that the fops
+ * handed it will not get protected against file removals by the
+ * debugfs core.
+ *
+ * It is your responsibility to protect your struct file_operation
+ * methods against file removals by means of debugfs_use_file_start()
+ * and debugfs_use_file_finish(). ->open() is still protected by
+ * debugfs though.
+ *
+ * Any struct file_operations defined by means of
+ * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and
+ * thus, may be used here.
+ */
 struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
@@ -379,6 +406,7 @@ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
 					&debugfs_noop_file_operations,
 				fops);
 }
+EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
 
 /**
  * debugfs_create_file_size - create a file in the debugfs filesystem
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index a63e6ea3321c..1438e2322d5c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -50,6 +50,9 @@ extern struct srcu_struct debugfs_srcu;
 struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops);
 
 struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
 					struct dentry *parent, void *data,
@@ -74,6 +77,26 @@ int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
 
 void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu);
 
+ssize_t debugfs_attr_read(struct file *file, char __user *buf,
+			size_t len, loff_t *ppos);
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *ppos);
+
+#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt)		\
+static int __fops ## _open(struct inode *inode, struct file *file)	\
+{									\
+	__simple_attr_check_format(__fmt, 0ull);			\
+	return simple_attr_open(inode, file, __get, __set, __fmt);	\
+}									\
+static const struct file_operations __fops = {				\
+	.owner	 = THIS_MODULE,					\
+	.open	 = __fops ## _open,					\
+	.release = simple_attr_release,				\
+	.read	 = debugfs_attr_read,					\
+	.write	 = debugfs_attr_write,					\
+	.llseek  = generic_file_llseek,				\
+}
+
 struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, const char *new_name);
 
@@ -193,6 +216,9 @@ static inline void debugfs_use_file_finish(int srcu_idx)
 	__releases(&debugfs_srcu)
 { }
 
+#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt)	\
+	static const struct file_operations __fops = { 0 }
+
 static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, char *new_name)
 {
-- 
cgit v1.2.3


From 93e9d8e836cb1a9a58b33eb6643bf061c6119ef2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 12 Apr 2016 12:32:46 -0600
Subject: block: add ability to flag write back caching on a device

Add an internal helper and flag for setting whether a queue has
write back caching, or write through (or none). Add a sysfs file
to show this as well, and make it changeable from user space.

This will replace the (awkward) blk_queue_flush() interface that
drivers currently use to inform the block layer of write cache state
and capabilities.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/block/queue-sysfs.txt |  9 +++++++++
 block/blk-settings.c                | 26 +++++++++++++++++++++++++
 block/blk-sysfs.c                   | 39 +++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h              |  3 +++
 4 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index e5d914845be6..dce25d848d92 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -141,6 +141,15 @@ control of this block device to that new IO scheduler. Note that writing
 an IO scheduler name to this file will attempt to load that IO scheduler
 module, if it isn't already present in the system.
 
+write_cache (RW)
+----------------
+When read, this file will display whether the device has write back
+caching enabled or not. It will return "write back" for the former
+case, and "write through" for the latter. Writing to this file can
+change the kernels view of the device, but it doesn't alter the
+device state. This means that it might not be safe to toggle the
+setting from "write back" to "write through", since that will also
+eliminate cache flushes issued by the kernel.
 
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 331e4eee0dda..c903bee43cf8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -846,6 +846,32 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
+/**
+ * blk_queue_write_cache - configure queue's write cache
+ * @q:		the request queue for the device
+ * @wc:		write back cache on or off
+ * @fua:	device supports FUA writes, if true
+ *
+ * Tell the block layer about the write cache of @q.
+ */
+void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
+{
+	spin_lock_irq(q->queue_lock);
+	if (wc) {
+		queue_flag_set(QUEUE_FLAG_WC, q);
+		q->flush_flags = REQ_FLUSH;
+	} else
+		queue_flag_clear(QUEUE_FLAG_WC, q);
+	if (fua) {
+		if (wc)
+			q->flush_flags |= REQ_FUA;
+		queue_flag_set(QUEUE_FLAG_FUA, q);
+	} else
+		queue_flag_clear(QUEUE_FLAG_FUA, q);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_write_cache);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 995b58d46ed1..99205965f559 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -347,6 +347,38 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_wc_show(struct request_queue *q, char *page)
+{
+	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+		return sprintf(page, "write back\n");
+
+	return sprintf(page, "write through\n");
+}
+
+static ssize_t queue_wc_store(struct request_queue *q, const char *page,
+			      size_t count)
+{
+	int set = -1;
+
+	if (!strncmp(page, "write back", 10))
+		set = 1;
+	else if (!strncmp(page, "write through", 13) ||
+		 !strncmp(page, "none", 4))
+		set = 0;
+
+	if (set == -1)
+		return -EINVAL;
+
+	spin_lock_irq(q->queue_lock);
+	if (set)
+		queue_flag_set(QUEUE_FLAG_WC, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_WC, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return count;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -478,6 +510,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
 	.store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_wc_entry = {
+	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wc_show,
+	.store = queue_wc_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -503,6 +541,7 @@ static struct attribute *default_attrs[] = {
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
+	&queue_wc_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bbaa76757018..ba72687c5654 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -491,6 +491,8 @@ struct request_queue {
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
 #define QUEUE_FLAG_POLL	       22	/* IO polling enabled if set */
+#define QUEUE_FLAG_WC	       23	/* Write back caching */
+#define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -1009,6 +1011,7 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
+extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
-- 
cgit v1.2.3


From 2245f6de6c68b225986229a2de78c240536f7f38 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 30 Mar 2016 10:19:30 -0600
Subject: block: kill blk_queue_flush()

We don't have any drivers left using it, so kill it off. Update
documentation to use the newer blk_queue_write_cache().

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/block/writeback_cache_control.txt |  4 ++--
 block/blk-settings.c                            | 20 --------------------
 include/linux/blkdev.h                          |  1 -
 3 files changed, 2 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
index 83407d36630a..59e0516cbf6b 100644
--- a/Documentation/block/writeback_cache_control.txt
+++ b/Documentation/block/writeback_cache_control.txt
@@ -71,7 +71,7 @@ requests that have a payload.  For devices with volatile write caches the
 driver needs to tell the block layer that it supports flushing caches by
 doing:
 
-	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
+	blk_queue_write_cache(sdkp->disk->queue, true, false);
 
 and handle empty REQ_FLUSH requests in its prep_fn/request_fn.  Note that
 REQ_FLUSH requests with a payload are automatically turned into a sequence
@@ -79,7 +79,7 @@ of an empty REQ_FLUSH request followed by the actual write by the block
 layer.  For devices that also support the FUA bit the block layer needs
 to be told to pass through the REQ_FUA bit using:
 
-	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
+	blk_queue_write_cache(sdkp->disk->queue, true, true);
 
 and the driver must handle write requests that have the REQ_FUA bit set
 in prep_fn/request_fn.  If the FUA bit is not natively supported the block
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c903bee43cf8..80d9327a214b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -820,26 +820,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-/**
- * blk_queue_flush - configure queue's cache flush capability
- * @q:		the request queue for the device
- * @flush:	0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
- *
- * Tell block layer cache flush capability of @q.  If it supports
- * flushing, REQ_FLUSH should be set.  If it supports bypassing
- * write cache for individual writes, REQ_FUA should be set.
- */
-void blk_queue_flush(struct request_queue *q, unsigned int flush)
-{
-	WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
-
-	if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
-		flush &= ~REQ_FUA;
-
-	q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
-}
-EXPORT_SYMBOL_GPL(blk_queue_flush);
-
 void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 {
 	q->flush_not_queueable = !queueable;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba72687c5654..f3f232fa505d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1009,7 +1009,6 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
-extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-- 
cgit v1.2.3


From bc405cd69a728d0a82bae8395fe43ec7b0afd1c6 Mon Sep 17 00:00:00 2001
From: Alexandre Macabies <web+oss@zopieux.com>
Date: Tue, 12 Apr 2016 18:53:00 +0200
Subject: ieee802154: add security bit check function

ieee802154_is_secen checks if the 802.15.4 security bit is set in the
frame control field.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Alexandre Macabies <web+oss@zopieux.com>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Acked-by: Alan Ott <alan@signal11.us>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index d3e415674dac..56090f195339 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -218,6 +218,7 @@ enum {
 /* frame control handling */
 #define IEEE802154_FCTL_FTYPE		0x0003
 #define IEEE802154_FCTL_ACKREQ		0x0020
+#define IEEE802154_FCTL_SECEN		0x0004
 #define IEEE802154_FCTL_INTRA_PAN	0x0040
 
 #define IEEE802154_FTYPE_DATA		0x0001
@@ -232,6 +233,15 @@ static inline int ieee802154_is_data(__le16 fc)
 		cpu_to_le16(IEEE802154_FTYPE_DATA);
 }
 
+/**
+ * ieee802154_is_secen - check if Security bit is set
+ * @fc: frame control bytes in little-endian byteorder
+ */
+static inline bool ieee802154_is_secen(__le16 fc)
+{
+	return fc & cpu_to_le16(IEEE802154_FCTL_SECEN);
+}
+
 /**
  * ieee802154_is_ackreq - check if acknowledgment request bit is set
  * @fc: frame control bytes in little-endian byteorder
-- 
cgit v1.2.3


From b7594148c73cb506487b5f00a6574beceea0e3a0 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 11 Apr 2016 11:04:14 +0200
Subject: ieee802154: cleanups for ieee802154.h

This patch removes some const from non-pointer types and fixes the
function name for the ieee802154_is_valid_extended_unicast_addr
comment.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Acked-by: Jukka Rissanen <jukka.rissanen@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 56090f195339..9d84a924b747 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -270,17 +270,17 @@ static inline bool ieee802154_is_intra_pan(__le16 fc)
  *
  * @len: psdu len with (MHR + payload + MFR)
  */
-static inline bool ieee802154_is_valid_psdu_len(const u8 len)
+static inline bool ieee802154_is_valid_psdu_len(u8 len)
 {
 	return (len == IEEE802154_ACK_PSDU_LEN ||
 		(len >= IEEE802154_MIN_PSDU_LEN && len <= IEEE802154_MTU));
 }
 
 /**
- * ieee802154_is_valid_psdu_len - check if extended addr is valid
+ * ieee802154_is_valid_extended_unicast_addr - check if extended addr is valid
  * @addr: extended addr to check
  */
-static inline bool ieee802154_is_valid_extended_unicast_addr(const __le64 addr)
+static inline bool ieee802154_is_valid_extended_unicast_addr(__le64 addr)
 {
 	/* Bail out if the address is all zero, or if the group
 	 * address bit is set.
-- 
cgit v1.2.3


From 118a5cf8ae236cdfa1eb4f21530843a8494722ef Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 11 Apr 2016 11:04:15 +0200
Subject: ieee802154: add short address helpers

This patch introduce some short address handling functionality into
ieee802154 headers.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Acked-by: Jukka Rissanen <jukka.rissanen@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 29 +++++++++++++++++++++++++++++
 include/net/mac802154.h    | 10 ++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 9d84a924b747..acedbb68a5a3 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -47,6 +47,7 @@
 #define IEEE802154_ADDR_SHORT_UNSPEC	0xfffe
 
 #define IEEE802154_EXTENDED_ADDR_LEN	8
+#define IEEE802154_SHORT_ADDR_LEN	2
 
 #define IEEE802154_LIFS_PERIOD		40
 #define IEEE802154_SIFS_PERIOD		12
@@ -289,6 +290,34 @@ static inline bool ieee802154_is_valid_extended_unicast_addr(__le64 addr)
 		!(addr & cpu_to_le64(0x0100000000000000ULL)));
 }
 
+/**
+ * ieee802154_is_broadcast_short_addr - check if short addr is broadcast
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_broadcast_short_addr(__le16 addr)
+{
+	return (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST));
+}
+
+/**
+ * ieee802154_is_unspec_short_addr - check if short addr is unspecified
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_unspec_short_addr(__le16 addr)
+{
+	return (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC));
+}
+
+/**
+ * ieee802154_is_valid_src_short_addr - check if source short address is valid
+ * @addr: short addr to check
+ */
+static inline bool ieee802154_is_valid_src_short_addr(__le16 addr)
+{
+	return !(ieee802154_is_broadcast_short_addr(addr) ||
+		 ieee802154_is_unspec_short_addr(addr));
+}
+
 /**
  * ieee802154_random_extended_addr - generates a random extended address
  * @addr: extended addr pointer to place the random address
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 6cd7a70706a9..e465c8551ac3 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -287,6 +287,16 @@ static inline void ieee802154_le16_to_be16(void *be16_dst, const void *le16_src)
 	put_unaligned_be16(get_unaligned_le16(le16_src), be16_dst);
 }
 
+/**
+ * ieee802154_be16_to_le16 - copies and convert be16 to le16
+ * @le16_dst: le16 destination pointer
+ * @be16_src: be16 source pointer
+ */
+static inline void ieee802154_be16_to_le16(void *le16_dst, const void *be16_src)
+{
+	put_unaligned_le16(get_unaligned_be16(be16_src), le16_dst);
+}
+
 /**
  * ieee802154_alloc_hw - Allocate a new hardware device
  *
-- 
cgit v1.2.3


From c422025c185fb2bb28df65b1bbed7953480c7f87 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 18 Mar 2016 16:24:41 +0200
Subject: dmaengine: dw: rename masters to reflect actual topology

The source and destination masters are reflecting buses or their layers to
where the different devices can be connected. The patch changes the master
names to reflect which one is related to which independently on the transfer
direction.

The outcome of the change is that the memory data width is now always limited
by a data width of the master which is dedicated to communicate to memory.

The patch will not break anything since all current users have the same data
width for all masters. Though it would be nice to revisit avr32 platforms to
check what is the actual hardware topology in use there. It seems that it has
one bus and two masters on it as stated by Table 8-2, that's why everything
works independently on the master in use. The purpose of the sequential patch
is to fix the driver for configuration of more than one bus.

The change is done in the assumption that src_master and dst_master are
reflecting a connection to the memory and peripheral correspondently on avr32
and otherwise on the rest.

Acked-by: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 Documentation/devicetree/bindings/dma/snps-dma.txt |  4 ++--
 arch/avr32/mach-at32ap/at32ap700x.c                | 16 ++++++++--------
 drivers/ata/sata_dwc_460ex.c                       |  4 ++--
 drivers/dma/dw/core.c                              | 19 +++++++++----------
 drivers/dma/dw/platform.c                          | 12 ++++++------
 drivers/dma/dw/regs.h                              |  4 ++--
 drivers/spi/spi-pxa2xx-pci.c                       |  8 ++++----
 drivers/tty/serial/8250/8250_pci.c                 |  8 ++++----
 include/linux/platform_data/dma-dw.h               |  8 ++++----
 9 files changed, 41 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt
index c261598164a7..c99c1ffac199 100644
--- a/Documentation/devicetree/bindings/dma/snps-dma.txt
+++ b/Documentation/devicetree/bindings/dma/snps-dma.txt
@@ -47,8 +47,8 @@ The four cells in order are:
 
 1. A phandle pointing to the DMA controller
 2. The DMA request line number
-3. Source master for transfers on allocated channel
-4. Destination master for transfers on allocated channel
+3. Memory master for transfers on allocated channel
+4. Peripheral master for transfers on allocated channel
 
 Example:
 	
diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c
index bf445aa48282..00d6dcc1d9b6 100644
--- a/arch/avr32/mach-at32ap/at32ap700x.c
+++ b/arch/avr32/mach-at32ap/at32ap700x.c
@@ -1365,8 +1365,8 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data)
 	slave->dma_dev = &dw_dmac0_device.dev;
 	slave->src_id = 0;
 	slave->dst_id = 1;
-	slave->src_master = 1;
-	slave->dst_master = 0;
+	slave->m_master = 1;
+	slave->p_master = 0;
 
 	data->dma_slave = slave;
 	data->dma_filter = at32_mci_dma_filter;
@@ -2061,16 +2061,16 @@ at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data,
 	if (flags & AC97C_CAPTURE) {
 		rx_dws->dma_dev = &dw_dmac0_device.dev;
 		rx_dws->src_id = 3;
-		rx_dws->src_master = 0;
-		rx_dws->dst_master = 1;
+		rx_dws->m_master = 0;
+		rx_dws->p_master = 1;
 	}
 
 	/* Check if DMA slave interface for playback should be configured. */
 	if (flags & AC97C_PLAYBACK) {
 		tx_dws->dma_dev = &dw_dmac0_device.dev;
 		tx_dws->dst_id = 4;
-		tx_dws->src_master = 0;
-		tx_dws->dst_master = 1;
+		tx_dws->m_master = 0;
+		tx_dws->p_master = 1;
 	}
 
 	if (platform_device_add_data(pdev, data,
@@ -2141,8 +2141,8 @@ at32_add_device_abdac(unsigned int id, struct atmel_abdac_pdata *data)
 
 	dws->dma_dev = &dw_dmac0_device.dev;
 	dws->dst_id = 2;
-	dws->src_master = 0;
-	dws->dst_master = 1;
+	dws->m_master = 0;
+	dws->p_master = 1;
 
 	if (platform_device_add_data(pdev, data,
 				sizeof(struct atmel_abdac_pdata)))
diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 902034991517..80bdcabc293f 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -201,8 +201,8 @@ static struct sata_dwc_host_priv host_pvt;
 static struct dw_dma_slave sata_dwc_dma_dws = {
 	.src_id = 0,
 	.dst_id = 0,
-	.src_master = 0,
-	.dst_master = 1,
+	.m_master = 1,
+	.p_master = 0,
 };
 
 /*
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 97199b3c25a2..5bd7873a02c6 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -50,8 +50,8 @@
 		 | DWC_CTLL_SRC_MSIZE(_smsize)			\
 		 | DWC_CTLL_LLP_D_EN				\
 		 | DWC_CTLL_LLP_S_EN				\
-		 | DWC_CTLL_DMS(_dwc->dst_master)		\
-		 | DWC_CTLL_SMS(_dwc->src_master));		\
+		 | DWC_CTLL_DMS(_dwc->p_master)			\
+		 | DWC_CTLL_SMS(_dwc->m_master));		\
 	})
 
 /*
@@ -709,8 +709,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
 	dwc->direction = DMA_MEM_TO_MEM;
 
-	data_width = min_t(unsigned int, dw->data_width[dwc->src_master],
-			   dw->data_width[dwc->dst_master]);
+	data_width = dw->data_width[dwc->m_master];
 
 	src_width = dst_width = min_t(unsigned int, data_width,
 				      dwc_fast_ffs(src | dest | len));
@@ -802,7 +801,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
 			DWC_CTLL_FC(DW_DMA_FC_D_M2P);
 
-		data_width = dw->data_width[dwc->src_master];
+		data_width = dw->data_width[dwc->m_master];
 
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
@@ -859,7 +858,7 @@ slave_sg_todev_fill_desc:
 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
 			DWC_CTLL_FC(DW_DMA_FC_D_P2M);
 
-		data_width = dw->data_width[dwc->dst_master];
+		data_width = dw->data_width[dwc->m_master];
 
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
@@ -937,8 +936,8 @@ bool dw_dma_filter(struct dma_chan *chan, void *param)
 	dwc->src_id = dws->src_id;
 	dwc->dst_id = dws->dst_id;
 
-	dwc->src_master = dws->src_master;
-	dwc->dst_master = dws->dst_master;
+	dwc->m_master = dws->m_master;
+	dwc->p_master = dws->p_master;
 
 	return true;
 }
@@ -1227,8 +1226,8 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	dwc->src_id = 0;
 	dwc->dst_id = 0;
 
-	dwc->src_master = 0;
-	dwc->dst_master = 0;
+	dwc->m_master = 0;
+	dwc->p_master = 0;
 
 	dwc->initialized = false;
 
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 26edbe3a27ac..23616c57645c 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -42,13 +42,13 @@ static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec,
 
 	slave.src_id = dma_spec->args[0];
 	slave.dst_id = dma_spec->args[0];
-	slave.src_master = dma_spec->args[1];
-	slave.dst_master = dma_spec->args[2];
+	slave.m_master = dma_spec->args[1];
+	slave.p_master = dma_spec->args[2];
 
 	if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS ||
 		    slave.dst_id >= DW_DMA_MAX_NR_REQUESTS ||
-		    slave.src_master >= dw->nr_masters ||
-		    slave.dst_master >= dw->nr_masters))
+		    slave.m_master >= dw->nr_masters ||
+		    slave.p_master >= dw->nr_masters))
 		return NULL;
 
 	dma_cap_zero(cap);
@@ -66,8 +66,8 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param)
 		.dma_dev = dma_spec->dev,
 		.src_id = dma_spec->slave_id,
 		.dst_id = dma_spec->slave_id,
-		.src_master = 1,
-		.dst_master = 0,
+		.m_master = 0,
+		.p_master = 1,
 	};
 
 	return dw_dma_filter(chan, &slave);
diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h
index 0a50c18d85b8..a63d62bbffe2 100644
--- a/drivers/dma/dw/regs.h
+++ b/drivers/dma/dw/regs.h
@@ -249,8 +249,8 @@ struct dw_dma_chan {
 	/* custom slave configuration */
 	u8			src_id;
 	u8			dst_id;
-	u8			src_master;
-	u8			dst_master;
+	u8			m_master;
+	u8			p_master;
 
 	/* configuration passed via .device_config */
 	struct dma_slave_config dma_sconfig;
diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index 520ed1dd5780..4fd7f9802f1b 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -144,16 +144,16 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev,
 		struct dw_dma_slave *slave = c->tx_param;
 
 		slave->dma_dev = &dma_dev->dev;
-		slave->src_master = 1;
-		slave->dst_master = 0;
+		slave->m_master = 0;
+		slave->p_master = 1;
 	}
 
 	if (c->rx_param) {
 		struct dw_dma_slave *slave = c->rx_param;
 
 		slave->dma_dev = &dma_dev->dev;
-		slave->src_master = 1;
-		slave->dst_master = 0;
+		slave->m_master = 0;
+		slave->p_master = 1;
 	}
 
 	spi_pdata.dma_filter = lpss_dma_filter;
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 98862aa5bb58..5eea74d7f9f4 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1454,13 +1454,13 @@ byt_serial_setup(struct serial_private *priv,
 		return -EINVAL;
 	}
 
-	rx_param->src_master = 1;
-	rx_param->dst_master = 0;
+	rx_param->m_master = 0;
+	rx_param->p_master = 1;
 
 	dma->rxconf.src_maxburst = 16;
 
-	tx_param->src_master = 1;
-	tx_param->dst_master = 0;
+	tx_param->m_master = 0;
+	tx_param->p_master = 1;
 
 	dma->txconf.dst_maxburst = 16;
 
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index 03b6095d3b18..b881b978e486 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -21,15 +21,15 @@
  * @dma_dev:	required DMA master device
  * @src_id:	src request line
  * @dst_id:	dst request line
- * @src_master: src master for transfers on allocated channel.
- * @dst_master: dest master for transfers on allocated channel.
+ * @m_master:	memory master for transfers on allocated channel
+ * @p_master:	peripheral master for transfers on allocated channel
  */
 struct dw_dma_slave {
 	struct device		*dma_dev;
 	u8			src_id;
 	u8			dst_id;
-	u8			src_master;
-	u8			dst_master;
+	u8			m_master;
+	u8			p_master;
 };
 
 /**
-- 
cgit v1.2.3


From c888a8f95ae5b1067855235b3b71c1ebccf504f5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 13 Apr 2016 13:33:19 -0600
Subject: block: kill off q->flush_flags

Now that we converted everything to the newer block write cache
interface, kill off the queue flush_flags and queueable flush
entries.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                    |  3 ++-
 block/blk-flush.c                   | 11 ++++++-----
 block/blk-settings.c                | 18 ++++++++++--------
 drivers/block/xen-blkback/xenbus.c  |  2 +-
 drivers/md/dm-table.c               | 12 ++++++------
 drivers/md/raid5-cache.c            |  3 ++-
 drivers/target/target_core_iblock.c |  6 +++---
 include/linux/blkdev.h              |  5 ++---
 8 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index c50227796a26..2475b1c72773 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1964,7 +1964,8 @@ generic_make_request_checks(struct bio *bio)
 	 * drivers without flush support don't have to worry
 	 * about them.
 	 */
-	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
 		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
 		if (!nr_sectors) {
 			err = 0;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9c423e53324a..b1c91d229e5e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,17 +95,18 @@ enum {
 static bool blk_kick_flush(struct request_queue *q,
 			   struct blk_flush_queue *fq);
 
-static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
+static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
 {
 	unsigned int policy = 0;
 
 	if (blk_rq_sectors(rq))
 		policy |= REQ_FSEQ_DATA;
 
-	if (fflags & REQ_FLUSH) {
+	if (fflags & (1UL << QUEUE_FLAG_WC)) {
 		if (rq->cmd_flags & REQ_FLUSH)
 			policy |= REQ_FSEQ_PREFLUSH;
-		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
+		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
+		    (rq->cmd_flags & REQ_FUA))
 			policy |= REQ_FSEQ_POSTFLUSH;
 	}
 	return policy;
@@ -384,7 +385,7 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 void blk_insert_flush(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	unsigned int fflags = q->flush_flags;	/* may change, cache */
+	unsigned long fflags = q->queue_flags;	/* may change, cache */
 	unsigned int policy = blk_flush_policy(fflags, rq);
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
@@ -393,7 +394,7 @@ void blk_insert_flush(struct request *rq)
 	 * REQ_FLUSH and FUA for the driver.
 	 */
 	rq->cmd_flags &= ~REQ_FLUSH;
-	if (!(fflags & REQ_FUA))
+	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
 		rq->cmd_flags &= ~REQ_FUA;
 
 	/*
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 80d9327a214b..f679ae122843 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -822,7 +822,12 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
 void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 {
-	q->flush_not_queueable = !queueable;
+	spin_lock_irq(q->queue_lock);
+	if (queueable)
+		clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
+	else
+		set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
+	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
@@ -837,16 +842,13 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 {
 	spin_lock_irq(q->queue_lock);
-	if (wc) {
+	if (wc)
 		queue_flag_set(QUEUE_FLAG_WC, q);
-		q->flush_flags = REQ_FLUSH;
-	} else
+	else
 		queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua) {
-		if (wc)
-			q->flush_flags |= REQ_FUA;
+	if (fua)
 		queue_flag_set(QUEUE_FLAG_FUA, q);
-	} else
+	else
 		queue_flag_clear(QUEUE_FLAG_FUA, q);
 	spin_unlock_irq(q->queue_lock);
 }
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 26aa080e243c..3355f1cdd4e5 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -477,7 +477,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 		vbd->type |= VDISK_REMOVABLE;
 
 	q = bdev_get_queue(bdev);
-	if (q && q->flush_flags)
+	if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 		vbd->flush_support = true;
 
 	if (q && blk_queue_secdiscard(q))
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1ffc0abe11..626a5ec04466 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1348,13 +1348,13 @@ static void dm_table_verify_integrity(struct dm_table *t)
 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
 				sector_t start, sector_t len, void *data)
 {
-	unsigned flush = (*(unsigned *)data);
+	unsigned long flush = (unsigned long) data;
 	struct request_queue *q = bdev_get_queue(dev->bdev);
 
-	return q && (q->flush_flags & flush);
+	return q && (q->queue_flags & flush);
 }
 
-static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 {
 	struct dm_target *ti;
 	unsigned i = 0;
@@ -1375,7 +1375,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 			return true;
 
 		if (ti->type->iterate_devices &&
-		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
+		    ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
 			return true;
 	}
 
@@ -1518,9 +1518,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 
-	if (dm_table_supports_flush(t, REQ_FLUSH)) {
+	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
 		wc = true;
-		if (dm_table_supports_flush(t, REQ_FUA))
+		if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
 			fua = true;
 	}
 	blk_queue_write_cache(q, wc, fua);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9531f5f05b93..26f14970a858 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1188,6 +1188,7 @@ ioerr:
 
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
+	struct request_queue *q = bdev_get_queue(rdev->bdev);
 	struct r5l_log *log;
 
 	if (PAGE_SIZE != 4096)
@@ -1197,7 +1198,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 		return -ENOMEM;
 	log->rdev = rdev;
 
-	log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
+	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
 
 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
 				       sizeof(rdev->mddev->uuid));
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 026a758e5778..7c4efb4417b0 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -687,10 +687,10 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 		 * Force writethrough using WRITE_FUA if a volatile write cache
 		 * is not enabled, or if initiator set the Force Unit Access bit.
 		 */
-		if (q->flush_flags & REQ_FUA) {
+		if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) {
 			if (cmd->se_cmd_flags & SCF_FUA)
 				rw = WRITE_FUA;
-			else if (!(q->flush_flags & REQ_FLUSH))
+			else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
@@ -836,7 +836,7 @@ static bool iblock_get_write_cache(struct se_device *dev)
 	struct block_device *bd = ib_dev->ibd_bd;
 	struct request_queue *q = bdev_get_queue(bd);
 
-	return q->flush_flags & REQ_FLUSH;
+	return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
 }
 
 static const struct target_backend_ops iblock_ops = {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3f232fa505d..57c085917da6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -433,8 +433,6 @@ struct request_queue {
 	/*
 	 * for flush operations
 	 */
-	unsigned int		flush_flags;
-	unsigned int		flush_not_queueable:1;
 	struct blk_flush_queue	*fq;
 
 	struct list_head	requeue_list;
@@ -493,6 +491,7 @@ struct request_queue {
 #define QUEUE_FLAG_POLL	       22	/* IO polling enabled if set */
 #define QUEUE_FLAG_WC	       23	/* Write back caching */
 #define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
+#define QUEUE_FLAG_FLUSH_NQ    25	/* flush not queueuable */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -1365,7 +1364,7 @@ static inline unsigned int block_size(struct block_device *bdev)
 
 static inline bool queue_flush_queueable(struct request_queue *q)
 {
-	return !q->flush_not_queueable;
+	return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
 }
 
 typedef struct {struct page *v;} Sector;
-- 
cgit v1.2.3


From 7d35812c3214afa5b37a675113555259cfd67b98 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:23 +0200
Subject: netfilter: x_tables: add and use xt_check_entry_offsets

Currently arp/ip and ip6tables each implement a short helper to check that
the target offset is large enough to hold one xt_entry_target struct and
that t->u.target_size fits within the current rule.

Unfortunately these checks are not sufficient.

To avoid adding new tests to all of ip/ip6/arptables move the current
checks into a helper, then extend this helper in followup patches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++++
 net/ipv4/netfilter/arp_tables.c    | 11 +----------
 net/ipv4/netfilter/ip_tables.c     | 12 +-----------
 net/ipv6/netfilter/ip6_tables.c    | 12 +-----------
 net/netfilter/x_tables.c           | 34 ++++++++++++++++++++++++++++++++++
 5 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 80a305b85323..0de0862897a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,6 +242,10 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset);
+
 int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 		   bool inv_proto);
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ec37f7c3a033..74668c1d3243 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -496,19 +496,10 @@ next:
 
 static inline int check_entry(const struct arpt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
-		return -EINVAL;
-
-	t = arpt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static inline int check_target(struct arpt_entry *e, const char *name)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 503038ea7735..71c204d4ca5f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -590,20 +590,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ipt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 126f2a0f006a..24ae7f458b3b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -602,20 +602,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ip6t_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 582c9cfd6567..1f44bfa8dd94 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -541,6 +541,40 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 #endif /* CONFIG_COMPAT */
 
+/**
+ * xt_check_entry_offsets - validate arp/ip/ip6t_entry
+ *
+ * @base: pointer to arp/ip/ip6t_entry
+ * @target_offset: the arp/ip/ip6_t->target_offset
+ * @next_offset: the arp/ip/ip6_t->next_offset
+ *
+ * validates that target_offset and next_offset are sane.
+ *
+ * The arp/ip/ip6t_entry structure @base must have passed following tests:
+ * - it must point to a valid memory location
+ * - base to base + next_offset must be accessible, i.e. not exceed allocated
+ *   length.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset)
+{
+	const struct xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_entry_offsets);
+
 int xt_check_target(struct xt_tgchk_param *par,
 		    unsigned int size, u_int8_t proto, bool inv_proto)
 {
-- 
cgit v1.2.3


From fc1221b3a163d1386d1052184202d5dc50d302d1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:26 +0200
Subject: netfilter: x_tables: add compat version of xt_check_entry_offsets

32bit rulesets have different layout and alignment requirements, so once
more integrity checks get added to xt_check_entry_offsets it will reject
well-formed 32bit rulesets.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  3 +++
 net/ipv4/netfilter/arp_tables.c    |  3 ++-
 net/ipv4/netfilter/ip_tables.c     |  3 ++-
 net/ipv6/netfilter/ip6_tables.c    |  3 ++-
 net/netfilter/x_tables.c           | 22 ++++++++++++++++++++++
 5 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 0de0862897a4..08de48bbe92e 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -494,6 +494,9 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset);
 
 #endif /* CONFIG_COMPAT */
 #endif /* _X_TABLES_H */
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 24ad92a60b7a..ab8952a49bfa 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1254,7 +1254,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+					    e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cdf18502a047..7d24c872723f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1513,7 +1513,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e3783116ed48..73eee7b5fd60 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1525,7 +1525,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ec1b7183fff9..fa206ceb269f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -539,6 +539,27 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset)
+{
+	const struct compat_xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (t->u.target_size < sizeof(*t))
+		return -EINVAL;
+
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_compat_check_entry_offsets);
 #endif /* CONFIG_COMPAT */
 
 /**
@@ -549,6 +570,7 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
+ * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
-- 
cgit v1.2.3


From ce683e5f9d045e5d67d1312a42b359cb2ab2a13c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:28 +0200
Subject: netfilter: x_tables: check for bogus target offset

We're currently asserting that targetoff + targetsize <= nextoff.

Extend it to also check that targetoff is >= sizeof(xt_entry).
Since this is generic code, add an argument pointing to the start of the
match/target, we can then derive the base structure size from the delta.

We also need the e->elems pointer in a followup change to validate matches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++--
 net/ipv4/netfilter/arp_tables.c    |  5 +++--
 net/ipv4/netfilter/ip_tables.c     |  5 +++--
 net/ipv6/netfilter/ip6_tables.c    |  5 +++--
 net/netfilter/x_tables.c           | 17 +++++++++++++++--
 5 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 08de48bbe92e..30cfb1e943fb 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,7 +242,7 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
-int xt_check_entry_offsets(const void *base,
+int xt_check_entry_offsets(const void *base, const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset);
 
@@ -494,7 +494,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ab8952a49bfa..95ed4e454c60 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -592,7 +592,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1254,7 +1255,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+	ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset,
 					    e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7d24c872723f..baab033d74e0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -754,7 +754,8 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1513,7 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73eee7b5fd60..6957627c7931 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -766,7 +766,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1525,7 +1526,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 1cb7a271c024..e2a6f2a9051b 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -546,14 +546,17 @@ struct compat_xt_standard_target {
 	compat_uint_t verdict;
 };
 
-/* see xt_check_entry_offsets */
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct compat_xt_entry_target *t;
 	const char *e = base;
 
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
@@ -577,12 +580,16 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * xt_check_entry_offsets - validate arp/ip/ip6t_entry
  *
  * @base: pointer to arp/ip/ip6t_entry
+ * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems
  * @target_offset: the arp/ip/ip6_t->target_offset
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
  * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
+ * This function does not validate the targets or matches themselves, it
+ * only tests that all the offsets and sizes are correct.
+ *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
  * - base to base + next_offset must be accessible, i.e. not exceed allocated
@@ -591,12 +598,18 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * Return: 0 on success, negative errno on failure.
  */
 int xt_check_entry_offsets(const void *base,
+			   const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct xt_entry_target *t;
 	const char *e = base;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 0188346f21e6546498c2a0f84888797ad4063fc5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 14:17:33 +0200
Subject: netfilter: x_tables: xt_compat_match_from_user doesn't need a retval

Always returned 0.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  2 +-
 net/ipv4/netfilter/arp_tables.c    | 17 +++++------------
 net/ipv4/netfilter/ip_tables.c     | 26 +++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    | 27 +++++++++------------------
 net/netfilter/x_tables.c           |  5 ++---
 5 files changed, 26 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 30cfb1e943fb..e2da9b90f1b8 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -484,7 +484,7 @@ void xt_compat_init_offsets(u_int8_t af, unsigned int number);
 int xt_compat_calc_jump(u_int8_t af, unsigned int offset);
 
 int xt_compat_match_offset(const struct xt_match *match);
-int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 			      unsigned int *size);
 int xt_compat_match_to_user(const struct xt_entry_match *m,
 			    void __user **dstptr, unsigned int *size);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1d1386dc159b..be514c676fad 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1310,7 +1310,7 @@ out:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1319,9 +1319,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 	struct xt_target *target;
 	struct arpt_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct arpt_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct arpt_entry));
@@ -1342,7 +1341,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int translate_compat_table(struct xt_table_info **pinfo,
@@ -1421,16 +1419,11 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	entry1 = newinfo->entries;
 	pos = entry1;
 	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
 	xt_compat_flush_offsets(NFPROTO_ARP);
 	xt_compat_unlock(NFPROTO_ARP);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d70418604503..5c20eef980c1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1568,7 +1568,7 @@ release_matches:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1577,10 +1577,9 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	struct xt_target *target;
 	struct ipt_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 	struct xt_entry_match *ematch;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct ipt_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct ipt_entry));
@@ -1589,11 +1588,9 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	*dstptr += sizeof(struct ipt_entry);
 	*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 
-	xt_ematch_foreach(ematch, e) {
-		ret = xt_compat_match_from_user(ematch, dstptr, size);
-		if (ret != 0)
-			return ret;
-	}
+	xt_ematch_foreach(ematch, e)
+		xt_compat_match_from_user(ematch, dstptr, size);
+
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = compat_ipt_get_target(e);
 	target = t->u.kernel.target;
@@ -1606,7 +1603,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int
@@ -1729,16 +1725,12 @@ translate_compat_table(struct net *net,
 	entry1 = newinfo->entries;
 	pos = entry1;
 	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
 	xt_compat_flush_offsets(AF_INET);
 	xt_compat_unlock(AF_INET);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 8d082c557771..620d54c1c119 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1580,7 +1580,7 @@ release_matches:
 	return ret;
 }
 
-static int
+static void
 compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
@@ -1588,10 +1588,9 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 	struct xt_entry_target *t;
 	struct ip6t_entry *de;
 	unsigned int origsize;
-	int ret, h;
+	int h;
 	struct xt_entry_match *ematch;
 
-	ret = 0;
 	origsize = *size;
 	de = (struct ip6t_entry *)*dstptr;
 	memcpy(de, e, sizeof(struct ip6t_entry));
@@ -1600,11 +1599,9 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 	*dstptr += sizeof(struct ip6t_entry);
 	*size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
 
-	xt_ematch_foreach(ematch, e) {
-		ret = xt_compat_match_from_user(ematch, dstptr, size);
-		if (ret != 0)
-			return ret;
-	}
+	xt_ematch_foreach(ematch, e)
+		xt_compat_match_from_user(ematch, dstptr, size);
+
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = compat_ip6t_get_target(e);
 	xt_compat_target_from_user(t, dstptr, size);
@@ -1616,7 +1613,6 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 		if ((unsigned char *)de - base < newinfo->underflow[h])
 			newinfo->underflow[h] -= origsize - *size;
 	}
-	return ret;
 }
 
 static int compat_check_entry(struct ip6t_entry *e, struct net *net,
@@ -1737,17 +1733,12 @@ translate_compat_table(struct net *net,
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
-	size = compatr->size;
-	xt_entry_foreach(iter0, entry0, compatr->size) {
-		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  newinfo, entry1);
-		if (ret != 0)
-			break;
-	}
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
 	xt_compat_flush_offsets(AF_INET6);
 	xt_compat_unlock(AF_INET6);
-	if (ret)
-		goto free_newinfo;
 
 	ret = -ELOOP;
 	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f9aa9715c32e..7e7173b68344 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -526,8 +526,8 @@ int xt_compat_match_offset(const struct xt_match *match)
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_offset);
 
-int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
-			      unsigned int *size)
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+			       unsigned int *size)
 {
 	const struct xt_match *match = m->u.kernel.match;
 	struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
@@ -549,7 +549,6 @@ int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 
 	*size += off;
 	*dstptr += msize;
-	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
 
-- 
cgit v1.2.3


From d7591f0c41ce3e67600a982bab6989ef0f07b3ce Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Apr 2016 15:37:59 +0200
Subject: netfilter: x_tables: introduce and use xt_copy_counters_from_user

The three variants use same copy&pasted code, condense this into a
helper and use that.

Make sure info.name is 0-terminated.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  3 ++
 net/ipv4/netfilter/arp_tables.c    | 48 +++----------------------
 net/ipv4/netfilter/ip_tables.c     | 48 +++----------------------
 net/ipv6/netfilter/ip6_tables.c    | 49 +++----------------------
 net/netfilter/x_tables.c           | 74 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index e2da9b90f1b8..4dd9306c9d56 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -251,6 +251,9 @@ int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
 		    bool inv_proto);
 
+void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
+				 struct xt_counters_info *info, bool compat);
+
 struct xt_table *xt_register_table(struct net *net,
 				   const struct xt_table *table,
 				   struct xt_table_info *bootstrap,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8cefb7a2606b..60f5161abcb4 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1123,55 +1123,17 @@ static int do_add_counters(struct net *net, const void __user *user,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	const char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct arpt_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
 
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
 
-	t = xt_find_table_lock(net, NFPROTO_ARP, name);
+	t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1179,7 +1141,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 9340ce0a7549..735d1ee8c1ab 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1307,55 +1307,17 @@ do_add_counters(struct net *net, const void __user *user,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	const char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct ipt_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
 
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
 
-	t = xt_find_table_lock(net, AF_INET, name);
+	t = xt_find_table_lock(net, AF_INET, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1363,7 +1325,7 @@ do_add_counters(struct net *net, const void __user *user,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index aa010856a255..73e606c719ef 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1319,55 +1319,16 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
-	unsigned int num_counters;
-	char *name;
-	int size;
-	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
 	struct ip6t_entry *iter;
 	unsigned int addend;
-#ifdef CONFIG_COMPAT
-	struct compat_xt_counters_info compat_tmp;
-
-	if (compat) {
-		ptmp = &compat_tmp;
-		size = sizeof(struct compat_xt_counters_info);
-	} else
-#endif
-	{
-		ptmp = &tmp;
-		size = sizeof(struct xt_counters_info);
-	}
-
-	if (copy_from_user(ptmp, user, size) != 0)
-		return -EFAULT;
-
-#ifdef CONFIG_COMPAT
-	if (compat) {
-		num_counters = compat_tmp.num_counters;
-		name = compat_tmp.name;
-	} else
-#endif
-	{
-		num_counters = tmp.num_counters;
-		name = tmp.name;
-	}
-
-	if (len != size + num_counters * sizeof(struct xt_counters))
-		return -EINVAL;
-
-	paddc = vmalloc(len - size);
-	if (!paddc)
-		return -ENOMEM;
-
-	if (copy_from_user(paddc, user + size, len - size) != 0) {
-		ret = -EFAULT;
-		goto free;
-	}
 
-	t = xt_find_table_lock(net, AF_INET6, name);
+	paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
+	t = xt_find_table_lock(net, AF_INET6, tmp.name);
 	if (IS_ERR_OR_NULL(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
@@ -1375,7 +1336,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 
 	local_bh_disable();
 	private = t->private;
-	if (private->number != num_counters) {
+	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 9ec23ffa43b4..c69c892231d7 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -752,6 +752,80 @@ int xt_check_target(struct xt_tgchk_param *par,
 }
 EXPORT_SYMBOL_GPL(xt_check_target);
 
+/**
+ * xt_copy_counters_from_user - copy counters and metadata from userspace
+ *
+ * @user: src pointer to userspace memory
+ * @len: alleged size of userspace memory
+ * @info: where to store the xt_counters_info metadata
+ * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
+ *
+ * Copies counter meta data from @user and stores it in @info.
+ *
+ * vmallocs memory to hold the counters, then copies the counter data
+ * from @user to the new memory and returns a pointer to it.
+ *
+ * If @compat is true, @info gets converted automatically to the 64bit
+ * representation.
+ *
+ * The metadata associated with the counters is stored in @info.
+ *
+ * Return: returns pointer that caller has to test via IS_ERR().
+ * If IS_ERR is false, caller has to vfree the pointer.
+ */
+void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
+				 struct xt_counters_info *info, bool compat)
+{
+	void *mem;
+	u64 size;
+
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		/* structures only differ in size due to alignment */
+		struct compat_xt_counters_info compat_tmp;
+
+		if (len <= sizeof(compat_tmp))
+			return ERR_PTR(-EINVAL);
+
+		len -= sizeof(compat_tmp);
+		if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+			return ERR_PTR(-EFAULT);
+
+		strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+		info->num_counters = compat_tmp.num_counters;
+		user += sizeof(compat_tmp);
+	} else
+#endif
+	{
+		if (len <= sizeof(*info))
+			return ERR_PTR(-EINVAL);
+
+		len -= sizeof(*info);
+		if (copy_from_user(info, user, sizeof(*info)) != 0)
+			return ERR_PTR(-EFAULT);
+
+		info->name[sizeof(info->name) - 1] = '\0';
+		user += sizeof(*info);
+	}
+
+	size = sizeof(struct xt_counters);
+	size *= info->num_counters;
+
+	if (size != (u64)len)
+		return ERR_PTR(-EINVAL);
+
+	mem = vmalloc(len);
+	if (!mem)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(mem, user, len) == 0)
+		return mem;
+
+	vfree(mem);
+	return ERR_PTR(-EFAULT);
+}
+EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+
 #ifdef CONFIG_COMPAT
 int xt_compat_target_offset(const struct xt_target *target)
 {
-- 
cgit v1.2.3


From f9a7cbbf18f1640907d6ca345b8337e4b50ea56f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Fri, 8 Apr 2016 17:51:54 +0200
Subject: net: force inlining of netif_tx_start/stop_queue, sock_hold,
 __sock_put

Sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122
Arguably, gcc should do better, but gcc people aren't willing
to invest time into it, asking to use __always_inline instead.

With this .config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os,
the following functions get deinlined many times.

netif_tx_stop_queue: 207 copies, 590 calls:
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 80 8f e0 01 00 00 01 lock orb $0x1,0x1e0(%rdi)
	5d                      pop    %rbp
	c3                      retq

netif_tx_start_queue: 47 copies, 111 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 80 a7 e0 01 00 00 fe lock andb $0xfe,0x1e0(%rdi)
	5d                      pop    %rbp
	c3                      retq

sock_hold: 39 copies, 124 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 ff 87 80 00 00 00    lock incl 0x80(%rdi)
	5d                      pop    %rbp
	c3                      retq

__sock_put: 6 copies, 13 calls
	55                      push   %rbp
	48 89 e5                mov    %rsp,%rbp
	f0 ff 8f 80 00 00 00    lock decl 0x80(%rdi)
	5d                      pop    %rbp
	c3                      retq

This patch fixes this via s/inline/__always_inline/.

Code size decrease after the patch is ~2.5k:

    text      data      bss       dec     hex filename
56719876  56364551 36196352 149280779 8e5d80b vmlinux_before
56717440  56364551 36196352 149278343 8e5ce87 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: David S. Miller <davem@davemloft.net>
CC: linux-kernel@vger.kernel.org
CC: netdev@vger.kernel.org
CC: netfilter-devel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 include/net/sock.h        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 166402ae3324..e906c6570b38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2787,7 +2787,7 @@ static inline void netif_tx_schedule_all(struct net_device *dev)
 		netif_schedule_queue(netdev_get_tx_queue(dev, i));
 }
 
-static inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
+static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
 {
 	clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
@@ -2837,7 +2837,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 	}
 }
 
-static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
+static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
 	set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index baba58770ac5..d997ec13a643 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -569,7 +569,7 @@ static inline bool __sk_del_node_init(struct sock *sk)
    modifications.
  */
 
-static inline void sock_hold(struct sock *sk)
+static __always_inline void sock_hold(struct sock *sk)
 {
 	atomic_inc(&sk->sk_refcnt);
 }
@@ -577,7 +577,7 @@ static inline void sock_hold(struct sock *sk)
 /* Ungrab socket in the context, which assumes that socket refcnt
    cannot hit zero, f.e. it is true in context of any socketcall.
  */
-static inline void __sock_put(struct sock *sk)
+static __always_inline void __sock_put(struct sock *sk)
 {
 	atomic_dec(&sk->sk_refcnt);
 }
-- 
cgit v1.2.3


From 743b03a83297690f0bd38c452a3bbb47d2be300a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 9 Apr 2016 11:29:58 -0700
Subject: net: remove netdevice gso_min_segs

After introduction of ndo_features_check(), we believe that very
specific checks for rare features should not be done in core
networking stack.

No driver uses gso_min_segs yet, so we revert this feature and save
few instructions per tx packet in fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 +---
 net/core/dev.c            | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e906c6570b38..9884fe9a6552 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1586,8 +1586,6 @@ enum netdev_priv_flags {
  *	@gso_max_size:	Maximum size of generic segmentation offload
  *	@gso_max_segs:	Maximum number of segments that can be passed to the
  *			NIC for GSO
- *	@gso_min_segs:	Minimum number of segments that can be passed to the
- *			NIC for GSO
  *
  *	@dcbnl_ops:	Data Center Bridging netlink ops
  *	@num_tc:	Number of traffic classes in the net device
@@ -1858,7 +1856,7 @@ struct net_device {
 	unsigned int		gso_max_size;
 #define GSO_MAX_SEGS		65535
 	u16			gso_max_segs;
-	u16			gso_min_segs;
+
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index d51343a821ed..09fb1ace9dc8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2831,7 +2831,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	netdev_features_t features = dev->features;
 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 
-	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
+	if (gso_segs > dev->gso_max_segs)
 		features &= ~NETIF_F_GSO_MASK;
 
 	/* If encapsulation offload request, verify we are testing
@@ -7429,7 +7429,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
-	dev->gso_min_segs = 0;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
-- 
cgit v1.2.3


From 95114344ea78649b1797d00ab6e88147bef66fa4 Mon Sep 17 00:00:00 2001
From: Rahul Verma <rahul.verma@qlogic.com>
Date: Sun, 10 Apr 2016 12:42:59 +0300
Subject: qed*: remove version dependency

Inbox drivers don't need versioning scheme in order to guarantee
compatibility, as both qed and qede are compiled from same codebase.

Signed-off-by: Rahul Verma <rahul.verma@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |  2 --
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  8 +-------
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 11 -----------
 drivers/net/ethernet/qlogic/qede/qede.h      |  2 --
 drivers/net/ethernet/qlogic/qede/qede_main.c | 11 +----------
 include/linux/qed/qed_eth_if.h               |  2 +-
 include/linux/qed/qed_if.h                   |  9 ---------
 7 files changed, 3 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index fcb8e9ba51d9..a3ee9df16dfe 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -507,6 +507,4 @@ u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
 
 int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
 
-#define QED_ETH_INTERFACE_VERSION       300
-
 #endif /* _QED_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 3f35c6ca9252..e848d5a1f7f6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2043,14 +2043,8 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 	.get_vport_stats = &qed_get_vport_stats,
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version)
+const struct qed_eth_ops *qed_get_eth_ops(void)
 {
-	if (version != QED_ETH_INTERFACE_VERSION) {
-		pr_notice("Cannot supply ethtool operations [%08x != %08x]\n",
-			  version, QED_ETH_INTERFACE_VERSION);
-		return NULL;
-	}
-
 	return &qed_eth_ops_pass;
 }
 EXPORT_SYMBOL(qed_get_eth_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 26d40db07ddd..c31d485f72d6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1172,14 +1172,3 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.chain_free = &qed_chain_free,
 	.set_led = &qed_set_led,
 };
-
-u32 qed_get_protocol_version(enum qed_protocol protocol)
-{
-	switch (protocol) {
-	case QED_PROTOCOL_ETH:
-		return QED_ETH_INTERFACE_VERSION;
-	default:
-		return 0;
-	}
-}
-EXPORT_SYMBOL(qed_get_protocol_version);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index d023251544d9..e0a696a57d4d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -32,8 +32,6 @@
 		__stringify(QEDE_REVISION_VERSION) "."		\
 		__stringify(QEDE_ENGINEERING_VERSION)
 
-#define QEDE_ETH_INTERFACE_VERSION	300
-
 #define DRV_MODULE_SYM		qede
 
 struct qede_stats {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 518af329502d..a55d93eb41fa 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -141,19 +141,10 @@ static
 int __init qede_init(void)
 {
 	int ret;
-	u32 qed_ver;
 
 	pr_notice("qede_init: %s\n", version);
 
-	qed_ver = qed_get_protocol_version(QED_PROTOCOL_ETH);
-	if (qed_ver !=  QEDE_ETH_INTERFACE_VERSION) {
-		pr_notice("Version mismatch [%08x != %08x]\n",
-			  qed_ver,
-			  QEDE_ETH_INTERFACE_VERSION);
-		return -EINVAL;
-	}
-
-	qed_ops = qed_get_eth_ops(QEDE_ETH_INTERFACE_VERSION);
+	qed_ops = qed_get_eth_ops();
 	if (!qed_ops) {
 		pr_notice("Failed to get qed ethtool operations\n");
 		return -EINVAL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e1d69834a11f..e00c8dbfc324 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -167,7 +167,7 @@ struct qed_eth_ops {
 				struct qed_eth_stats *stats);
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version);
+const struct qed_eth_ops *qed_get_eth_ops(void);
 void qed_put_eth_ops(void);
 
 #endif
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 1f7599c77cd4..b007011e1c82 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -271,15 +271,6 @@ struct qed_common_ops {
 		       enum qed_led_mode mode);
 };
 
-/**
- * @brief qed_get_protocol_version
- *
- * @param protocol
- *
- * @return version supported by qed for given protocol driver
- */
-u32 qed_get_protocol_version(enum qed_protocol protocol);
-
 #define MASK_FIELD(_name, _value) \
 	((_value) &= (_name ## _MASK))
 
-- 
cgit v1.2.3


From 8c5ebd0c792a097fcc0e526debbe0887ee378ae5 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Sun, 10 Apr 2016 12:43:00 +0300
Subject: qed: add Rx flow hash/indirection support.

Adds the required API for passing RSS-related configuration from qede.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c | 17 +----------------
 include/linux/qed/qed_eth_if.h           |  1 +
 include/linux/qed/qed_if.h               | 11 +++++++++++
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e848d5a1f7f6..5005497ee23e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -35,19 +35,6 @@
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 
-enum qed_rss_caps {
-	QED_RSS_IPV4		= 0x1,
-	QED_RSS_IPV6		= 0x2,
-	QED_RSS_IPV4_TCP	= 0x4,
-	QED_RSS_IPV6_TCP	= 0x8,
-	QED_RSS_IPV4_UDP	= 0x10,
-	QED_RSS_IPV6_UDP	= 0x20,
-};
-
-/* Should be the same as ETH_RSS_IND_TABLE_ENTRIES_NUM */
-#define QED_RSS_IND_TABLE_SIZE 128
-#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
-
 struct qed_rss_params {
 	u8	update_rss_config;
 	u8	rss_enable;
@@ -1744,9 +1731,7 @@ static int qed_update_vport(struct qed_dev *cdev,
 		sp_rss_params.update_rss_capabilities = 1;
 		sp_rss_params.update_rss_ind_table = 1;
 		sp_rss_params.update_rss_key = 1;
-		sp_rss_params.rss_caps = QED_RSS_IPV4 |
-					 QED_RSS_IPV6 |
-					 QED_RSS_IPV4_TCP | QED_RSS_IPV6_TCP;
+		sp_rss_params.rss_caps = params->rss_params.rss_caps;
 		sp_rss_params.rss_table_size_log = 7; /* 2^7 = 128 */
 		memcpy(sp_rss_params.rss_ind_table,
 		       params->rss_params.rss_ind_table,
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e00c8dbfc324..795c9902e02f 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -27,6 +27,7 @@ struct qed_dev_eth_info {
 struct qed_update_vport_rss_params {
 	u16	rss_ind_table[128];
 	u32	rss_key[10];
+	u8	rss_caps;
 };
 
 struct qed_update_vport_params {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index b007011e1c82..67e8c206b2c1 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -515,4 +515,15 @@ static inline void internal_ram_wr(void __iomem *addr,
 	__internal_ram_wr(NULL, addr, size, data);
 }
 
+enum qed_rss_caps {
+	QED_RSS_IPV4		= 0x1,
+	QED_RSS_IPV6		= 0x2,
+	QED_RSS_IPV4_TCP	= 0x4,
+	QED_RSS_IPV6_TCP	= 0x8,
+	QED_RSS_IPV4_UDP	= 0x10,
+	QED_RSS_IPV6_UDP	= 0x20,
+};
+
+#define QED_RSS_IND_TABLE_SIZE 128
+#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
 #endif
-- 
cgit v1.2.3


From 04cf31a759ef575f750a63777cee95500e410994 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 24 Mar 2016 22:04:01 +1100
Subject: ftrace: Make ftrace_location_range() global

In order to support live patching on powerpc we would like to call
ftrace_location_range(), so make it global.

Signed-off-by: Torsten Duwe <duwe@suse.de>
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/linux/ftrace.h |  1 +
 kernel/trace/ftrace.c  | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 81de7123959d..3481a8e405f9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -455,6 +455,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable);
 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
 void ftrace_run_stop_machine(int command);
 unsigned long ftrace_location(unsigned long ip);
+unsigned long ftrace_location_range(unsigned long start, unsigned long end);
 unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec);
 unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca592f977b2..e1b3f2312db0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1533,7 +1533,19 @@ static int ftrace_cmp_recs(const void *a, const void *b)
 	return 0;
 }
 
-static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
+/**
+ * ftrace_location_range - return the first address of a traced location
+ *	if it touches the given ip range
+ * @start: start of range to search.
+ * @end: end of range to search (inclusive). @end points to the last byte
+ *	to check.
+ *
+ * Returns rec->ip if the related ftrace location is a least partly within
+ * the given address range. That is, the first address of the instruction
+ * that is either a NOP or call to the function tracer. It checks the ftrace
+ * internal tables to determine if the address belongs or not.
+ */
+unsigned long ftrace_location_range(unsigned long start, unsigned long end)
 {
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
-- 
cgit v1.2.3


From d17322feecf80152303426dd724577025d1fbd7e Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sat, 9 Apr 2016 10:52:26 +0200
Subject: gpio: sx150x: move platform data into driver

The sx150x has some platform data definition in <linux/i2c/sx150x.h>
but this file is only included from the driver in the whole kernel
so move its contents into the driver.

Cc: Wei Chen <Wei.Chen@csr.com>
Cc: Peter Rosin <peda@axentia.se>
Acked-by: Wolfram Sang <wsa@the-dreams.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-sx150x.c | 60 ++++++++++++++++++++++++++++++++-
 include/linux/i2c/sx150x.h | 82 ----------------------------------------------
 2 files changed, 59 insertions(+), 83 deletions(-)
 delete mode 100644 include/linux/i2c/sx150x.h

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-sx150x.c b/drivers/gpio/gpio-sx150x.c
index d57e8ad0bfd2..d4501d5f8b8e 100644
--- a/drivers/gpio/gpio-sx150x.c
+++ b/drivers/gpio/gpio-sx150x.c
@@ -25,7 +25,6 @@
 #include <linux/irq.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/i2c/sx150x.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -85,6 +84,65 @@ struct sx150x_device_data {
 	} pri;
 };
 
+/**
+ * struct sx150x_platform_data - config data for SX150x driver
+ * @gpio_base: The index number of the first GPIO assigned to this
+ *             GPIO expander.  The expander will create a block of
+ *             consecutively numbered gpios beginning at the given base,
+ *             with the size of the block depending on the model of the
+ *             expander chip.
+ * @oscio_is_gpo: If set to true, the driver will configure OSCIO as a GPO
+ *                instead of as an oscillator, increasing the size of the
+ *                GP(I)O pool created by this expander by one.  The
+ *                output-only GPO pin will be added at the end of the block.
+ * @io_pullup_ena: A bit-mask which enables or disables the pull-up resistor
+ *                 for each IO line in the expander.  Setting the bit at
+ *                 position n will enable the pull-up for the IO at
+ *                 the corresponding offset.  For chips with fewer than
+ *                 16 IO pins, high-end bits are ignored.
+ * @io_pulldn_ena: A bit-mask which enables-or disables the pull-down
+ *                 resistor for each IO line in the expander. Setting the
+ *                 bit at position n will enable the pull-down for the IO at
+ *                 the corresponding offset.  For chips with fewer than
+ *                 16 IO pins, high-end bits are ignored.
+ * @io_open_drain_ena: A bit-mask which enables-or disables open-drain
+ *                     operation for each IO line in the expander. Setting the
+ *                     bit at position n enables open-drain operation for
+ *                     the IO at the corresponding offset.  Clearing the bit
+ *                     enables regular push-pull operation for that IO.
+ *                     For chips with fewer than 16 IO pins, high-end bits
+ *                     are ignored.
+ * @io_polarity: A bit-mask which enables polarity inversion for each IO line
+ *               in the expander.  Setting the bit at position n inverts
+ *               the polarity of that IO line, while clearing it results
+ *               in normal polarity. For chips with fewer than 16 IO pins,
+ *               high-end bits are ignored.
+ * @irq_summary: The 'summary IRQ' line to which the GPIO expander's INT line
+ *               is connected, via which it reports interrupt events
+ *               across all GPIO lines.  This must be a real,
+ *               pre-existing IRQ line.
+ *               Setting this value < 0 disables the irq_chip functionality
+ *               of the driver.
+ * @irq_base: The first 'virtual IRQ' line at which our block of GPIO-based
+ *            IRQ lines will appear.  Similarly to gpio_base, the expander
+ *            will create a block of irqs beginning at this number.
+ *            This value is ignored if irq_summary is < 0.
+ * @reset_during_probe: If set to true, the driver will trigger a full
+ *                      reset of the chip at the beginning of the probe
+ *                      in order to place it in a known state.
+ */
+struct sx150x_platform_data {
+	unsigned gpio_base;
+	bool     oscio_is_gpo;
+	u16      io_pullup_ena;
+	u16      io_pulldn_ena;
+	u16      io_open_drain_ena;
+	u16      io_polarity;
+	int      irq_summary;
+	unsigned irq_base;
+	bool     reset_during_probe;
+};
+
 struct sx150x_chip {
 	struct gpio_chip                 gpio_chip;
 	struct i2c_client               *client;
diff --git a/include/linux/i2c/sx150x.h b/include/linux/i2c/sx150x.h
deleted file mode 100644
index 52baa79d69a7..000000000000
--- a/include/linux/i2c/sx150x.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Driver for the Semtech SX150x I2C GPIO Expanders
- *
- * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-#ifndef __LINUX_I2C_SX150X_H
-#define __LINUX_I2C_SX150X_H
-
-/**
- * struct sx150x_platform_data - config data for SX150x driver
- * @gpio_base: The index number of the first GPIO assigned to this
- *             GPIO expander.  The expander will create a block of
- *             consecutively numbered gpios beginning at the given base,
- *             with the size of the block depending on the model of the
- *             expander chip.
- * @oscio_is_gpo: If set to true, the driver will configure OSCIO as a GPO
- *                instead of as an oscillator, increasing the size of the
- *                GP(I)O pool created by this expander by one.  The
- *                output-only GPO pin will be added at the end of the block.
- * @io_pullup_ena: A bit-mask which enables or disables the pull-up resistor
- *                 for each IO line in the expander.  Setting the bit at
- *                 position n will enable the pull-up for the IO at
- *                 the corresponding offset.  For chips with fewer than
- *                 16 IO pins, high-end bits are ignored.
- * @io_pulldn_ena: A bit-mask which enables-or disables the pull-down
- *                 resistor for each IO line in the expander. Setting the
- *                 bit at position n will enable the pull-down for the IO at
- *                 the corresponding offset.  For chips with fewer than
- *                 16 IO pins, high-end bits are ignored.
- * @io_open_drain_ena: A bit-mask which enables-or disables open-drain
- *                     operation for each IO line in the expander. Setting the
- *                     bit at position n enables open-drain operation for
- *                     the IO at the corresponding offset.  Clearing the bit
- *                     enables regular push-pull operation for that IO.
- *                     For chips with fewer than 16 IO pins, high-end bits
- *                     are ignored.
- * @io_polarity: A bit-mask which enables polarity inversion for each IO line
- *               in the expander.  Setting the bit at position n inverts
- *               the polarity of that IO line, while clearing it results
- *               in normal polarity. For chips with fewer than 16 IO pins,
- *               high-end bits are ignored.
- * @irq_summary: The 'summary IRQ' line to which the GPIO expander's INT line
- *               is connected, via which it reports interrupt events
- *               across all GPIO lines.  This must be a real,
- *               pre-existing IRQ line.
- *               Setting this value < 0 disables the irq_chip functionality
- *               of the driver.
- * @irq_base: The first 'virtual IRQ' line at which our block of GPIO-based
- *            IRQ lines will appear.  Similarly to gpio_base, the expander
- *            will create a block of irqs beginning at this number.
- *            This value is ignored if irq_summary is < 0.
- * @reset_during_probe: If set to true, the driver will trigger a full
- *                      reset of the chip at the beginning of the probe
- *                      in order to place it in a known state.
- */
-struct sx150x_platform_data {
-	unsigned gpio_base;
-	bool     oscio_is_gpo;
-	u16      io_pullup_ena;
-	u16      io_pulldn_ena;
-	u16      io_open_drain_ena;
-	u16      io_polarity;
-	int      irq_summary;
-	unsigned irq_base;
-	bool     reset_during_probe;
-};
-
-#endif /* __LINUX_I2C_SX150X_H */
-- 
cgit v1.2.3


From cbc53e08a793b073e79f42ca33f1f3568703540d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:44:51 -0400
Subject: GSO: Add GSO type for fixed IPv4 ID

This patch adds support for TSO using IPv4 headers with a fixed IP ID
field.  This is meant to allow us to do a lossless GRO in the case of TCP
flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
headers.

In addition I am adding a feature that for now I am referring to TSO with
IP ID mangling.  Basically when this flag is enabled the device has the
option to either output the flow with incrementing IP IDs or with a fixed
IP ID regardless of what the original IP ID ordering was.  This is useful
in cases where the DF bit is set and we do not care if the original IP ID
value is maintained.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  3 +++
 include/linux/netdevice.h       |  1 +
 include/linux/skbuff.h          | 20 +++++++++++---------
 net/core/dev.c                  | 34 +++++++++++++++++++++++++++++-----
 net/core/ethtool.c              |  1 +
 net/ipv4/af_inet.c              | 19 +++++++++++--------
 net/ipv4/gre_offload.c          |  1 +
 net/ipv4/tcp_offload.c          |  4 +++-
 net/ipv6/ip6_offload.c          |  3 ++-
 net/mpls/mpls_gso.c             |  1 +
 10 files changed, 63 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..7cf272a4b5c8 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
 	NETIF_F_UFO_BIT,		/* ... UDPv4 fragmentation */
 	NETIF_F_GSO_ROBUST_BIT,		/* ... ->SKB_GSO_DODGY */
 	NETIF_F_TSO_ECN_BIT,		/* ... TCP ECN support */
+	NETIF_F_TSO_MANGLEID_BIT,	/* ... IPV4 ID mangling allowed */
 	NETIF_F_TSO6_BIT,		/* ... TCPv6 segmentation */
 	NETIF_F_FSO_BIT,		/* ... FCoE segmentation */
 	NETIF_F_GSO_GRE_BIT,		/* ... GRE with TSO */
@@ -120,6 +121,7 @@ enum {
 #define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +149,7 @@ enum {
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
+				 NETIF_F_TSO_MANGLEID | \
 				 NETIF_F_TSO6 | NETIF_F_UFO)
 
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9884fe9a6552..8e372d01b3c1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3992,6 +3992,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_UDP     != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..5fba16658f9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,25 @@ enum {
 	/* This indicates the tcp segment has CWR set. */
 	SKB_GSO_TCP_ECN = 1 << 3,
 
-	SKB_GSO_TCPV6 = 1 << 4,
+	SKB_GSO_TCP_FIXEDID = 1 << 4,
 
-	SKB_GSO_FCOE = 1 << 5,
+	SKB_GSO_TCPV6 = 1 << 5,
 
-	SKB_GSO_GRE = 1 << 6,
+	SKB_GSO_FCOE = 1 << 6,
 
-	SKB_GSO_GRE_CSUM = 1 << 7,
+	SKB_GSO_GRE = 1 << 7,
 
-	SKB_GSO_IPIP = 1 << 8,
+	SKB_GSO_GRE_CSUM = 1 << 8,
 
-	SKB_GSO_SIT = 1 << 9,
+	SKB_GSO_IPIP = 1 << 9,
 
-	SKB_GSO_UDP_TUNNEL = 1 << 10,
+	SKB_GSO_SIT = 1 << 10,
 
-	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+	SKB_GSO_UDP_TUNNEL = 1 << 11,
 
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/dev.c b/net/core/dev.c
index 09fb1ace9dc8..e896b1953ab6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2825,14 +2825,36 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb,
 	return vlan_features_check(skb, features);
 }
 
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+					    struct net_device *dev,
+					    netdev_features_t features)
+{
+	u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+	if (gso_segs > dev->gso_max_segs)
+		return features & ~NETIF_F_GSO_MASK;
+
+	/* Make sure to clear the IPv4 ID mangling feature if
+	 * the IPv4 header has the potential to be fragmented.
+	 */
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+		struct iphdr *iph = skb->encapsulation ?
+				    inner_ip_hdr(skb) : ip_hdr(skb);
+
+		if (!(iph->frag_off & htons(IP_DF)))
+			features &= ~NETIF_F_TSO_MANGLEID;
+	}
+
+	return features;
+}
+
 netdev_features_t netif_skb_features(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 	netdev_features_t features = dev->features;
-	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 
-	if (gso_segs > dev->gso_max_segs)
-		features &= ~NETIF_F_GSO_MASK;
+	if (skb_is_gso(skb))
+		features = gso_features_check(skb, dev, features);
 
 	/* If encapsulation offload request, verify we are testing
 	 * hardware encapsulation features instead of standard
@@ -6976,9 +6998,11 @@ int register_netdevice(struct net_device *dev)
 	dev->features |= NETIF_F_SOFT_FEATURES;
 	dev->wanted_features = dev->features & dev->hw_features;
 
-	if (!(dev->flags & IFF_LOOPBACK)) {
+	if (!(dev->flags & IFF_LOOPBACK))
 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
-	}
+
+	if (dev->hw_features & NETIF_F_TSO)
+		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 
 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 	 */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6a7f99661c2f..9494c41cc77c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -79,6 +79,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_UFO_BIT] =              "tx-udp-fragmentation",
 	[NETIF_F_GSO_ROBUST_BIT] =       "tx-gso-robust",
 	[NETIF_F_TSO_ECN_BIT] =          "tx-tcp-ecn-segmentation",
+	[NETIF_F_TSO_MANGLEID_BIT] =	 "tx-tcp-mangleid-segmentation",
 	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation",
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8217cd22f921..5bbea9a0ce96 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1195,10 +1195,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 					netdev_features_t features)
 {
+	bool udpfrag = false, fixedid = false, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
-	bool udpfrag, encap;
 	struct iphdr *iph;
 	int proto;
 	int nhoff;
@@ -1217,6 +1217,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TCP_FIXEDID |
 		       SKB_GSO_TUNNEL_REMCSUM |
 		       0)))
 		goto out;
@@ -1248,11 +1249,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
-	if (skb->encapsulation &&
-	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
-		udpfrag = proto == IPPROTO_UDP && encap;
-	else
-		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+	if (!skb->encapsulation || encap) {
+		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+		/* fixed ID is invalid if DF bit is not set */
+		if (fixedid && !(iph->frag_off & htons(IP_DF)))
+			goto out;
+	}
 
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment))
@@ -1265,12 +1269,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
 		if (udpfrag) {
-			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else {
+		} else if (!fixedid) {
 			iph->id = htons(id++);
 		}
 		iph->tot_len = htons(skb->len - nhoff);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a5bd4317866..6376b0cdf693 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -32,6 +32,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_UDP |
 				  SKB_GSO_DODGY |
 				  SKB_GSO_TCP_ECN |
+				  SKB_GSO_TCP_FIXEDID |
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
 				  SKB_GSO_IPIP |
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 773083b7f1e9..08dd25d835af 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -89,6 +89,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			     ~(SKB_GSO_TCPV4 |
 			       SKB_GSO_DODGY |
 			       SKB_GSO_TCP_ECN |
+			       SKB_GSO_TCP_FIXEDID |
 			       SKB_GSO_TCPV6 |
 			       SKB_GSO_GRE |
 			       SKB_GSO_GRE_CSUM |
@@ -98,7 +99,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_UDP_TUNNEL_CSUM |
 			       SKB_GSO_TUNNEL_REMCSUM |
 			       0) ||
-			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+			     !(type & (SKB_GSO_TCPV4 |
+				       SKB_GSO_TCPV6))))
 			goto out;
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 204af2219471..b3a779393d71 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -73,6 +73,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP |
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
+		       SKB_GSO_TCP_FIXEDID |
+		       SKB_GSO_TCPV6 |
 		       SKB_GSO_GRE |
 		       SKB_GSO_GRE_CSUM |
 		       SKB_GSO_IPIP |
@@ -80,7 +82,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_TCPV6 |
 		       0)))
 		goto out;
 
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 0183b32da942..bbcf60465e5c 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -31,6 +31,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_TCPV6 |
 				  SKB_GSO_UDP |
 				  SKB_GSO_DODGY |
+				  SKB_GSO_TCP_FIXEDID |
 				  SKB_GSO_TCP_ECN)))
 		goto out;
 
-- 
cgit v1.2.3


From 1530545ed64b42e87acb43c0c16401bd1ebae6bf Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:44:57 -0400
Subject: GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID
 values

This patch does two things.

First it allows TCP to aggregate TCP frames with a fixed IPv4 ID field.  As
a result we should now be able to aggregate flows that were converted from
IPv6 to IPv4.  In addition this allows us more flexibility for future
implementations of segmentation as we may be able to use a fixed IP ID when
segmenting the flow.

The second thing this does is that it places limitations on the outer IPv4
ID header in the case of tunneled frames.  Specifically it forces the IP ID
to be incrementing by 1 unless the DF bit is set in the outer IPv4 header.
This way we can avoid creating overlapping series of IP IDs that could
possibly be fragmented if the frame goes through GRO and is then
resegmented via GSO.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  5 ++++-
 net/core/dev.c            |  1 +
 net/ipv4/af_inet.c        | 35 ++++++++++++++++++++++++++++-------
 net/ipv4/tcp_offload.c    | 16 +++++++++++++++-
 net/ipv6/ip6_offload.c    |  8 ++++++--
 5 files changed, 54 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8e372d01b3c1..2d70c521d516 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2121,7 +2121,10 @@ struct napi_gro_cb {
 	/* Used in GRE, set in fou/gue_gro_receive */
 	u8	is_fou:1;
 
-	/* 6 bit hole */
+	/* Used to determine if flush_id can be ignored */
+	u8	is_atomic:1;
+
+	/* 5 bit hole */
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index e896b1953ab6..b78b586b1856 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4462,6 +4462,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->free = 0;
 		NAPI_GRO_CB(skb)->encap_mark = 0;
 		NAPI_GRO_CB(skb)->is_fou = 0;
+		NAPI_GRO_CB(skb)->is_atomic = 1;
 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
 		/* Setup for GRO checksum validation */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5bbea9a0ce96..8564cab96189 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,6 +1328,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 
 	for (p = *head; p; p = p->next) {
 		struct iphdr *iph2;
+		u16 flush_id;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
@@ -1351,16 +1352,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 			(iph->tos ^ iph2->tos) |
 			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
-		/* Save the IP ID check to be included later when we get to
-		 * the transport layer so only the inner most IP ID is checked.
-		 * This is because some GSO/TSO implementations do not
-		 * correctly increment the IP ID for the outer hdrs.
-		 */
-		NAPI_GRO_CB(p)->flush_id =
-			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 		NAPI_GRO_CB(p)->flush |= flush;
+
+		/* We need to store of the IP ID check to be included later
+		 * when we can verify that this packet does in fact belong
+		 * to a given flow.
+		 */
+		flush_id = (u16)(id - ntohs(iph2->id));
+
+		/* This bit of code makes it much easier for us to identify
+		 * the cases where we are doing atomic vs non-atomic IP ID
+		 * checks.  Specifically an atomic check can return IP ID
+		 * values 0 - 0xFFFF, while a non-atomic check can only
+		 * return 0 or 0xFFFF.
+		 */
+		if (!NAPI_GRO_CB(p)->is_atomic ||
+		    !(iph->frag_off & htons(IP_DF))) {
+			flush_id ^= NAPI_GRO_CB(p)->count;
+			flush_id = flush_id ? 0xFFFF : 0;
+		}
+
+		/* If the previous IP ID value was based on an atomic
+		 * datagram we can overwrite the value and ignore it.
+		 */
+		if (NAPI_GRO_CB(skb)->is_atomic)
+			NAPI_GRO_CB(p)->flush_id = flush_id;
+		else
+			NAPI_GRO_CB(p)->flush_id |= flush_id;
 	}
 
+	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
 	NAPI_GRO_CB(skb)->flush |= flush;
 	skb_set_network_header(skb, off);
 	/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 08dd25d835af..d1ffd55289bd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -239,7 +239,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 found:
 	/* Include the IP ID check below from the inner most IP hdr */
-	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+	flush = NAPI_GRO_CB(p)->flush;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
 		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -248,6 +248,17 @@ found:
 		flush |= *(u32 *)((u8 *)th + i) ^
 			 *(u32 *)((u8 *)th2 + i);
 
+	/* When we receive our second frame we can made a decision on if we
+	 * continue this flow as an atomic flow with a fixed ID or if we use
+	 * an incrementing ID.
+	 */
+	if (NAPI_GRO_CB(p)->flush_id != 1 ||
+	    NAPI_GRO_CB(p)->count != 1 ||
+	    !NAPI_GRO_CB(p)->is_atomic)
+		flush |= NAPI_GRO_CB(p)->flush_id;
+	else
+		NAPI_GRO_CB(p)->is_atomic = false;
+
 	mss = skb_shinfo(p)->gso_size;
 
 	flush |= (len - 1) >= mss;
@@ -316,6 +327,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 				  iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
 
+	if (NAPI_GRO_CB(skb)->is_atomic)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
 	return tcp_gro_complete(skb);
 }
 
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index b3a779393d71..061adcda65f3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -240,10 +240,14 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
 		NAPI_GRO_CB(p)->flush |= flush;
 
-		/* Clear flush_id, there's really no concept of ID in IPv6. */
-		NAPI_GRO_CB(p)->flush_id = 0;
+		/* If the previous IP ID value was based on an atomic
+		 * datagram we can overwrite the value and ignore it.
+		 */
+		if (NAPI_GRO_CB(skb)->is_atomic)
+			NAPI_GRO_CB(p)->flush_id = 0;
 	}
 
+	NAPI_GRO_CB(skb)->is_atomic = true;
 	NAPI_GRO_CB(skb)->flush |= flush;
 
 	skb_gro_postpull_rcsum(skb, iph, nlen);
-- 
cgit v1.2.3


From 802ab55adc39a06940a1b384e9fd0387fc762d7e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Sun, 10 Apr 2016 21:45:03 -0400
Subject: GSO: Support partial segmentation offload

This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  5 +++++
 include/linux/netdevice.h       |  2 ++
 include/linux/skbuff.h          |  9 +++++++--
 net/core/dev.c                  | 36 +++++++++++++++++++++++++++++++++---
 net/core/ethtool.c              |  1 +
 net/core/skbuff.c               | 29 ++++++++++++++++++++++++++++-
 net/ipv4/af_inet.c              | 20 ++++++++++++++++----
 net/ipv4/gre_offload.c          | 26 +++++++++++++++++++++-----
 net/ipv4/tcp_offload.c          | 10 ++++++++--
 net/ipv4/udp_offload.c          | 27 +++++++++++++++++++++------
 net/ipv6/ip6_offload.c          | 10 +++++++++-
 11 files changed, 151 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7cf272a4b5c8..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
 	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+	NETIF_F_GSO_PARTIAL_BIT,	/* ... Only segment inner-most L4
+					 *     in hardware and all other
+					 *     headers in software.
+					 */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
 		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL	 __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d70c521d516..a3bb534576a3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1654,6 +1654,7 @@ struct net_device {
 	netdev_features_t	vlan_features;
 	netdev_features_t	hw_enc_features;
 	netdev_features_t	mpls_features;
+	netdev_features_t	gso_partial_features;
 
 	int			ifindex;
 	int			group;
@@ -4004,6 +4005,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
 	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+	SKB_GSO_PARTIAL = 1 << 13,
+
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-	int	mac_offset;
+	union {
+		int	mac_offset;
+		int	data_offset;
+	};
 	int	encap_level;
 	__wsum	csum;
 	__u16	csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index b78b586b1856..556dd09af3b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 			return ERR_PTR(err);
 	}
 
+	/* Only report GSO partial support if it will enable us to
+	 * support segmentation on this frame without needing additional
+	 * work.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+		struct net_device *dev = skb->dev;
+
+		partial_features |= dev->features & dev->gso_partial_features;
+		if (!skb_gso_ok(skb, features | partial_features))
+			features &= ~NETIF_F_GSO_PARTIAL;
+	}
+
 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 
@@ -2834,8 +2847,17 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
 	if (gso_segs > dev->gso_max_segs)
 		return features & ~NETIF_F_GSO_MASK;
 
-	/* Make sure to clear the IPv4 ID mangling feature if
-	 * the IPv4 header has the potential to be fragmented.
+	/* Support for GSO partial features requires software
+	 * intervention before we can actually process the packets
+	 * so we need to strip support for any partial features now
+	 * and we can pull them back in after we have partially
+	 * segmented the frame.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+		features &= ~dev->gso_partial_features;
+
+	/* Make sure to clear the IPv4 ID mangling feature if the
+	 * IPv4 header has the potential to be fragmented.
 	 */
 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 		struct iphdr *iph = skb->encapsulation ?
@@ -6729,6 +6751,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
+	/* GSO partial features require GSO partial be set */
+	if ((features & dev->gso_partial_features) &&
+	    !(features & NETIF_F_GSO_PARTIAL)) {
+		netdev_dbg(dev,
+			   "Dropping partially supported GSO features since no GSO partial.\n");
+		features &= ~dev->gso_partial_features;
+	}
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	if (dev->netdev_ops->ndo_busy_poll)
 		features |= NETIF_F_BUSY_POLL;
@@ -7011,7 +7041,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* Make NETIF_F_SG inheritable to tunnel devices.
 	 */
-	dev->hw_enc_features |= NETIF_F_SG;
+	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 
 	/* Make NETIF_F_SG inheritable to MPLS.
 	 */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9494c41cc77c..e0cf20a3b3dd 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -88,6 +88,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04c2d1c8c87..4cc594cdaada 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3076,8 +3076,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	struct sk_buff *frag_skb = head_skb;
 	unsigned int offset = doffset;
 	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+	unsigned int partial_segs = 0;
 	unsigned int headroom;
-	unsigned int len;
+	unsigned int len = head_skb->len;
 	__be16 proto;
 	bool csum;
 	int sg = !!(features & NETIF_F_SG);
@@ -3094,6 +3095,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 	csum = !!can_checksum_protocol(features, proto);
 
+	/* GSO partial only requires that we trim off any excess that
+	 * doesn't fit into an MSS sized block, so take care of that
+	 * now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		partial_segs = len / mss;
+		mss *= partial_segs;
+	}
+
 	headroom = skb_headroom(head_skb);
 	pos = skb_headlen(head_skb);
 
@@ -3281,6 +3291,23 @@ perform_csum_check:
 	 */
 	segs->prev = tail;
 
+	/* Update GSO info on first skb in partial sequence. */
+	if (partial_segs) {
+		int type = skb_shinfo(head_skb)->gso_type;
+
+		/* Update type to add partial and then remove dodgy if set */
+		type |= SKB_GSO_PARTIAL;
+		type &= ~SKB_GSO_DODGY;
+
+		/* Update GSO info and prepare to start updating headers on
+		 * our way back down the stack of protocols.
+		 */
+		skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+		skb_shinfo(segs)->gso_segs = partial_segs;
+		skb_shinfo(segs)->gso_type = type;
+		SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+	}
+
 	/* Following permits correct backpressure, for protocols
 	 * using skb_set_owner_w().
 	 * Idea is to tranfert ownership from head_skb to last segment.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8564cab96189..2e6e65fc4d20 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1200,7 +1200,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	const struct net_offload *ops;
 	unsigned int offset = 0;
 	struct iphdr *iph;
-	int proto;
+	int proto, tot_len;
 	int nhoff;
 	int ihl;
 	int id;
@@ -1219,6 +1219,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TCP_FIXEDID |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -1273,10 +1274,21 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else if (!fixedid) {
-			iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
+		} else if (skb_is_gso(skb)) {
+			if (!fixedid) {
+				iph->id = htons(id);
+				id += skb_shinfo(skb)->gso_segs;
+			}
+			tot_len = skb_shinfo(skb)->gso_size +
+				  SKB_GSO_CB(skb)->data_offset +
+				  skb->head - (unsigned char *)iph;
+		} else {
+			if (!fixedid)
+				iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
 		}
-		iph->tot_len = htons(skb->len - nhoff);
+		iph->tot_len = htons(tot_len);
 		ip_send_check(iph);
 		if (encap)
 			skb_reset_inner_headers(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6376b0cdf693..20557f211408 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
 				  SKB_GSO_IPIP |
-				  SKB_GSO_SIT)))
+				  SKB_GSO_SIT |
+				  SKB_GSO_PARTIAL)))
 		goto out;
 
 	if (!skb->encapsulation)
@@ -87,7 +88,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		struct gre_base_hdr *greh;
-		__be32 *pcsum;
+		__sum16 *pcsum;
 
 		/* Set up inner headers if we are offloading inner checksum */
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -107,10 +108,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 			continue;
 
 		greh = (struct gre_base_hdr *)skb_transport_header(skb);
-		pcsum = (__be32 *)(greh + 1);
+		pcsum = (__sum16 *)(greh + 1);
+
+		if (skb_is_gso(skb)) {
+			unsigned int partial_adj;
+
+			/* Adjust checksum to account for the fact that
+			 * the partial checksum is based on actual size
+			 * whereas headers should be based on MSS size.
+			 */
+			partial_adj = skb->len + skb_headroom(skb) -
+				      SKB_GSO_CB(skb)->data_offset -
+				      skb_shinfo(skb)->gso_size;
+			*pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+		} else {
+			*pcsum = 0;
+		}
 
-		*pcsum = 0;
-		*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+		*(pcsum + 1) = 0;
+		*pcsum = gso_make_checksum(skb, 0);
 	} while ((skb = skb->next));
 out:
 	return segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index d1ffd55289bd..02737b607aa7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -109,6 +109,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	/* GSO partial only requires splitting the frame into an MSS
+	 * multiple and possibly a remainder.  So update the mss now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL)
+		mss = skb->len - (skb->len % mss);
+
 	copy_destructor = gso_skb->destructor == tcp_wfree;
 	ooo_okay = gso_skb->ooo_okay;
 	/* All segments but the first should have ooo_okay cleared */
@@ -133,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
 					       (__force u32)delta));
 
-	do {
+	while (skb->next) {
 		th->fin = th->psh = 0;
 		th->check = newcheck;
 
@@ -153,7 +159,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 		th->seq = htonl(seq);
 		th->cwr = 0;
-	} while (skb->next);
+	}
 
 	/* Following permits TCP Small Queues to work well with GSO :
 	 * The callback to TCP stack will be called at the time last frag
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6230cf4b0d2d..097060def7f0 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -39,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	 * 16 bit length field due to the header being added outside of an
 	 * IP or IPv6 frame that was already limited to 64K - 1.
 	 */
-	partial = csum_sub(csum_unfold(uh->check),
-			   (__force __wsum)htonl(skb->len));
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+		partial = (__force __wsum)uh->len;
+	else
+		partial = (__force __wsum)htonl(skb->len);
+	partial = csum_sub(csum_unfold(uh->check), partial);
 
 	/* setup inner skb. */
 	skb->encapsulation = 0;
@@ -89,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
 	do {
-		__be16 len;
+		unsigned int len;
 
 		if (remcsum)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -107,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb_reset_mac_header(skb);
 		skb_set_network_header(skb, mac_len);
 		skb_set_transport_header(skb, udp_offset);
-		len = htons(skb->len - udp_offset);
+		len = skb->len - udp_offset;
 		uh = udp_hdr(skb);
-		uh->len = len;
+
+		/* If we are only performing partial GSO the inner header
+		 * will be using a length value equal to only one MSS sized
+		 * segment instead of the entire frame.
+		 */
+		if (skb_is_gso(skb)) {
+			uh->len = htons(skb_shinfo(skb)->gso_size +
+					SKB_GSO_CB(skb)->data_offset +
+					skb->head - (unsigned char *)uh);
+		} else {
+			uh->len = htons(len);
+		}
 
 		if (!need_csum)
 			continue;
 
-		uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+		uh->check = ~csum_fold(csum_add(partial,
+				       (__force __wsum)htonl(len)));
 
 		if (skb->encapsulation || !offload_csum) {
 			uh->check = gso_make_checksum(skb, ~uh->check);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 061adcda65f3..f5eb184e1093 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,6 +63,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int proto;
 	struct frag_hdr *fptr;
 	unsigned int unfrag_ip6hlen;
+	unsigned int payload_len;
 	u8 *prevhdr;
 	int offset = 0;
 	bool encap, udpfrag;
@@ -82,6 +83,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -118,7 +120,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	for (skb = segs; skb; skb = skb->next) {
 		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-		ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+		if (skb_is_gso(skb))
+			payload_len = skb_shinfo(skb)->gso_size +
+				      SKB_GSO_CB(skb)->data_offset +
+				      skb->head - (unsigned char *)(ipv6h + 1);
+		else
+			payload_len = skb->len - nhoff - sizeof(*ipv6h);
+		ipv6h->payload_len = htons(payload_len);
 		skb->network_header = (u8 *)ipv6h - skb->head;
 
 		if (udpfrag) {
-- 
cgit v1.2.3


From 435faee1aae9c1ac231f89e4faf0437bfe29f425 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:51 +0200
Subject: bpf, verifier: add ARG_PTR_TO_RAW_STACK type

When passing buffers from eBPF stack space into a helper function, we have
ARG_PTR_TO_STACK argument type for helpers available. The verifier makes sure
that such buffers are initialized, within boundaries, etc.

However, the downside with this is that we have a couple of helper functions
such as bpf_skb_load_bytes() that fill out the passed buffer in the expected
success case anyway, so zero initializing them prior to the helper call is
unneeded/wasted instructions in the eBPF program that can be avoided.

Therefore, add a new helper function argument type called ARG_PTR_TO_RAW_STACK.
The idea is to skip the STACK_MISC check in check_stack_boundary() and color
the related stack slots as STACK_MISC after we checked all call arguments.

Helper functions using ARG_PTR_TO_RAW_STACK must make sure that every path of
the helper function will fill the provided buffer area, so that we cannot leak
any uninitialized stack memory. This f.e. means that error paths need to
memset() the buffers, but the expected fast-path doesn't have to do this
anymore.

Since there's no such helper needing more than at most one ARG_PTR_TO_RAW_STACK
argument, we can keep it simple and don't need to check for multiple areas.
Should in future such a use-case really appear, we have check_raw_mode() that
will make sure we implement support for it first.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  5 +++++
 kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b2365a6eba3d..5fb3c610fa96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ enum bpf_arg_type {
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
+	ARG_PTR_TO_RAW_STACK,	/* any pointer to eBPF program stack, area does not
+				 * need to be initialized, helper function must fill
+				 * all bytes or clear them in error case.
+				 */
+
 	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
 	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 202f8f738542..9c843a5417da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -207,6 +207,9 @@ struct verifier_env {
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
+	bool raw_mode;
+	int regno;
+	int access_size;
 };
 
 /* verbose verifier prints what it's seeing
@@ -789,7 +792,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
  * and all elements of stack are initialized
  */
 static int check_stack_boundary(struct verifier_env *env, int regno,
-				int access_size, bool zero_size_allowed)
+				int access_size, bool zero_size_allowed,
+				struct bpf_call_arg_meta *meta)
 {
 	struct verifier_state *state = &env->cur_state;
 	struct reg_state *regs = state->regs;
@@ -815,6 +819,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (meta && meta->raw_mode) {
+		meta->access_size = access_size;
+		meta->regno = regno;
+		return 0;
+	}
+
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -859,7 +869,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_MAP;
 	} else if (arg_type == ARG_PTR_TO_CTX) {
 		expected_type = PTR_TO_CTX;
-	} else if (arg_type == ARG_PTR_TO_STACK) {
+	} else if (arg_type == ARG_PTR_TO_STACK ||
+		   arg_type == ARG_PTR_TO_RAW_STACK) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -867,6 +878,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		 */
 		if (reg->type == CONST_IMM && reg->imm == 0)
 			expected_type = CONST_IMM;
+		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -896,7 +908,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
-					   false);
+					   false, NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -907,7 +919,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno,
-					   meta->map_ptr->value_size, false);
+					   meta->map_ptr->value_size,
+					   false, NULL);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -922,7 +935,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno - 1, reg->imm,
-					   zero_size_allowed);
+					   zero_size_allowed, meta);
 	}
 
 	return err;
@@ -953,6 +966,24 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	return 0;
 }
 
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+
+	return count > 1 ? -EINVAL : 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -984,6 +1015,15 @@ static int check_call(struct verifier_env *env, int func_id)
 
 	memset(&meta, 0, sizeof(meta));
 
+	/* We only support one arg being in raw mode at the moment, which
+	 * is sufficient for the helper functions we have right now.
+	 */
+	err = check_raw_mode(fn);
+	if (err) {
+		verbose("kernel subsystem misconfigured func %d\n", func_id);
+		return err;
+	}
+
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -1001,6 +1041,15 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	/* Mark slots with STACK_MISC in case of raw mode, stack offset
+	 * is inferred from register state.
+	 */
+	for (i = 0; i < meta.access_size; i++) {
+		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		if (err)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		reg = regs + caller_saved[i];
-- 
cgit v1.2.3


From 58bc67fc32b1c67fb045f4828a67134dc8fee631 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Fri, 10 Jul 2015 15:23:28 +0300
Subject: ARM: OMAP2+: gpmc: Add platform data

Add a platform data structure for GPMC. It contains all the necessary
platform information that needs to be passed from platform init code
to GPMC driver.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 include/linux/omap-gpmc.h               |  3 +--
 include/linux/platform_data/gpmc-omap.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/platform_data/gpmc-omap.h

(limited to 'include/linux')

diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h
index d833eb4dd446..45d9075be1e5 100644
--- a/include/linux/omap-gpmc.h
+++ b/include/linux/omap-gpmc.h
@@ -7,8 +7,7 @@
  *  option) any later version.
  */
 
-/* Maximum Number of Chip Selects */
-#define GPMC_CS_NUM		8
+#include <linux/platform_data/gpmc-omap.h>
 
 #define GPMC_CONFIG_WP		0x00000005
 
diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h
new file mode 100644
index 000000000000..6804a8b387d7
--- /dev/null
+++ b/include/linux/platform_data/gpmc-omap.h
@@ -0,0 +1,30 @@
+/*
+ * OMAP GPMC Platform data
+ *
+ * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com
+ *	Roger Quadros <rogerq@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef _GPMC_OMAP_H_
+#define _GPMC_OMAP_H_
+
+/* Maximum Number of Chip Selects */
+#define GPMC_CS_NUM		8
+
+/* Data for each chip select */
+struct gpmc_omap_cs_data {
+	bool valid;			/* data is valid */
+	bool is_nand;			/* device within this CS is NAND */
+	struct platform_device *pdev;	/* device within this CS region */
+	unsigned int pdata_size;
+};
+
+struct gpmc_omap_platform_data {
+	struct gpmc_omap_cs_data cs[GPMC_CS_NUM];
+};
+
+#endif /* _GPMC_OMAP_H */
-- 
cgit v1.2.3


From fabe7d7756d17f5da4bd80fa2373c4bd93ed39e5 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Fri, 10 Jul 2015 15:23:29 +0300
Subject: ARM: OMAP2+: gpmc: Add gpmc timings and settings to platform data

Add device_timings, gpmc_timings and gpmc_setting to
gpmc platform data.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 include/linux/omap-gpmc.h               | 139 -------------------------------
 include/linux/platform_data/gpmc-omap.h | 142 ++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 139 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h
index 45d9075be1e5..2dcef1c8c8d4 100644
--- a/include/linux/omap-gpmc.h
+++ b/include/linux/omap-gpmc.h
@@ -14,145 +14,6 @@
 #define GPMC_IRQ_FIFOEVENTENABLE	0x01
 #define GPMC_IRQ_COUNT_EVENT		0x02
 
-#define GPMC_BURST_4			4	/* 4 word burst */
-#define GPMC_BURST_8			8	/* 8 word burst */
-#define GPMC_BURST_16			16	/* 16 word burst */
-#define GPMC_DEVWIDTH_8BIT		1	/* 8-bit device width */
-#define GPMC_DEVWIDTH_16BIT		2	/* 16-bit device width */
-#define GPMC_MUX_AAD			1	/* Addr-Addr-Data multiplex */
-#define GPMC_MUX_AD			2	/* Addr-Data multiplex */
-
-/* bool type time settings */
-struct gpmc_bool_timings {
-	bool cycle2cyclediffcsen;
-	bool cycle2cyclesamecsen;
-	bool we_extra_delay;
-	bool oe_extra_delay;
-	bool adv_extra_delay;
-	bool cs_extra_delay;
-	bool time_para_granularity;
-};
-
-/*
- * Note that all values in this struct are in nanoseconds except sync_clk
- * (which is in picoseconds), while the register values are in gpmc_fck cycles.
- */
-struct gpmc_timings {
-	/* Minimum clock period for synchronous mode (in picoseconds) */
-	u32 sync_clk;
-
-	/* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
-	u32 cs_on;		/* Assertion time */
-	u32 cs_rd_off;		/* Read deassertion time */
-	u32 cs_wr_off;		/* Write deassertion time */
-
-	/* ADV signal timings corresponding to GPMC_CONFIG3 */
-	u32 adv_on;		/* Assertion time */
-	u32 adv_rd_off;		/* Read deassertion time */
-	u32 adv_wr_off;		/* Write deassertion time */
-	u32 adv_aad_mux_on;	/* ADV assertion time for AAD */
-	u32 adv_aad_mux_rd_off;	/* ADV read deassertion time for AAD */
-	u32 adv_aad_mux_wr_off;	/* ADV write deassertion time for AAD */
-
-	/* WE signals timings corresponding to GPMC_CONFIG4 */
-	u32 we_on;		/* WE assertion time */
-	u32 we_off;		/* WE deassertion time */
-
-	/* OE signals timings corresponding to GPMC_CONFIG4 */
-	u32 oe_on;		/* OE assertion time */
-	u32 oe_off;		/* OE deassertion time */
-	u32 oe_aad_mux_on;	/* OE assertion time for AAD */
-	u32 oe_aad_mux_off;	/* OE deassertion time for AAD */
-
-	/* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
-	u32 page_burst_access;	/* Multiple access word delay */
-	u32 access;		/* Start-cycle to first data valid delay */
-	u32 rd_cycle;		/* Total read cycle time */
-	u32 wr_cycle;		/* Total write cycle time */
-
-	u32 bus_turnaround;
-	u32 cycle2cycle_delay;
-
-	u32 wait_monitoring;
-	u32 clk_activation;
-
-	/* The following are only on OMAP3430 */
-	u32 wr_access;		/* WRACCESSTIME */
-	u32 wr_data_mux_bus;	/* WRDATAONADMUXBUS */
-
-	struct gpmc_bool_timings bool_timings;
-};
-
-/* Device timings in picoseconds */
-struct gpmc_device_timings {
-	u32 t_ceasu;	/* address setup to CS valid */
-	u32 t_avdasu;	/* address setup to ADV valid */
-	/* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
-	 * of tusb using these timings even for sync whilst
-	 * ideally for adv_rd/(wr)_off it should have considered
-	 * t_avdh instead. This indirectly necessitates r/w
-	 * variations of t_avdp as it is possible to have one
-	 * sync & other async
-	 */
-	u32 t_avdp_r;	/* ADV low time (what about t_cer ?) */
-	u32 t_avdp_w;
-	u32 t_aavdh;	/* address hold time */
-	u32 t_oeasu;	/* address setup to OE valid */
-	u32 t_aa;	/* access time from ADV assertion */
-	u32 t_iaa;	/* initial access time */
-	u32 t_oe;	/* access time from OE assertion */
-	u32 t_ce;	/* access time from CS asertion */
-	u32 t_rd_cycle;	/* read cycle time */
-	u32 t_cez_r;	/* read CS deassertion to high Z */
-	u32 t_cez_w;	/* write CS deassertion to high Z */
-	u32 t_oez;	/* OE deassertion to high Z */
-	u32 t_weasu;	/* address setup to WE valid */
-	u32 t_wpl;	/* write assertion time */
-	u32 t_wph;	/* write deassertion time */
-	u32 t_wr_cycle;	/* write cycle time */
-
-	u32 clk;
-	u32 t_bacc;	/* burst access valid clock to output delay */
-	u32 t_ces;	/* CS setup time to clk */
-	u32 t_avds;	/* ADV setup time to clk */
-	u32 t_avdh;	/* ADV hold time from clk */
-	u32 t_ach;	/* address hold time from clk */
-	u32 t_rdyo;	/* clk to ready valid */
-
-	u32 t_ce_rdyz;	/* XXX: description ?, or use t_cez instead */
-	u32 t_ce_avd;	/* CS on to ADV on delay */
-
-	/* XXX: check the possibility of combining
-	 * cyc_aavhd_oe & cyc_aavdh_we
-	 */
-	u8 cyc_aavdh_oe;/* read address hold time in cycles */
-	u8 cyc_aavdh_we;/* write address hold time in cycles */
-	u8 cyc_oe;	/* access time from OE assertion in cycles */
-	u8 cyc_wpl;	/* write deassertion time in cycles */
-	u32 cyc_iaa;	/* initial access time in cycles */
-
-	/* extra delays */
-	bool ce_xdelay;
-	bool avd_xdelay;
-	bool oe_xdelay;
-	bool we_xdelay;
-};
-
-struct gpmc_settings {
-	bool burst_wrap;	/* enables wrap bursting */
-	bool burst_read;	/* enables read page/burst mode */
-	bool burst_write;	/* enables write page/burst mode */
-	bool device_nand;	/* device is NAND */
-	bool sync_read;		/* enables synchronous reads */
-	bool sync_write;	/* enables synchronous writes */
-	bool wait_on_read;	/* monitor wait on reads */
-	bool wait_on_write;	/* monitor wait on writes */
-	u32 burst_len;		/* page/burst length */
-	u32 device_width;	/* device bus width (8 or 16 bit) */
-	u32 mux_add_data;	/* multiplex address & data */
-	u32 wait_pin;		/* wait-pin to be used */
-};
-
 extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t,
 			     struct gpmc_settings *gpmc_s,
 			     struct gpmc_device_timings *dev_t);
diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h
index 6804a8b387d7..67ccdb0e1606 100644
--- a/include/linux/platform_data/gpmc-omap.h
+++ b/include/linux/platform_data/gpmc-omap.h
@@ -15,10 +15,152 @@
 /* Maximum Number of Chip Selects */
 #define GPMC_CS_NUM		8
 
+/* bool type time settings */
+struct gpmc_bool_timings {
+	bool cycle2cyclediffcsen;
+	bool cycle2cyclesamecsen;
+	bool we_extra_delay;
+	bool oe_extra_delay;
+	bool adv_extra_delay;
+	bool cs_extra_delay;
+	bool time_para_granularity;
+};
+
+/*
+ * Note that all values in this struct are in nanoseconds except sync_clk
+ * (which is in picoseconds), while the register values are in gpmc_fck cycles.
+ */
+struct gpmc_timings {
+	/* Minimum clock period for synchronous mode (in picoseconds) */
+	u32 sync_clk;
+
+	/* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
+	u32 cs_on;		/* Assertion time */
+	u32 cs_rd_off;		/* Read deassertion time */
+	u32 cs_wr_off;		/* Write deassertion time */
+
+	/* ADV signal timings corresponding to GPMC_CONFIG3 */
+	u32 adv_on;		/* Assertion time */
+	u32 adv_rd_off;		/* Read deassertion time */
+	u32 adv_wr_off;		/* Write deassertion time */
+	u32 adv_aad_mux_on;	/* ADV assertion time for AAD */
+	u32 adv_aad_mux_rd_off;	/* ADV read deassertion time for AAD */
+	u32 adv_aad_mux_wr_off;	/* ADV write deassertion time for AAD */
+
+	/* WE signals timings corresponding to GPMC_CONFIG4 */
+	u32 we_on;		/* WE assertion time */
+	u32 we_off;		/* WE deassertion time */
+
+	/* OE signals timings corresponding to GPMC_CONFIG4 */
+	u32 oe_on;		/* OE assertion time */
+	u32 oe_off;		/* OE deassertion time */
+	u32 oe_aad_mux_on;	/* OE assertion time for AAD */
+	u32 oe_aad_mux_off;	/* OE deassertion time for AAD */
+
+	/* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
+	u32 page_burst_access;	/* Multiple access word delay */
+	u32 access;		/* Start-cycle to first data valid delay */
+	u32 rd_cycle;		/* Total read cycle time */
+	u32 wr_cycle;		/* Total write cycle time */
+
+	u32 bus_turnaround;
+	u32 cycle2cycle_delay;
+
+	u32 wait_monitoring;
+	u32 clk_activation;
+
+	/* The following are only on OMAP3430 */
+	u32 wr_access;		/* WRACCESSTIME */
+	u32 wr_data_mux_bus;	/* WRDATAONADMUXBUS */
+
+	struct gpmc_bool_timings bool_timings;
+};
+
+/* Device timings in picoseconds */
+struct gpmc_device_timings {
+	u32 t_ceasu;	/* address setup to CS valid */
+	u32 t_avdasu;	/* address setup to ADV valid */
+	/* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
+	 * of tusb using these timings even for sync whilst
+	 * ideally for adv_rd/(wr)_off it should have considered
+	 * t_avdh instead. This indirectly necessitates r/w
+	 * variations of t_avdp as it is possible to have one
+	 * sync & other async
+	 */
+	u32 t_avdp_r;	/* ADV low time (what about t_cer ?) */
+	u32 t_avdp_w;
+	u32 t_aavdh;	/* address hold time */
+	u32 t_oeasu;	/* address setup to OE valid */
+	u32 t_aa;	/* access time from ADV assertion */
+	u32 t_iaa;	/* initial access time */
+	u32 t_oe;	/* access time from OE assertion */
+	u32 t_ce;	/* access time from CS asertion */
+	u32 t_rd_cycle;	/* read cycle time */
+	u32 t_cez_r;	/* read CS deassertion to high Z */
+	u32 t_cez_w;	/* write CS deassertion to high Z */
+	u32 t_oez;	/* OE deassertion to high Z */
+	u32 t_weasu;	/* address setup to WE valid */
+	u32 t_wpl;	/* write assertion time */
+	u32 t_wph;	/* write deassertion time */
+	u32 t_wr_cycle;	/* write cycle time */
+
+	u32 clk;
+	u32 t_bacc;	/* burst access valid clock to output delay */
+	u32 t_ces;	/* CS setup time to clk */
+	u32 t_avds;	/* ADV setup time to clk */
+	u32 t_avdh;	/* ADV hold time from clk */
+	u32 t_ach;	/* address hold time from clk */
+	u32 t_rdyo;	/* clk to ready valid */
+
+	u32 t_ce_rdyz;	/* XXX: description ?, or use t_cez instead */
+	u32 t_ce_avd;	/* CS on to ADV on delay */
+
+	/* XXX: check the possibility of combining
+	 * cyc_aavhd_oe & cyc_aavdh_we
+	 */
+	u8 cyc_aavdh_oe;/* read address hold time in cycles */
+	u8 cyc_aavdh_we;/* write address hold time in cycles */
+	u8 cyc_oe;	/* access time from OE assertion in cycles */
+	u8 cyc_wpl;	/* write deassertion time in cycles */
+	u32 cyc_iaa;	/* initial access time in cycles */
+
+	/* extra delays */
+	bool ce_xdelay;
+	bool avd_xdelay;
+	bool oe_xdelay;
+	bool we_xdelay;
+};
+
+#define GPMC_BURST_4			4	/* 4 word burst */
+#define GPMC_BURST_8			8	/* 8 word burst */
+#define GPMC_BURST_16			16	/* 16 word burst */
+#define GPMC_DEVWIDTH_8BIT		1	/* 8-bit device width */
+#define GPMC_DEVWIDTH_16BIT		2	/* 16-bit device width */
+#define GPMC_MUX_AAD			1	/* Addr-Addr-Data multiplex */
+#define GPMC_MUX_AD			2	/* Addr-Data multiplex */
+
+struct gpmc_settings {
+	bool burst_wrap;	/* enables wrap bursting */
+	bool burst_read;	/* enables read page/burst mode */
+	bool burst_write;	/* enables write page/burst mode */
+	bool device_nand;	/* device is NAND */
+	bool sync_read;		/* enables synchronous reads */
+	bool sync_write;	/* enables synchronous writes */
+	bool wait_on_read;	/* monitor wait on reads */
+	bool wait_on_write;	/* monitor wait on writes */
+	u32 burst_len;		/* page/burst length */
+	u32 device_width;	/* device bus width (8 or 16 bit) */
+	u32 mux_add_data;	/* multiplex address & data */
+	u32 wait_pin;		/* wait-pin to be used */
+};
+
 /* Data for each chip select */
 struct gpmc_omap_cs_data {
 	bool valid;			/* data is valid */
 	bool is_nand;			/* device within this CS is NAND */
+	struct gpmc_settings *settings;
+	struct gpmc_device_timings *device_timings;
+	struct gpmc_timings *gpmc_timings;
 	struct platform_device *pdev;	/* device within this CS region */
 	unsigned int pdata_size;
 };
-- 
cgit v1.2.3


From f47fcad63f6847ea677c6c7030f30fd6438e0052 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 5 Aug 2015 13:58:01 +0300
Subject: memory: omap-gpmc: Introduce GPMC to NAND interface

The OMAP GPMC module has certain registers dedicated for NAND
access and some NAND bits mixed with other GPMC functionality.

For the NAND dedicated registers we have the struct gpmc_nand_regs.

The NAND driver needs to access NAND specific bits from the
following non-dedicated registers
- EMPTYWRITEBUFFERSTATUS from GPMC_STATUS

For accessing these bits we introduce the struct gpmc_nand_ops.

Add gpmc_omap_get_nand_ops() that returns the gpmc_nand_ops along
with updating the gpmc_nand_regs. This API will be called by the
OMAP NAND driver to access the necessary bits in GPMC register space.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 drivers/memory/omap-gpmc.c | 21 +++++++++++++++++++++
 include/linux/omap-gpmc.h  | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c
index 21825ddce4a3..0b62afd86f7e 100644
--- a/drivers/memory/omap-gpmc.c
+++ b/drivers/memory/omap-gpmc.c
@@ -1118,6 +1118,27 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
 	}
 }
 
+static struct gpmc_nand_ops nand_ops;
+
+/**
+ * gpmc_omap_get_nand_ops - Get the GPMC NAND interface
+ * @regs: the GPMC NAND register map exclusive for NAND use.
+ * @cs: GPMC chip select number on which the NAND sits. The
+ *      register map returned will be specific to this chip select.
+ *
+ * Returns NULL on error e.g. invalid cs.
+ */
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs)
+{
+	if (cs >= gpmc_cs_num)
+		return NULL;
+
+	gpmc_update_nand_reg(reg, cs);
+
+	return &nand_ops;
+}
+EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops);
+
 int gpmc_get_client_irq(unsigned irq_config)
 {
 	int i;
diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h
index 2dcef1c8c8d4..dc2ada6fb9b4 100644
--- a/include/linux/omap-gpmc.h
+++ b/include/linux/omap-gpmc.h
@@ -14,14 +14,45 @@
 #define GPMC_IRQ_FIFOEVENTENABLE	0x01
 #define GPMC_IRQ_COUNT_EVENT		0x02
 
+/**
+ * gpmc_nand_ops - Interface between NAND and GPMC
+ * @nand_write_buffer_empty: get the NAND write buffer empty status.
+ */
+struct gpmc_nand_ops {
+	bool (*nand_writebuffer_empty)(void);
+};
+
+struct gpmc_nand_regs;
+
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+					     int cs);
+#else
+static inline gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+						    int cs)
+{
+	return NULL;
+}
+#endif /* CONFIG_OMAP_GPMC */
+
+/*--------------------------------*/
+
+/* deprecated APIs */
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
+#else
+static inline void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
+{
+}
+#endif /* CONFIG_OMAP_GPMC */
+/*--------------------------------*/
+
 extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t,
 			     struct gpmc_settings *gpmc_s,
 			     struct gpmc_device_timings *dev_t);
 
-struct gpmc_nand_regs;
 struct device_node;
 
-extern void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
 extern int gpmc_get_client_irq(unsigned irq_config);
 
 extern unsigned int gpmc_ticks_to_ns(unsigned int ticks);
-- 
cgit v1.2.3


From 384258f252727c67772bbd48dad3185a30ba50d3 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Thu, 30 Jul 2015 14:49:23 +0300
Subject: memory: omap-gpmc: Implement IRQ domain for NAND IRQs

GPMC provides 2 interrupts for NAND use. i.e. fifoevent and termcount.
Use IRQ domain for this. NAND device tree node can then
get the necessary interrupts by using gpmc as the interrupt parent.

Legacy boot uses gpmc_get_client_irq to get the
NAND interrupts from the GPMC IRQ domain.
Get rid of custom bitmasks and use IRQ domain for that
as well.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 Documentation/devicetree/bindings/bus/ti-gpmc.txt |   8 +
 drivers/memory/omap-gpmc.c                        | 246 ++++++++++++----------
 include/linux/omap-gpmc.h                         |   5 +-
 3 files changed, 144 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/bus/ti-gpmc.txt b/Documentation/devicetree/bindings/bus/ti-gpmc.txt
index 01683707060b..13f13786f992 100644
--- a/Documentation/devicetree/bindings/bus/ti-gpmc.txt
+++ b/Documentation/devicetree/bindings/bus/ti-gpmc.txt
@@ -32,6 +32,12 @@ Required properties:
 			bootloader) are used for the physical address decoding.
 			As this will change in the future, filling correct
 			values here is a requirement.
+ - interrupt-controller: The GPMC driver implements and interrupt controller for
+			the NAND events "fifoevent" and "termcount".
+			The interrupt number mapping is as follows
+			0 - NAND_fifoevent
+			1 - NAND_termcount
+ - interrupt-cells:	Must be set to 2
 
 Timing properties for child nodes. All are optional and default to 0.
 
@@ -130,6 +136,8 @@ Example for an AM33xx board:
 		#address-cells = <2>;
 		#size-cells = <1>;
 		ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */
+		interrupt-controller;
+		#interrupt-cells = <2>;
 
 		/* child nodes go here */
 	};
diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c
index 90dfba5a8f55..e28d6bc2500a 100644
--- a/drivers/memory/omap-gpmc.c
+++ b/drivers/memory/omap-gpmc.c
@@ -22,6 +22,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
@@ -127,7 +128,6 @@
 #define GPMC_CONFIG_RDY_BSY	0x00000001
 #define GPMC_CONFIG_DEV_SIZE	0x00000002
 #define GPMC_CONFIG_DEV_TYPE	0x00000003
-#define GPMC_SET_IRQ_STATUS	0x00000004
 
 #define GPMC_CONFIG1_WRAPBURST_SUPP     (1 << 31)
 #define GPMC_CONFIG1_READMULTIPLE_SUPP  (1 << 30)
@@ -176,8 +176,6 @@
 #define GPMC_CONFIG_WRITEPROTECT	0x00000010
 #define WR_RD_PIN_MONITORING		0x00600000
 
-#define GPMC_ENABLE_IRQ		0x0000000d
-
 /* ECC commands */
 #define GPMC_ECC_READ		0 /* Reset Hardware ECC for read */
 #define GPMC_ECC_WRITE		1 /* Reset Hardware ECC for write */
@@ -201,11 +199,6 @@ struct gpmc_cs_data {
 	struct resource mem;
 };
 
-struct gpmc_client_irq	{
-	unsigned		irq;
-	u32			bitmask;
-};
-
 /* Structure to save gpmc cs context */
 struct gpmc_cs_config {
 	u32 config1;
@@ -233,9 +226,13 @@ struct omap3_gpmc_regs {
 	struct gpmc_cs_config cs_context[GPMC_CS_NUM];
 };
 
-static struct gpmc_client_irq gpmc_client_irq[GPMC_NR_IRQ];
-static struct irq_chip gpmc_irq_chip;
-static int gpmc_irq_start;
+struct gpmc_device {
+	struct device *dev;
+	int irq;
+	struct irq_chip irq_chip;
+};
+
+static struct irq_domain *gpmc_irq_domain;
 
 static struct resource	gpmc_mem_root;
 static struct gpmc_cs_data gpmc_cs[GPMC_CS_NUM];
@@ -243,8 +240,6 @@ static DEFINE_SPINLOCK(gpmc_mem_lock);
 /* Define chip-selects as reserved by default until probe completes */
 static unsigned int gpmc_cs_num = GPMC_CS_NUM;
 static unsigned int gpmc_nr_waitpins;
-static struct device *gpmc_dev;
-static int gpmc_irq;
 static resource_size_t phys_base, mem_size;
 static unsigned gpmc_capability;
 static void __iomem *gpmc_base;
@@ -1056,14 +1051,6 @@ int gpmc_configure(int cmd, int wval)
 	u32 regval;
 
 	switch (cmd) {
-	case GPMC_ENABLE_IRQ:
-		gpmc_write_reg(GPMC_IRQENABLE, wval);
-		break;
-
-	case GPMC_SET_IRQ_STATUS:
-		gpmc_write_reg(GPMC_IRQSTATUS, wval);
-		break;
-
 	case GPMC_CONFIG_WP:
 		regval = gpmc_read_reg(GPMC_CONFIG);
 		if (wval)
@@ -1153,85 +1140,97 @@ EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops);
 
 int gpmc_get_client_irq(unsigned irq_config)
 {
-	int i;
-
-	if (hweight32(irq_config) > 1)
+	if (!gpmc_irq_domain) {
+		pr_warn("%s called before GPMC IRQ domain available\n",
+			__func__);
 		return 0;
+	}
 
-	for (i = 0; i < GPMC_NR_IRQ; i++)
-		if (gpmc_client_irq[i].bitmask & irq_config)
-			return gpmc_client_irq[i].irq;
+	if (irq_config >= GPMC_NR_IRQ)
+		return 0;
 
-	return 0;
+	return irq_create_mapping(gpmc_irq_domain, irq_config);
 }
 
-static int gpmc_irq_endis(unsigned irq, bool endis)
+static int gpmc_irq_endis(unsigned long hwirq, bool endis)
 {
-	int i;
 	u32 regval;
 
-	for (i = 0; i < GPMC_NR_IRQ; i++)
-		if (irq == gpmc_client_irq[i].irq) {
-			regval = gpmc_read_reg(GPMC_IRQENABLE);
-			if (endis)
-				regval |= gpmc_client_irq[i].bitmask;
-			else
-				regval &= ~gpmc_client_irq[i].bitmask;
-			gpmc_write_reg(GPMC_IRQENABLE, regval);
-			break;
-		}
+	regval = gpmc_read_reg(GPMC_IRQENABLE);
+	if (endis)
+		regval |= BIT(hwirq);
+	else
+		regval &= ~BIT(hwirq);
+	gpmc_write_reg(GPMC_IRQENABLE, regval);
 
 	return 0;
 }
 
 static void gpmc_irq_disable(struct irq_data *p)
 {
-	gpmc_irq_endis(p->irq, false);
+	gpmc_irq_endis(p->hwirq, false);
 }
 
 static void gpmc_irq_enable(struct irq_data *p)
 {
-	gpmc_irq_endis(p->irq, true);
+	gpmc_irq_endis(p->hwirq, true);
 }
 
 static void gpmc_irq_noop(struct irq_data *data) { }
 
 static unsigned int gpmc_irq_noop_ret(struct irq_data *data) { return 0; }
 
-static int gpmc_setup_irq(void)
+static int gpmc_irq_map(struct irq_domain *d, unsigned int virq,
+			irq_hw_number_t hw)
 {
-	int i;
+	struct gpmc_device *gpmc = d->host_data;
+
+	irq_set_chip_data(virq, gpmc);
+	irq_set_chip_and_handler(virq, &gpmc->irq_chip, handle_simple_irq);
+	irq_modify_status(virq, IRQ_NOREQUEST, IRQ_NOAUTOEN);
+
+	return 0;
+}
+
+static const struct irq_domain_ops gpmc_irq_domain_ops = {
+	.map    = gpmc_irq_map,
+	.xlate  = irq_domain_xlate_twocell,
+};
+
+static irqreturn_t gpmc_handle_irq(int irq, void *data)
+{
+	int hwirq, virq;
 	u32 regval;
+	struct gpmc_device *gpmc = data;
 
-	if (!gpmc_irq)
-		return -EINVAL;
+	regval = gpmc_read_reg(GPMC_IRQSTATUS);
 
-	gpmc_irq_start = irq_alloc_descs(-1, 0, GPMC_NR_IRQ, 0);
-	if (gpmc_irq_start < 0) {
-		pr_err("irq_alloc_descs failed\n");
-		return gpmc_irq_start;
-	}
+	if (!regval)
+		return IRQ_NONE;
 
-	gpmc_irq_chip.name = "gpmc";
-	gpmc_irq_chip.irq_startup = gpmc_irq_noop_ret;
-	gpmc_irq_chip.irq_enable = gpmc_irq_enable;
-	gpmc_irq_chip.irq_disable = gpmc_irq_disable;
-	gpmc_irq_chip.irq_shutdown = gpmc_irq_noop;
-	gpmc_irq_chip.irq_ack = gpmc_irq_noop;
-	gpmc_irq_chip.irq_mask = gpmc_irq_noop;
-	gpmc_irq_chip.irq_unmask = gpmc_irq_noop;
-
-	gpmc_client_irq[0].bitmask = GPMC_IRQ_FIFOEVENTENABLE;
-	gpmc_client_irq[1].bitmask = GPMC_IRQ_COUNT_EVENT;
-
-	for (i = 0; i < GPMC_NR_IRQ; i++) {
-		gpmc_client_irq[i].irq = gpmc_irq_start + i;
-		irq_set_chip_and_handler(gpmc_client_irq[i].irq,
-					&gpmc_irq_chip, handle_simple_irq);
-		irq_modify_status(gpmc_client_irq[i].irq, IRQ_NOREQUEST,
-				  IRQ_NOAUTOEN);
+	for (hwirq = 0; hwirq < GPMC_NR_IRQ; hwirq++) {
+		if (regval & BIT(hwirq)) {
+			virq = irq_find_mapping(gpmc_irq_domain, hwirq);
+			if (!virq) {
+				dev_warn(gpmc->dev,
+					 "spurious irq detected hwirq %d, virq %d\n",
+					 hwirq, virq);
+			}
+
+			generic_handle_irq(virq);
+		}
 	}
 
+	gpmc_write_reg(GPMC_IRQSTATUS, regval);
+
+	return IRQ_HANDLED;
+}
+
+static int gpmc_setup_irq(struct gpmc_device *gpmc)
+{
+	u32 regval;
+	int rc;
+
 	/* Disable interrupts */
 	gpmc_write_reg(GPMC_IRQENABLE, 0);
 
@@ -1239,22 +1238,46 @@ static int gpmc_setup_irq(void)
 	regval = gpmc_read_reg(GPMC_IRQSTATUS);
 	gpmc_write_reg(GPMC_IRQSTATUS, regval);
 
-	return request_irq(gpmc_irq, gpmc_handle_irq, 0, "gpmc", NULL);
+	gpmc->irq_chip.name = "gpmc";
+	gpmc->irq_chip.irq_startup = gpmc_irq_noop_ret;
+	gpmc->irq_chip.irq_enable = gpmc_irq_enable;
+	gpmc->irq_chip.irq_disable = gpmc_irq_disable;
+	gpmc->irq_chip.irq_shutdown = gpmc_irq_noop;
+	gpmc->irq_chip.irq_ack = gpmc_irq_noop;
+	gpmc->irq_chip.irq_mask = gpmc_irq_noop;
+	gpmc->irq_chip.irq_unmask = gpmc_irq_noop;
+
+	gpmc_irq_domain = irq_domain_add_linear(gpmc->dev->of_node,
+						GPMC_NR_IRQ,
+						&gpmc_irq_domain_ops,
+						gpmc);
+	if (!gpmc_irq_domain) {
+		dev_err(gpmc->dev, "IRQ domain add failed\n");
+		return -ENODEV;
+	}
+
+	rc = request_irq(gpmc->irq, gpmc_handle_irq, 0, "gpmc", gpmc);
+	if (rc) {
+		dev_err(gpmc->dev, "failed to request irq %d: %d\n",
+			gpmc->irq, rc);
+		irq_domain_remove(gpmc_irq_domain);
+		gpmc_irq_domain = NULL;
+	}
+
+	return rc;
 }
 
-static int gpmc_free_irq(void)
+static int gpmc_free_irq(struct gpmc_device *gpmc)
 {
-	int i;
+	int hwirq;
 
-	if (gpmc_irq)
-		free_irq(gpmc_irq, NULL);
+	free_irq(gpmc->irq, gpmc);
 
-	for (i = 0; i < GPMC_NR_IRQ; i++) {
-		irq_set_handler(gpmc_client_irq[i].irq, NULL);
-		irq_set_chip(gpmc_client_irq[i].irq, &no_irq_chip);
-	}
+	for (hwirq = 0; hwirq < GPMC_NR_IRQ; hwirq++)
+		irq_dispose_mapping(irq_find_mapping(gpmc_irq_domain, hwirq));
 
-	irq_free_descs(gpmc_irq_start, GPMC_NR_IRQ);
+	irq_domain_remove(gpmc_irq_domain);
+	gpmc_irq_domain = NULL;
 
 	return 0;
 }
@@ -2154,6 +2177,14 @@ static int gpmc_probe(struct platform_device *pdev)
 	int rc;
 	u32 l;
 	struct resource *res;
+	struct gpmc_device *gpmc;
+
+	gpmc = devm_kzalloc(&pdev->dev, sizeof(*gpmc), GFP_KERNEL);
+	if (!gpmc)
+		return -ENOMEM;
+
+	gpmc->dev = &pdev->dev;
+	platform_set_drvdata(pdev, gpmc);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res == NULL)
@@ -2167,15 +2198,16 @@ static int gpmc_probe(struct platform_device *pdev)
 		return PTR_ERR(gpmc_base);
 
 	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (res == NULL)
-		dev_warn(&pdev->dev, "Failed to get resource: irq\n");
-	else
-		gpmc_irq = res->start;
+	if (!res) {
+		dev_err(&pdev->dev, "Failed to get resource: irq\n");
+		return -ENOENT;
+	}
+
+	gpmc->irq = res->start;
 
 	gpmc_l3_clk = devm_clk_get(&pdev->dev, "fck");
 	if (IS_ERR(gpmc_l3_clk)) {
 		dev_err(&pdev->dev, "Failed to get GPMC fck\n");
-		gpmc_irq = 0;
 		return PTR_ERR(gpmc_l3_clk);
 	}
 
@@ -2187,8 +2219,6 @@ static int gpmc_probe(struct platform_device *pdev)
 	pm_runtime_enable(&pdev->dev);
 	pm_runtime_get_sync(&pdev->dev);
 
-	gpmc_dev = &pdev->dev;
-
 	l = gpmc_read_reg(GPMC_REVISION);
 
 	/*
@@ -2207,13 +2237,16 @@ static int gpmc_probe(struct platform_device *pdev)
 		gpmc_capability = GPMC_HAS_WR_ACCESS | GPMC_HAS_WR_DATA_MUX_BUS;
 	if (GPMC_REVISION_MAJOR(l) > 0x5)
 		gpmc_capability |= GPMC_HAS_MUX_AAD;
-	dev_info(gpmc_dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
+	dev_info(gpmc->dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
 		 GPMC_REVISION_MINOR(l));
 
 	gpmc_mem_init();
 
-	if (gpmc_setup_irq() < 0)
-		dev_warn(gpmc_dev, "gpmc_setup_irq failed\n");
+	rc = gpmc_setup_irq(gpmc);
+	if (rc) {
+		dev_err(gpmc->dev, "gpmc_setup_irq failed\n");
+		goto fail;
+	}
 
 	if (!pdev->dev.of_node) {
 		gpmc_cs_num	 = GPMC_CS_NUM;
@@ -2222,21 +2255,27 @@ static int gpmc_probe(struct platform_device *pdev)
 
 	rc = gpmc_probe_dt(pdev);
 	if (rc < 0) {
-		pm_runtime_put_sync(&pdev->dev);
-		dev_err(gpmc_dev, "failed to probe DT parameters\n");
-		return rc;
+		dev_err(gpmc->dev, "failed to probe DT parameters\n");
+		gpmc_free_irq(gpmc);
+		goto fail;
 	}
 
 	return 0;
+
+fail:
+	pm_runtime_put_sync(&pdev->dev);
+	return rc;
 }
 
 static int gpmc_remove(struct platform_device *pdev)
 {
-	gpmc_free_irq();
+	struct gpmc_device *gpmc = platform_get_drvdata(pdev);
+
+	gpmc_free_irq(gpmc);
 	gpmc_mem_exit();
 	pm_runtime_put_sync(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
-	gpmc_dev = NULL;
+
 	return 0;
 }
 
@@ -2282,25 +2321,6 @@ static __exit void gpmc_exit(void)
 postcore_initcall(gpmc_init);
 module_exit(gpmc_exit);
 
-static irqreturn_t gpmc_handle_irq(int irq, void *dev)
-{
-	int i;
-	u32 regval;
-
-	regval = gpmc_read_reg(GPMC_IRQSTATUS);
-
-	if (!regval)
-		return IRQ_NONE;
-
-	for (i = 0; i < GPMC_NR_IRQ; i++)
-		if (regval & gpmc_client_irq[i].bitmask)
-			generic_handle_irq(gpmc_client_irq[i].irq);
-
-	gpmc_write_reg(GPMC_IRQSTATUS, regval);
-
-	return IRQ_HANDLED;
-}
-
 static struct omap3_gpmc_regs gpmc_context;
 
 void omap3_gpmc_save_context(void)
diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h
index dc2ada6fb9b4..9e9d79e8efa5 100644
--- a/include/linux/omap-gpmc.h
+++ b/include/linux/omap-gpmc.h
@@ -11,8 +11,9 @@
 
 #define GPMC_CONFIG_WP		0x00000005
 
-#define GPMC_IRQ_FIFOEVENTENABLE	0x01
-#define GPMC_IRQ_COUNT_EVENT		0x02
+/* IRQ numbers in GPMC IRQ domain for legacy boot use */
+#define GPMC_IRQ_FIFOEVENTENABLE	0
+#define GPMC_IRQ_COUNT_EVENT		1
 
 /**
  * gpmc_nand_ops - Interface between NAND and GPMC
-- 
cgit v1.2.3


From c509aefd75d026f4ef4aa306131d7a780c2eda7b Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 5 Aug 2015 14:01:50 +0300
Subject: mtd: nand: omap: Use gpmc_omap_get_nand_ops() to get NAND registers

Deprecate nand register passing via platform data and use
gpmc_omap_get_nand_ops() instead.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/gpmc-nand.c              | 2 --
 drivers/mtd/nand/omap2.c                     | 9 ++++++++-
 include/linux/platform_data/mtd-nand-omap2.h | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/gpmc-nand.c b/arch/arm/mach-omap2/gpmc-nand.c
index 72918c4973ea..04e6998c1529 100644
--- a/arch/arm/mach-omap2/gpmc-nand.c
+++ b/arch/arm/mach-omap2/gpmc-nand.c
@@ -121,8 +121,6 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
 	if (err < 0)
 		goto out_free_cs;
 
-	gpmc_update_nand_reg(&gpmc_nand_data->reg, gpmc_nand_data->cs);
-
 	if (!gpmc_hwecc_bch_capable(gpmc_nand_data->ecc_opt)) {
 		pr_err("omap2-nand: Unsupported NAND ECC scheme selected\n");
 		err = -EINVAL;
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 0749ca1a1456..cba9bf0adba1 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -28,6 +28,7 @@
 #include <linux/mtd/nand_bch.h>
 #include <linux/platform_data/elm.h>
 
+#include <linux/omap-gpmc.h>
 #include <linux/platform_data/mtd-nand-omap2.h>
 
 #define	DRIVER_NAME	"omap2-nand"
@@ -168,7 +169,9 @@ struct omap_nand_info {
 	} iomode;
 	u_char				*buf;
 	int					buf_len;
+	/* Interface to GPMC */
 	struct gpmc_nand_regs		reg;
+	struct gpmc_nand_ops		*ops;
 	/* generated at runtime depending on ECC algorithm and layout selected */
 	struct nand_ecclayout		oobinfo;
 	/* fields specific for BCHx_HW ECC scheme */
@@ -1665,9 +1668,13 @@ static int omap_nand_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, info);
 
+	info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs);
+	if (!info->ops) {
+		dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n");
+		return -ENODEV;
+	}
 	info->pdev		= pdev;
 	info->gpmc_cs		= pdata->cs;
-	info->reg		= pdata->reg;
 	info->of_node		= pdata->of_node;
 	info->ecc_opt		= pdata->ecc_opt;
 	nand_chip		= &info->nand;
diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h
index 090bbab0130a..a067f581e938 100644
--- a/include/linux/platform_data/mtd-nand-omap2.h
+++ b/include/linux/platform_data/mtd-nand-omap2.h
@@ -75,10 +75,12 @@ struct omap_nand_platform_data {
 	enum nand_io		xfer_type;
 	int			devsize;
 	enum omap_ecc           ecc_opt;
-	struct gpmc_nand_regs	reg;
 
 	/* for passing the partitions */
 	struct device_node	*of_node;
 	struct device_node	*elm_of_node;
+
+	/* deprecated */
+	struct gpmc_nand_regs	reg;
 };
 #endif
-- 
cgit v1.2.3


From c9711ec5250b22fd94e9b34c17c095e001a90e66 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 21 May 2014 07:29:03 +0300
Subject: mtd: nand: omap: Clean up device tree support

Move NAND specific device tree parsing to NAND driver.

The NAND controller node must have a compatible id, register space
resource and interrupt resource.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/gpmc-nand.c              |   5 +-
 drivers/memory/omap-gpmc.c                   | 143 +++++++--------------------
 drivers/mtd/nand/omap2.c                     | 134 +++++++++++++++++++++----
 include/linux/platform_data/mtd-nand-omap2.h |   3 +-
 4 files changed, 153 insertions(+), 132 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/gpmc-nand.c b/arch/arm/mach-omap2/gpmc-nand.c
index 04e6998c1529..f6ac027f3c3b 100644
--- a/arch/arm/mach-omap2/gpmc-nand.c
+++ b/arch/arm/mach-omap2/gpmc-nand.c
@@ -97,10 +97,7 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
 	gpmc_nand_res[2].start = gpmc_get_client_irq(GPMC_IRQ_COUNT_EVENT);
 
 	memset(&s, 0, sizeof(struct gpmc_settings));
-	if (gpmc_nand_data->of_node)
-		gpmc_read_settings_dt(gpmc_nand_data->of_node, &s);
-	else
-		gpmc_set_legacy(gpmc_nand_data, &s);
+	gpmc_set_legacy(gpmc_nand_data, &s);
 
 	s.device_nand = true;
 
diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c
index e28d6bc2500a..8dc6e3b1c44a 100644
--- a/drivers/memory/omap-gpmc.c
+++ b/drivers/memory/omap-gpmc.c
@@ -30,7 +30,6 @@
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/omap-gpmc.h>
-#include <linux/mtd/nand.h>
 #include <linux/pm_runtime.h>
 
 #include <linux/platform_data/mtd-nand-omap2.h>
@@ -1852,105 +1851,6 @@ static void __maybe_unused gpmc_read_timings_dt(struct device_node *np,
 		of_property_read_bool(np, "gpmc,time-para-granularity");
 }
 
-#if IS_ENABLED(CONFIG_MTD_NAND)
-
-static const char * const nand_xfer_types[] = {
-	[NAND_OMAP_PREFETCH_POLLED]		= "prefetch-polled",
-	[NAND_OMAP_POLLED]			= "polled",
-	[NAND_OMAP_PREFETCH_DMA]		= "prefetch-dma",
-	[NAND_OMAP_PREFETCH_IRQ]		= "prefetch-irq",
-};
-
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-				 struct device_node *child)
-{
-	u32 val;
-	const char *s;
-	struct gpmc_timings gpmc_t;
-	struct omap_nand_platform_data *gpmc_nand_data;
-
-	if (of_property_read_u32(child, "reg", &val) < 0) {
-		dev_err(&pdev->dev, "%s has no 'reg' property\n",
-			child->full_name);
-		return -ENODEV;
-	}
-
-	gpmc_nand_data = devm_kzalloc(&pdev->dev, sizeof(*gpmc_nand_data),
-				      GFP_KERNEL);
-	if (!gpmc_nand_data)
-		return -ENOMEM;
-
-	gpmc_nand_data->cs = val;
-	gpmc_nand_data->of_node = child;
-
-	/* Detect availability of ELM module */
-	gpmc_nand_data->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
-	if (gpmc_nand_data->elm_of_node == NULL)
-		gpmc_nand_data->elm_of_node =
-					of_parse_phandle(child, "elm_id", 0);
-
-	/* select ecc-scheme for NAND */
-	if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
-		pr_err("%s: ti,nand-ecc-opt not found\n", __func__);
-		return -ENODEV;
-	}
-
-	if (!strcmp(s, "sw"))
-		gpmc_nand_data->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
-	else if (!strcmp(s, "ham1") ||
-		 !strcmp(s, "hw") || !strcmp(s, "hw-romcode"))
-		gpmc_nand_data->ecc_opt =
-				OMAP_ECC_HAM1_CODE_HW;
-	else if (!strcmp(s, "bch4"))
-		if (gpmc_nand_data->elm_of_node)
-			gpmc_nand_data->ecc_opt =
-				OMAP_ECC_BCH4_CODE_HW;
-		else
-			gpmc_nand_data->ecc_opt =
-				OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
-	else if (!strcmp(s, "bch8"))
-		if (gpmc_nand_data->elm_of_node)
-			gpmc_nand_data->ecc_opt =
-				OMAP_ECC_BCH8_CODE_HW;
-		else
-			gpmc_nand_data->ecc_opt =
-				OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
-	else if (!strcmp(s, "bch16"))
-		if (gpmc_nand_data->elm_of_node)
-			gpmc_nand_data->ecc_opt =
-				OMAP_ECC_BCH16_CODE_HW;
-		else
-			pr_err("%s: BCH16 requires ELM support\n", __func__);
-	else
-		pr_err("%s: ti,nand-ecc-opt invalid value\n", __func__);
-
-	/* select data transfer mode for NAND controller */
-	if (!of_property_read_string(child, "ti,nand-xfer-type", &s))
-		for (val = 0; val < ARRAY_SIZE(nand_xfer_types); val++)
-			if (!strcasecmp(s, nand_xfer_types[val])) {
-				gpmc_nand_data->xfer_type = val;
-				break;
-			}
-
-	gpmc_nand_data->flash_bbt = of_get_nand_on_flash_bbt(child);
-
-	val = of_get_nand_bus_width(child);
-	if (val == 16)
-		gpmc_nand_data->devsize = NAND_BUSWIDTH_16;
-
-	gpmc_read_timings_dt(child, &gpmc_t);
-	gpmc_nand_init(gpmc_nand_data, &gpmc_t);
-
-	return 0;
-}
-#else
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-				 struct device_node *child)
-{
-	return 0;
-}
-#endif
-
 #if IS_ENABLED(CONFIG_MTD_ONENAND)
 static int gpmc_probe_onenand_child(struct platform_device *pdev,
 				 struct device_node *child)
@@ -2069,9 +1969,42 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
 		goto err;
 	}
 
-	ret = of_property_read_u32(child, "bank-width", &gpmc_s.device_width);
-	if (ret < 0)
-		goto err;
+	if (of_node_cmp(child->name, "nand") == 0) {
+		/* Warn about older DT blobs with no compatible property */
+		if (!of_property_read_bool(child, "compatible")) {
+			dev_warn(&pdev->dev,
+				 "Incompatible NAND node: missing compatible");
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	if (of_device_is_compatible(child, "ti,omap2-nand")) {
+		/* NAND specific setup */
+		val = of_get_nand_bus_width(child);
+		switch (val) {
+		case 8:
+			gpmc_s.device_width = GPMC_DEVWIDTH_8BIT;
+			break;
+		case 16:
+			gpmc_s.device_width = GPMC_DEVWIDTH_16BIT;
+			break;
+		default:
+			dev_err(&pdev->dev, "%s: invalid 'nand-bus-width'\n",
+				child->name);
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/* disable write protect */
+		gpmc_configure(GPMC_CONFIG_WP, 0);
+		gpmc_s.device_nand = true;
+	} else {
+		ret = of_property_read_u32(child, "bank-width",
+					   &gpmc_s.device_width);
+		if (ret < 0)
+			goto err;
+	}
 
 	gpmc_cs_show_timings(cs, "before gpmc_cs_program_settings");
 	ret = gpmc_cs_program_settings(cs, &gpmc_s);
@@ -2155,9 +2088,7 @@ static int gpmc_probe_dt(struct platform_device *pdev)
 		if (!child->name)
 			continue;
 
-		if (of_node_cmp(child->name, "nand") == 0)
-			ret = gpmc_probe_nand_child(pdev, child);
-		else if (of_node_cmp(child->name, "onenand") == 0)
+		if (of_node_cmp(child->name, "onenand") == 0)
 			ret = gpmc_probe_onenand_child(pdev, child);
 		else
 			ret = gpmc_probe_generic_child(pdev, child);
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 7e4e263c7d9c..35b8f3359c17 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/of_mtd.h>
 
 #include <linux/mtd/nand_bch.h>
 #include <linux/platform_data/elm.h>
@@ -176,11 +177,11 @@ struct omap_nand_info {
 	/* Interface to GPMC */
 	struct gpmc_nand_regs		reg;
 	struct gpmc_nand_ops		*ops;
+	bool				flash_bbt;
 	/* generated at runtime depending on ECC algorithm and layout selected */
 	struct nand_ecclayout		oobinfo;
 	/* fields specific for BCHx_HW ECC scheme */
 	struct device			*elm_dev;
-	struct device_node		*of_node;
 };
 
 static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd)
@@ -1643,10 +1644,86 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info,
 	return true;
 }
 
+static const char * const nand_xfer_types[] = {
+	[NAND_OMAP_PREFETCH_POLLED] = "prefetch-polled",
+	[NAND_OMAP_POLLED] = "polled",
+	[NAND_OMAP_PREFETCH_DMA] = "prefetch-dma",
+	[NAND_OMAP_PREFETCH_IRQ] = "prefetch-irq",
+};
+
+static int omap_get_dt_info(struct device *dev, struct omap_nand_info *info)
+{
+	struct device_node *child = dev->of_node;
+	int i;
+	const char *s;
+	u32 cs;
+
+	if (of_property_read_u32(child, "reg", &cs) < 0) {
+		dev_err(dev, "reg not found in DT\n");
+		return -EINVAL;
+	}
+
+	info->gpmc_cs = cs;
+
+	/* detect availability of ELM module. Won't be present pre-OMAP4 */
+	info->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
+	if (!info->elm_of_node)
+		dev_dbg(dev, "ti,elm-id not in DT\n");
+
+	/* select ecc-scheme for NAND */
+	if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
+		dev_err(dev, "ti,nand-ecc-opt not found\n");
+		return -EINVAL;
+	}
+
+	if (!strcmp(s, "sw")) {
+		info->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
+	} else if (!strcmp(s, "ham1") ||
+		   !strcmp(s, "hw") || !strcmp(s, "hw-romcode")) {
+		info->ecc_opt =	OMAP_ECC_HAM1_CODE_HW;
+	} else if (!strcmp(s, "bch4")) {
+		if (info->elm_of_node)
+			info->ecc_opt = OMAP_ECC_BCH4_CODE_HW;
+		else
+			info->ecc_opt = OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
+	} else if (!strcmp(s, "bch8")) {
+		if (info->elm_of_node)
+			info->ecc_opt = OMAP_ECC_BCH8_CODE_HW;
+		else
+			info->ecc_opt = OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
+	} else if (!strcmp(s, "bch16")) {
+		info->ecc_opt =	OMAP_ECC_BCH16_CODE_HW;
+	} else {
+		dev_err(dev, "unrecognized value for ti,nand-ecc-opt\n");
+		return -EINVAL;
+	}
+
+	/* select data transfer mode */
+	if (!of_property_read_string(child, "ti,nand-xfer-type", &s)) {
+		for (i = 0; i < ARRAY_SIZE(nand_xfer_types); i++) {
+			if (!strcasecmp(s, nand_xfer_types[i])) {
+				info->xfer_type = i;
+				goto next;
+			}
+		}
+
+		dev_err(dev, "unrecognized value for ti,nand-xfer-type\n");
+		return -EINVAL;
+	}
+
+next:
+	of_get_nand_on_flash_bbt(child);
+
+	if (of_get_nand_bus_width(child) == 16)
+		info->devsize = NAND_BUSWIDTH_16;
+
+	return 0;
+}
+
 static int omap_nand_probe(struct platform_device *pdev)
 {
 	struct omap_nand_info		*info;
-	struct omap_nand_platform_data	*pdata;
+	struct omap_nand_platform_data	*pdata = NULL;
 	struct mtd_info			*mtd;
 	struct nand_chip		*nand_chip;
 	struct nand_ecclayout		*ecclayout;
@@ -1656,39 +1733,47 @@ static int omap_nand_probe(struct platform_device *pdev)
 	unsigned			sig;
 	unsigned			oob_index;
 	struct resource			*res;
-
-	pdata = dev_get_platdata(&pdev->dev);
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "platform data missing\n");
-		return -ENODEV;
-	}
+	struct device			*dev = &pdev->dev;
 
 	info = devm_kzalloc(&pdev->dev, sizeof(struct omap_nand_info),
 				GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
-	platform_set_drvdata(pdev, info);
+	info->pdev = pdev;
 
+	if (dev->of_node) {
+		if (omap_get_dt_info(dev, info))
+			return -EINVAL;
+	} else {
+		pdata = dev_get_platdata(&pdev->dev);
+		if (!pdata) {
+			dev_err(&pdev->dev, "platform data missing\n");
+			return -EINVAL;
+		}
+
+		info->gpmc_cs = pdata->cs;
+		info->reg = pdata->reg;
+		info->ecc_opt = pdata->ecc_opt;
+		info->dev_ready	= pdata->dev_ready;
+		info->xfer_type = pdata->xfer_type;
+		info->devsize = pdata->devsize;
+		info->elm_of_node = pdata->elm_of_node;
+		info->flash_bbt = pdata->flash_bbt;
+	}
+
+	platform_set_drvdata(pdev, info);
 	info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs);
 	if (!info->ops) {
 		dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n");
 		return -ENODEV;
 	}
-	info->pdev		= pdev;
-	info->gpmc_cs		= pdata->cs;
-	info->of_node		= pdata->of_node;
-	info->ecc_opt		= pdata->ecc_opt;
-	info->dev_ready	= pdata->dev_ready;
-	info->xfer_type = pdata->xfer_type;
-	info->devsize = pdata->devsize;
-	info->elm_of_node = pdata->elm_of_node;
 
 	nand_chip		= &info->nand;
 	mtd			= nand_to_mtd(nand_chip);
 	mtd->dev.parent		= &pdev->dev;
 	nand_chip->ecc.priv	= NULL;
-	nand_set_flash_node(nand_chip, pdata->of_node);
+	nand_set_flash_node(nand_chip, dev->of_node);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	nand_chip->IO_ADDR_R = devm_ioremap_resource(&pdev->dev, res);
@@ -1717,7 +1802,7 @@ static int omap_nand_probe(struct platform_device *pdev)
 		nand_chip->chip_delay = 50;
 	}
 
-	if (pdata->flash_bbt)
+	if (info->flash_bbt)
 		nand_chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
 	else
 		nand_chip->options |= NAND_SKIP_BBTSCAN;
@@ -2035,7 +2120,10 @@ scan_tail:
 		goto return_error;
 	}
 
-	mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
+	if (dev->of_node)
+		mtd_device_register(mtd, NULL, 0);
+	else
+		mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
 
 	platform_set_drvdata(pdev, mtd);
 
@@ -2066,11 +2154,17 @@ static int omap_nand_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id omap_nand_ids[] = {
+	{ .compatible = "ti,omap2-nand", },
+	{},
+};
+
 static struct platform_driver omap_nand_driver = {
 	.probe		= omap_nand_probe,
 	.remove		= omap_nand_remove,
 	.driver		= {
 		.name	= DRIVER_NAME,
+		.of_match_table = of_match_ptr(omap_nand_ids),
 	},
 };
 
diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h
index a067f581e938..ff27e5a77e03 100644
--- a/include/linux/platform_data/mtd-nand-omap2.h
+++ b/include/linux/platform_data/mtd-nand-omap2.h
@@ -76,11 +76,10 @@ struct omap_nand_platform_data {
 	int			devsize;
 	enum omap_ecc           ecc_opt;
 
-	/* for passing the partitions */
-	struct device_node	*of_node;
 	struct device_node	*elm_of_node;
 
 	/* deprecated */
 	struct gpmc_nand_regs	reg;
+	struct device_node	*of_node;
 };
 #endif
-- 
cgit v1.2.3


From 9e6946215dbd9803e8b511928c9f61f3a49e2c58 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Fri, 7 Aug 2015 10:38:13 +0300
Subject: memory: omap-gpmc: Prevent GPMC_STATUS from being accessed via
 gpmc_regs

GPMC_STATUS register is private to the GPMC module and must not be
accessed directly by NAND driver through the gpmc_regs.

They must use gpmc_omap_get_nand_ops() instead.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 drivers/memory/omap-gpmc.c                   | 2 +-
 include/linux/platform_data/mtd-nand-omap2.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c
index ea9c89747950..33d69b1e4c31 100644
--- a/drivers/memory/omap-gpmc.c
+++ b/drivers/memory/omap-gpmc.c
@@ -1081,7 +1081,7 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
 {
 	int i;
 
-	reg->gpmc_status = gpmc_base + GPMC_STATUS;
+	reg->gpmc_status = NULL;	/* deprecated */
 	reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET +
 				GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs;
 	reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET +
diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h
index ff27e5a77e03..7f6de5377f80 100644
--- a/include/linux/platform_data/mtd-nand-omap2.h
+++ b/include/linux/platform_data/mtd-nand-omap2.h
@@ -45,7 +45,6 @@ enum omap_ecc {
 };
 
 struct gpmc_nand_regs {
-	void __iomem	*gpmc_status;
 	void __iomem	*gpmc_nand_command;
 	void __iomem	*gpmc_nand_address;
 	void __iomem	*gpmc_nand_data;
@@ -64,6 +63,8 @@ struct gpmc_nand_regs {
 	void __iomem	*gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER];
 	void __iomem	*gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER];
 	void __iomem	*gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER];
+	/* Deprecated. Do not use */
+	void __iomem	*gpmc_status;
 };
 
 struct omap_nand_platform_data {
-- 
cgit v1.2.3


From 10f22ee367c4aff7841da6a83c10445d7d6328d9 Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Thu, 6 Aug 2015 17:39:35 +0300
Subject: mtd: nand: omap2: Implement NAND ready using gpiolib

The GPMC WAIT pin status are now available over gpiolib.
Update the omap_dev_ready() function to use gpio instead of
directly accessing GPMC register space.

Signed-off-by: Roger Quadros <rogerq@ti.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Tony Lindgren <tony@atomide.com>
---
 .../devicetree/bindings/mtd/gpmc-nand.txt          |  2 ++
 drivers/mtd/nand/omap2.c                           | 29 ++++++++++++++--------
 include/linux/platform_data/mtd-nand-omap2.h       |  2 +-
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
index ff3215d20343..3ee7e202657c 100644
--- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
@@ -48,6 +48,7 @@ Optional properties:
 		locating ECC errors for BCHx algorithms. SoC devices which have
 		ELM hardware engines should specify this device node in .dtsi
 		Using ELM for ECC error correction frees some CPU cycles.
+ - rb-gpios:	GPIO specifier for the ready/busy# pin.
 
 For inline partition table parsing (optional):
 
@@ -78,6 +79,7 @@ Example for an AM33xx board:
 			nand-bus-width = <16>;
 			ti,nand-ecc-opt = "bch8";
 			ti,nand-xfer-type = "polled";
+			rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */
 
 			gpmc,sync-clk-ps = <0>;
 			gpmc,cs-on-ns = <0>;
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 35b8f3359c17..e0b2b2f0fbde 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -12,6 +12,7 @@
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -182,6 +183,8 @@ struct omap_nand_info {
 	struct nand_ecclayout		oobinfo;
 	/* fields specific for BCHx_HW ECC scheme */
 	struct device			*elm_dev;
+	/* NAND ready gpio */
+	struct gpio_desc		*ready_gpiod;
 };
 
 static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd)
@@ -1023,21 +1026,16 @@ static int omap_wait(struct mtd_info *mtd, struct nand_chip *chip)
 }
 
 /**
- * omap_dev_ready - calls the platform specific dev_ready function
+ * omap_dev_ready - checks the NAND Ready GPIO line
  * @mtd: MTD device structure
+ *
+ * Returns true if ready and false if busy.
  */
 static int omap_dev_ready(struct mtd_info *mtd)
 {
-	unsigned int val = 0;
 	struct omap_nand_info *info = mtd_to_omap(mtd);
 
-	val = readl(info->reg.gpmc_status);
-
-	if ((val & 0x100) == 0x100) {
-		return 1;
-	} else {
-		return 0;
-	}
+	return gpiod_get_value(info->ready_gpiod);
 }
 
 /**
@@ -1755,7 +1753,9 @@ static int omap_nand_probe(struct platform_device *pdev)
 		info->gpmc_cs = pdata->cs;
 		info->reg = pdata->reg;
 		info->ecc_opt = pdata->ecc_opt;
-		info->dev_ready	= pdata->dev_ready;
+		if (pdata->dev_ready)
+			dev_info(&pdev->dev, "pdata->dev_ready is deprecated\n");
+
 		info->xfer_type = pdata->xfer_type;
 		info->devsize = pdata->devsize;
 		info->elm_of_node = pdata->elm_of_node;
@@ -1787,6 +1787,13 @@ static int omap_nand_probe(struct platform_device *pdev)
 	nand_chip->IO_ADDR_W = nand_chip->IO_ADDR_R;
 	nand_chip->cmd_ctrl  = omap_hwcontrol;
 
+	info->ready_gpiod = devm_gpiod_get_optional(&pdev->dev, "rb",
+						    GPIOD_IN);
+	if (IS_ERR(info->ready_gpiod)) {
+		dev_err(dev, "failed to get ready gpio\n");
+		return PTR_ERR(info->ready_gpiod);
+	}
+
 	/*
 	 * If RDY/BSY line is connected to OMAP then use the omap ready
 	 * function and the generic nand_wait function which reads the status
@@ -1794,7 +1801,7 @@ static int omap_nand_probe(struct platform_device *pdev)
 	 * chip delay which is slightly more than tR (AC Timing) of the NAND
 	 * device and read status register until you get a failure or success
 	 */
-	if (info->dev_ready) {
+	if (info->ready_gpiod) {
 		nand_chip->dev_ready = omap_dev_ready;
 		nand_chip->chip_delay = 0;
 	} else {
diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h
index 7f6de5377f80..17d57a18bac5 100644
--- a/include/linux/platform_data/mtd-nand-omap2.h
+++ b/include/linux/platform_data/mtd-nand-omap2.h
@@ -71,7 +71,6 @@ struct omap_nand_platform_data {
 	int			cs;
 	struct mtd_partition	*parts;
 	int			nr_parts;
-	bool			dev_ready;
 	bool			flash_bbt;
 	enum nand_io		xfer_type;
 	int			devsize;
@@ -82,5 +81,6 @@ struct omap_nand_platform_data {
 	/* deprecated */
 	struct gpmc_nand_regs	reg;
 	struct device_node	*of_node;
+	bool			dev_ready;
 };
 #endif
-- 
cgit v1.2.3


From 675f10bde6cc3874632a8f684df2a8a2a8ace76e Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 22 Feb 2016 18:29:18 +0800
Subject: f2fs: fix to convert inline directory correctly

With below serials, we will lose parts of dirents:

1) mount f2fs with inline_dentry option
2) echo 1 > /sys/fs/f2fs/sdX/dir_level
3) mkdir dir
4) touch 180 files named [1-180] in dir
5) touch 181 in dir
6) echo 3 > /proc/sys/vm/drop_caches
7) ll dir

ls: cannot access 2: No such file or directory
ls: cannot access 4: No such file or directory
ls: cannot access 5: No such file or directory
ls: cannot access 6: No such file or directory
ls: cannot access 8: No such file or directory
ls: cannot access 9: No such file or directory
...
total 360
drwxr-xr-x 2 root root 4096 Feb 19 15:12 ./
drwxr-xr-x 3 root root 4096 Feb 19 15:11 ../
-rw-r--r-- 1 root root    0 Feb 19 15:12 1
-rw-r--r-- 1 root root    0 Feb 19 15:12 10
-rw-r--r-- 1 root root    0 Feb 19 15:12 100
-????????? ? ?    ?       ?            ? 101
-????????? ? ?    ?       ?            ? 102
-????????? ? ?    ?       ?            ? 103
...

The reason is: when doing the inline dir conversion, we didn't consider
that directory has hierarchical hash structure which can be configured
through sysfs interface 'dir_level'.

By default, dir_level of directory inode is 0, it means we have one bucket
in hash table located in first level, all dirents will be hashed in this
bucket, so it has no problem for us to do the duplication simply between
inline dentry page and converted normal dentry page.

However, if we configured dir_level with the value N (greater than 0), it
will expand the bucket number of first level hash table by 2^N - 1, it
hashs dirents into different buckets according their hash value, if we
still move all dirents to first bucket, it makes incorrent locating for
inline dirents, the result is, although we can iterate all dirents through
->readdir, we can't stat some of them in ->lookup which based on hash
table searching.

This patch fixes this issue by rehashing dirents into correct position
when converting inline directory.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/dir.c           | 87 ++++++++++++++++++++++++---------------------
 fs/f2fs/f2fs.h          |  4 ++-
 fs/f2fs/inline.c        | 94 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/f2fs_fs.h |  2 ++
 4 files changed, 144 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index af819571bce7..e90380d82214 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
 	[F2FS_FT_SYMLINK]	= DT_LNK,
 };
 
-#define S_SHIFT 12
 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE,
 	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR,
@@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
+unsigned char get_de_type(struct f2fs_dir_entry *de)
+{
+	if (de->file_type < F2FS_FT_MAX)
+		return f2fs_filetype_table[de->file_type];
+	return DT_UNKNOWN;
+}
+
 static unsigned long dir_block_index(unsigned int level,
 				int dir_level, unsigned int idx)
 {
@@ -509,11 +515,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
 	}
 }
 
-/*
- * Caller should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op().
- */
-int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
 				struct inode *inode, nid_t ino, umode_t mode)
 {
 	unsigned int bit_pos;
@@ -526,28 +528,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dentry_ptr d;
 	struct page *page = NULL;
-	struct fscrypt_name fname;
-	struct qstr new_name;
-	int slots, err;
-
-	err = fscrypt_setup_filename(dir, name, 0, &fname);
-	if (err)
-		return err;
-
-	new_name.name = fname_name(&fname);
-	new_name.len = fname_len(&fname);
-
-	if (f2fs_has_inline_dentry(dir)) {
-		err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
-		if (!err || err != -EAGAIN)
-			goto out;
-		else
-			err = 0;
-	}
+	int slots, err = 0;
 
 	level = 0;
-	slots = GET_DENTRY_SLOTS(new_name.len);
-	dentry_hash = f2fs_dentry_hash(&new_name);
+	slots = GET_DENTRY_SLOTS(new_name->len);
+	dentry_hash = f2fs_dentry_hash(new_name);
 
 	current_depth = F2FS_I(dir)->i_current_depth;
 	if (F2FS_I(dir)->chash == dentry_hash) {
@@ -556,10 +541,8 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
 	}
 
 start:
-	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) {
-		err = -ENOSPC;
-		goto out;
-	}
+	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
+		return -ENOSPC;
 
 	/* Increase the depth, if required */
 	if (level == current_depth)
@@ -573,10 +556,8 @@ start:
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
 		dentry_page = get_new_data_page(dir, NULL, block, true);
-		if (IS_ERR(dentry_page)) {
-			err = PTR_ERR(dentry_page);
-			goto out;
-		}
+		if (IS_ERR(dentry_page))
+			return PTR_ERR(dentry_page);
 
 		dentry_blk = kmap(dentry_page);
 		bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
@@ -596,7 +577,7 @@ add_dentry:
 
 	if (inode) {
 		down_write(&F2FS_I(inode)->i_sem);
-		page = init_inode_metadata(inode, dir, &new_name, NULL);
+		page = init_inode_metadata(inode, dir, new_name, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto fail;
@@ -606,7 +587,7 @@ add_dentry:
 	}
 
 	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
-	f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos);
+	f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
 
 	set_page_dirty(dentry_page);
 
@@ -628,7 +609,34 @@ fail:
 	}
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
-out:
+
+	return err;
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+				struct inode *inode, nid_t ino, umode_t mode)
+{
+	struct fscrypt_name fname;
+	struct qstr new_name;
+	int err;
+
+	err = fscrypt_setup_filename(dir, name, 0, &fname);
+	if (err)
+		return err;
+
+	new_name.name = fname_name(&fname);
+	new_name.len = fname_len(&fname);
+
+	err = -EAGAIN;
+	if (f2fs_has_inline_dentry(dir))
+		err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
+	if (err == -EAGAIN)
+		err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode);
+
 	fscrypt_free_filename(&fname);
 	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
 	return err;
@@ -792,10 +800,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 			continue;
 		}
 
-		if (de->file_type < F2FS_FT_MAX)
-			d_type = f2fs_filetype_table[de->file_type];
-		else
-			d_type = DT_UNKNOWN;
+		d_type = get_de_type(de);
 
 		de_name.name = d->filename[bit_pos];
 		de_name.len = le16_to_cpu(de->name_len);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e1c07b60f301..3f1551395244 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1711,7 +1711,7 @@ struct dentry *f2fs_get_parent(struct dentry *child);
  */
 extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
 void set_de_type(struct f2fs_dir_entry *, umode_t);
-
+unsigned char get_de_type(struct f2fs_dir_entry *);
 struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
 			f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
 bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
@@ -1732,6 +1732,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
 void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
 			const struct qstr *, f2fs_hash_t , unsigned int);
+int f2fs_add_regular_entry(struct inode *, const struct qstr *,
+						struct inode *, nid_t, umode_t);
 int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
 			umode_t);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a2fbe6f427d3..772056587eb9 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -355,7 +355,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
  * NOTE: ipage is grabbed by caller, but if any error occurs, we should
  * release ipage in this function.
  */
-static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
 				struct f2fs_inline_dentry *inline_dentry)
 {
 	struct page *page;
@@ -416,6 +416,98 @@ out:
 	return err;
 }
 
+static int f2fs_add_inline_entries(struct inode *dir,
+			struct f2fs_inline_dentry *inline_dentry)
+{
+	struct f2fs_dentry_ptr d;
+	unsigned long bit_pos = 0;
+	int err = 0;
+
+	make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+
+	while (bit_pos < d.max) {
+		struct f2fs_dir_entry *de;
+		struct qstr new_name;
+		nid_t ino;
+		umode_t fake_mode;
+
+		if (!test_bit_le(bit_pos, d.bitmap)) {
+			bit_pos++;
+			continue;
+		}
+
+		de = &d.dentry[bit_pos];
+		new_name.name = d.filename[bit_pos];
+		new_name.len = de->name_len;
+
+		ino = le32_to_cpu(de->ino);
+		fake_mode = get_de_type(de) << S_SHIFT;
+
+		err = f2fs_add_regular_entry(dir, &new_name, NULL,
+							ino, fake_mode);
+		if (err)
+			goto punch_dentry_pages;
+
+		if (unlikely(!de->name_len))
+			d.max = -1;
+
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+	}
+	return 0;
+punch_dentry_pages:
+	truncate_inode_pages(&dir->i_data, 0);
+	truncate_blocks(dir, 0, false);
+	remove_dirty_inode(dir);
+	return err;
+}
+
+static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
+				struct f2fs_inline_dentry *inline_dentry)
+{
+	struct f2fs_inline_dentry *backup_dentry;
+	int err;
+
+	backup_dentry = kmalloc(sizeof(struct f2fs_inline_dentry),
+							GFP_F2FS_ZERO);
+	if (!backup_dentry)
+		return -ENOMEM;
+
+	memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
+	truncate_inline_inode(ipage, 0);
+
+	unlock_page(ipage);
+
+	err = f2fs_add_inline_entries(dir, backup_dentry);
+	if (err)
+		goto recover;
+
+	lock_page(ipage);
+
+	stat_dec_inline_dir(dir);
+	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
+	update_inode(dir, ipage);
+	kfree(backup_dentry);
+	return 0;
+recover:
+	lock_page(ipage);
+	memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+	i_size_write(dir, MAX_INLINE_DATA);
+	update_inode(dir, ipage);
+	f2fs_put_page(ipage, 1);
+
+	kfree(backup_dentry);
+	return err;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+				struct f2fs_inline_dentry *inline_dentry)
+{
+	if (!F2FS_I(dir)->i_dir_level)
+		return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
+	else
+		return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
+}
+
 int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 			struct inode *inode, nid_t ino, umode_t mode)
 {
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index b90e9bdbd1dd..4c02c6521fef 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -508,4 +508,6 @@ enum {
 	F2FS_FT_MAX
 };
 
+#define S_SHIFT 12
+
 #endif  /* _LINUX_F2FS_FS_H */
-- 
cgit v1.2.3


From 522566376a3f8373fbd5ff75bb8a7a2da701c1a7 Mon Sep 17 00:00:00 2001
From: Aviya Erenfeld <aviya.erenfeld@intel.com>
Date: Thu, 14 Apr 2016 11:59:31 +0200
Subject: devcoredump: add scatterlist support

Add scatterlist support (dev_coredumpsg) to allow drivers to avoid
vmalloc() like dev_coredumpm(), while also avoiding the module
reference that the latter function requires.

This internally uses dev_coredumpm() with function inside the
devcoredump module, requiring removing the const
(which touches the driver using it.)

Signed-off-by: Aviya Erenfeld <aviya.erenfeld@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/devcoredump.c                      | 83 +++++++++++++++++++++---
 drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c |  4 +-
 include/linux/devcoredump.h                     | 86 ++++++++++++++++++++++---
 3 files changed, 154 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 1bd120a0b084..240374fd1838 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -4,6 +4,7 @@
  * GPL LICENSE SUMMARY
  *
  * Copyright(c) 2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -41,12 +42,12 @@ static bool devcd_disabled;
 
 struct devcd_entry {
 	struct device devcd_dev;
-	const void *data;
+	void *data;
 	size_t datalen;
 	struct module *owner;
 	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
-			const void *data, size_t datalen);
-	void (*free)(const void *data);
+			void *data, size_t datalen);
+	void (*free)(void *data);
 	struct delayed_work del_wk;
 	struct device *failing_dev;
 };
@@ -174,7 +175,7 @@ static struct class devcd_class = {
 };
 
 static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
-			   const void *data, size_t datalen)
+			   void *data, size_t datalen)
 {
 	if (offset > datalen)
 		return -EINVAL;
@@ -188,6 +189,11 @@ static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
 	return count;
 }
 
+static void devcd_freev(void *data)
+{
+	vfree(data);
+}
+
 /**
  * dev_coredumpv - create device coredump with vmalloc data
  * @dev: the struct device for the crashed device
@@ -198,10 +204,10 @@ static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
  * This function takes ownership of the vmalloc'ed data and will free
  * it when it is no longer used. See dev_coredumpm() for more information.
  */
-void dev_coredumpv(struct device *dev, const void *data, size_t datalen,
+void dev_coredumpv(struct device *dev, void *data, size_t datalen,
 		   gfp_t gfp)
 {
-	dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, vfree);
+	dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev);
 }
 EXPORT_SYMBOL_GPL(dev_coredumpv);
 
@@ -212,6 +218,44 @@ static int devcd_match_failing(struct device *dev, const void *failing)
 	return devcd->failing_dev == failing;
 }
 
+/**
+ * devcd_free_sgtable - free all the memory of the given scatterlist table
+ * (i.e. both pages and scatterlist instances)
+ * NOTE: if two tables allocated with devcd_alloc_sgtable and then chained
+ * using the sg_chain function then that function should be called only once
+ * on the chained table
+ * @table: pointer to sg_table to free
+ */
+static void devcd_free_sgtable(void *data)
+{
+	_devcd_free_sgtable(data);
+}
+
+/**
+ * devcd_read_from_table - copy data from sg_table to a given buffer
+ * and return the number of bytes read
+ * @buffer: the buffer to copy the data to it
+ * @buf_len: the length of the buffer
+ * @data: the scatterlist table to copy from
+ * @offset: start copy from @offset@ bytes from the head of the data
+ *	in the given scatterlist
+ * @data_len: the length of the data in the sg_table
+ */
+static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
+				       size_t buf_len, void *data,
+				       size_t data_len)
+{
+	struct scatterlist *table = data;
+
+	if (offset > data_len)
+		return -EINVAL;
+
+	if (offset + buf_len > data_len)
+		buf_len = data_len - offset;
+	return sg_pcopy_to_buffer(table, sg_nents(table), buffer, buf_len,
+				  offset);
+}
+
 /**
  * dev_coredumpm - create device coredump with read/free methods
  * @dev: the struct device for the crashed device
@@ -228,10 +272,10 @@ static int devcd_match_failing(struct device *dev, const void *failing)
  * function will be called to free the data.
  */
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   const void *data, size_t datalen, gfp_t gfp,
+		   void *data, size_t datalen, gfp_t gfp,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
-				   const void *data, size_t datalen),
-		   void (*free)(const void *data))
+				   void *data, size_t datalen),
+		   void (*free)(void *data))
 {
 	static atomic_t devcd_count = ATOMIC_INIT(0);
 	struct devcd_entry *devcd;
@@ -291,6 +335,27 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 }
 EXPORT_SYMBOL_GPL(dev_coredumpm);
 
+/**
+ * dev_coredumpmsg - create device coredump that uses scatterlist as data
+ * parameter
+ * @dev: the struct device for the crashed device
+ * @table: the dump data
+ * @datalen: length of the data
+ * @gfp: allocation flags
+ *
+ * Creates a new device coredump for the given device. If a previous one hasn't
+ * been read yet, the new coredump is discarded. The data lifetime is determined
+ * by the device coredump framework and when it is no longer needed
+ * it will free the data.
+ */
+void dev_coredumpsg(struct device *dev, struct scatterlist *table,
+		    size_t datalen, gfp_t gfp)
+{
+	dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable,
+		      devcd_free_sgtable);
+}
+EXPORT_SYMBOL_GPL(dev_coredumpsg);
+
 static int __init devcoredump_init(void)
 {
 	return class_register(&devcd_class);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c
index 4856eac120f6..a4b0581d2275 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw-dbg.c
@@ -71,7 +71,7 @@
 #include "iwl-csr.h"
 
 static ssize_t iwl_mvm_read_coredump(char *buffer, loff_t offset, size_t count,
-				     const void *data, size_t datalen)
+				     void *data, size_t datalen)
 {
 	const struct iwl_mvm_dump_ptrs *dump_ptrs = data;
 	ssize_t bytes_read;
@@ -104,7 +104,7 @@ static ssize_t iwl_mvm_read_coredump(char *buffer, loff_t offset, size_t count,
 	return bytes_read + bytes_read_trans;
 }
 
-static void iwl_mvm_free_coredump(const void *data)
+static void iwl_mvm_free_coredump(void *data)
 {
 	const struct iwl_mvm_dump_ptrs *fw_error_dump = data;
 
diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index c0a360e99f64..269521f143ac 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -1,3 +1,22 @@
+/*
+ * This file is provided under the GPLv2 license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Deutschland GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution
+ * in the file called COPYING.
+ */
 #ifndef __DEVCOREDUMP_H
 #define __DEVCOREDUMP_H
 
@@ -5,17 +24,62 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+/*
+ * _devcd_free_sgtable - free all the memory of the given scatterlist table
+ * (i.e. both pages and scatterlist instances)
+ * NOTE: if two tables allocated and chained using the sg_chain function then
+ * this function should be called only once on the first table
+ * @table: pointer to sg_table to free
+ */
+static inline void _devcd_free_sgtable(struct scatterlist *table)
+{
+	int i;
+	struct page *page;
+	struct scatterlist *iter;
+	struct scatterlist *delete_iter;
+
+	/* free pages */
+	iter = table;
+	for_each_sg(table, iter, sg_nents(table), i) {
+		page = sg_page(iter);
+		if (page)
+			__free_page(page);
+	}
+
+	/* then free all chained tables */
+	iter = table;
+	delete_iter = table;	/* always points on a head of a table */
+	while (!sg_is_last(iter)) {
+		iter++;
+		if (sg_is_chain(iter)) {
+			iter = sg_chain_ptr(iter);
+			kfree(delete_iter);
+			delete_iter = iter;
+		}
+	}
+
+	/* free the last table */
+	kfree(delete_iter);
+}
+
+
 #ifdef CONFIG_DEV_COREDUMP
-void dev_coredumpv(struct device *dev, const void *data, size_t datalen,
+void dev_coredumpv(struct device *dev, void *data, size_t datalen,
 		   gfp_t gfp);
 
 void dev_coredumpm(struct device *dev, struct module *owner,
-		   const void *data, size_t datalen, gfp_t gfp,
+		   void *data, size_t datalen, gfp_t gfp,
 		   ssize_t (*read)(char *buffer, loff_t offset, size_t count,
-				   const void *data, size_t datalen),
-		   void (*free)(const void *data));
+				   void *data, size_t datalen),
+		   void (*free)(void *data));
+
+void dev_coredumpsg(struct device *dev, struct scatterlist *table,
+		    size_t datalen, gfp_t gfp);
 #else
-static inline void dev_coredumpv(struct device *dev, const void *data,
+static inline void dev_coredumpv(struct device *dev, void *data,
 				 size_t datalen, gfp_t gfp)
 {
 	vfree(data);
@@ -23,13 +87,19 @@ static inline void dev_coredumpv(struct device *dev, const void *data,
 
 static inline void
 dev_coredumpm(struct device *dev, struct module *owner,
-	      const void *data, size_t datalen, gfp_t gfp,
+	      void *data, size_t datalen, gfp_t gfp,
 	      ssize_t (*read)(char *buffer, loff_t offset, size_t count,
-			      const void *data, size_t datalen),
-	      void (*free)(const void *data))
+			      void *data, size_t datalen),
+	      void (*free)(void *data))
 {
 	free(data);
 }
+
+static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table,
+				  size_t datalen, gfp_t gfp)
+{
+	_devcd_free_sgtable(table);
+}
 #endif /* CONFIG_DEV_COREDUMP */
 
 #endif /* __DEVCOREDUMP_H */
-- 
cgit v1.2.3


From 9b1d6c8950021ab007608d455fc9c398ecd25476 Mon Sep 17 00:00:00 2001
From: Ming Lin <ming.l@ssi.samsung.com>
Date: Mon, 4 Apr 2016 14:48:11 -0700
Subject: lib: scatterlist: move SG pool code from SCSI driver to lib/sg_pool.c

Now it's ready to move the mempool based SG chained allocator code from
SCSI driver to lib/sg_pool.c, which will be compiled only based on a Kconfig
symbol CONFIG_SG_POOL.

SCSI selects CONFIG_SG_POOL.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/Kconfig        |   1 +
 drivers/scsi/scsi_lib.c     | 137 -----------------------------------
 include/linux/scatterlist.h |  25 +++++++
 include/scsi/scsi.h         |  19 -----
 lib/Kconfig                 |   7 ++
 lib/Makefile                |   1 +
 lib/sg_pool.c               | 172 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 206 insertions(+), 156 deletions(-)
 create mode 100644 lib/sg_pool.c

(limited to 'include/linux')

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 0950567e6269..98e5d51a3346 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -17,6 +17,7 @@ config SCSI
 	tristate "SCSI device support"
 	depends on BLOCK
 	select SCSI_DMA if HAS_DMA
+	select SG_POOL
 	---help---
 	  If you want to use a SCSI hard disk, SCSI tape drive, SCSI CD-ROM or
 	  any other SCSI device under Linux, say Y and make sure that you know
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8f776f1e95ce..b920c5dabf60 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -14,8 +14,6 @@
 #include <linux/completion.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
-#include <linux/mempool.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -40,39 +38,6 @@
 #include "scsi_logging.h"
 
 
-#define SG_MEMPOOL_NR		ARRAY_SIZE(sg_pools)
-#define SG_MEMPOOL_SIZE		2
-
-struct sg_pool {
-	size_t		size;
-	char		*name;
-	struct kmem_cache	*slab;
-	mempool_t	*pool;
-};
-
-#define SP(x) { .size = x, "sgpool-" __stringify(x) }
-#if (SG_CHUNK_SIZE < 32)
-#error SG_CHUNK_SIZE is too small (must be 32 or greater)
-#endif
-static struct sg_pool sg_pools[] = {
-	SP(8),
-	SP(16),
-#if (SG_CHUNK_SIZE > 32)
-	SP(32),
-#if (SG_CHUNK_SIZE > 64)
-	SP(64),
-#if (SG_CHUNK_SIZE > 128)
-	SP(128),
-#if (SG_CHUNK_SIZE > 256)
-#error SG_CHUNK_SIZE is too large (256 MAX)
-#endif
-#endif
-#endif
-#endif
-	SP(SG_CHUNK_SIZE)
-};
-#undef SP
-
 struct kmem_cache *scsi_sdb_cache;
 
 /*
@@ -553,65 +518,6 @@ void scsi_run_host_queues(struct Scsi_Host *shost)
 		scsi_run_queue(sdev->request_queue);
 }
 
-static inline unsigned int sg_pool_index(unsigned short nents)
-{
-	unsigned int index;
-
-	BUG_ON(nents > SG_CHUNK_SIZE);
-
-	if (nents <= 8)
-		index = 0;
-	else
-		index = get_count_order(nents) - 3;
-
-	return index;
-}
-
-static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
-{
-	struct sg_pool *sgp;
-
-	sgp = sg_pools + sg_pool_index(nents);
-	mempool_free(sgl, sgp->pool);
-}
-
-static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
-{
-	struct sg_pool *sgp;
-
-	sgp = sg_pools + sg_pool_index(nents);
-	return mempool_alloc(sgp->pool, gfp_mask);
-}
-
-static void sg_free_table_chained(struct sg_table *table, bool first_chunk)
-{
-	if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE)
-		return;
-	__sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free);
-}
-
-static int sg_alloc_table_chained(struct sg_table *table, int nents,
-		struct scatterlist *first_chunk)
-{
-	int ret;
-
-	BUG_ON(!nents);
-
-	if (first_chunk) {
-		if (nents <= SG_CHUNK_SIZE) {
-			table->nents = table->orig_nents = nents;
-			sg_init_table(table->sgl, nents);
-			return 0;
-		}
-	}
-
-	ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
-			       first_chunk, GFP_ATOMIC, sg_pool_alloc);
-	if (unlikely(ret))
-		sg_free_table_chained(table, (bool)first_chunk);
-	return ret;
-}
-
 static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
 {
 	if (cmd->request->cmd_type == REQ_TYPE_FS) {
@@ -2269,8 +2175,6 @@ EXPORT_SYMBOL(scsi_unblock_requests);
 
 int __init scsi_init_queue(void)
 {
-	int i;
-
 	scsi_sdb_cache = kmem_cache_create("scsi_data_buffer",
 					   sizeof(struct scsi_data_buffer),
 					   0, 0, NULL);
@@ -2279,53 +2183,12 @@ int __init scsi_init_queue(void)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < SG_MEMPOOL_NR; i++) {
-		struct sg_pool *sgp = sg_pools + i;
-		int size = sgp->size * sizeof(struct scatterlist);
-
-		sgp->slab = kmem_cache_create(sgp->name, size, 0,
-				SLAB_HWCACHE_ALIGN, NULL);
-		if (!sgp->slab) {
-			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
-					sgp->name);
-			goto cleanup_sdb;
-		}
-
-		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
-						     sgp->slab);
-		if (!sgp->pool) {
-			printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
-					sgp->name);
-			goto cleanup_sdb;
-		}
-	}
-
 	return 0;
-
-cleanup_sdb:
-	for (i = 0; i < SG_MEMPOOL_NR; i++) {
-		struct sg_pool *sgp = sg_pools + i;
-		if (sgp->pool)
-			mempool_destroy(sgp->pool);
-		if (sgp->slab)
-			kmem_cache_destroy(sgp->slab);
-	}
-	kmem_cache_destroy(scsi_sdb_cache);
-
-	return -ENOMEM;
 }
 
 void scsi_exit_queue(void)
 {
-	int i;
-
 	kmem_cache_destroy(scsi_sdb_cache);
-
-	for (i = 0; i < SG_MEMPOOL_NR; i++) {
-		struct sg_pool *sgp = sg_pools + i;
-		mempool_destroy(sgp->pool);
-		kmem_cache_destroy(sgp->slab);
-	}
 }
 
 /**
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 556ec1ea2574..cb3c8fe6acd7 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -285,6 +285,31 @@ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
  */
 #define SG_MAX_SINGLE_ALLOC		(PAGE_SIZE / sizeof(struct scatterlist))
 
+/*
+ * The maximum number of SG segments that we will put inside a
+ * scatterlist (unless chaining is used). Should ideally fit inside a
+ * single page, to avoid a higher order allocation.  We could define this
+ * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order.  The
+ * minimum value is 32
+ */
+#define SG_CHUNK_SIZE	128
+
+/*
+ * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit
+ * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
+ */
+#ifdef CONFIG_ARCH_HAS_SG_CHAIN
+#define SG_MAX_SEGMENTS	2048
+#else
+#define SG_MAX_SEGMENTS	SG_CHUNK_SIZE
+#endif
+
+#ifdef CONFIG_SG_POOL
+void sg_free_table_chained(struct sg_table *table, bool first_chunk);
+int sg_alloc_table_chained(struct sg_table *table, int nents,
+			   struct scatterlist *first_chunk);
+#endif
+
 /*
  * sg page iterator
  *
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 74dafa75bae7..8ec7c30e35af 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -17,25 +17,6 @@ enum scsi_timeouts {
 	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
 };
 
-/*
- * The maximum number of SG segments that we will put inside a
- * scatterlist (unless chaining is used). Should ideally fit inside a
- * single page, to avoid a higher order allocation.  We could define this
- * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order.  The
- * minimum value is 32
- */
-#define SG_CHUNK_SIZE	128
-
-/*
- * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit
- * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
- */
-#ifdef CONFIG_ARCH_HAS_SG_CHAIN
-#define SG_MAX_SEGMENTS	2048
-#else
-#define SG_MAX_SEGMENTS	SG_CHUNK_SIZE
-#endif
-
 /*
  * DIX-capable adapters effectively support infinite chaining for the
  * protection information scatterlist
diff --git a/lib/Kconfig b/lib/Kconfig
index 3cca1222578e..61d55bd0ed89 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -523,6 +523,13 @@ config SG_SPLIT
 	 a scatterlist. This should be selected by a driver or an API which
 	 whishes to split a scatterlist amongst multiple DMA channels.
 
+config SG_POOL
+	def_bool n
+	help
+	 Provides a helper to allocate chained scatterlists. This should be
+	 selected by a driver or an API which whishes to allocate chained
+	 scatterlist.
+
 #
 # sg chaining option
 #
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436c97..bf01c2673423 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -178,6 +178,7 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o
 obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
 
 obj-$(CONFIG_SG_SPLIT) += sg_split.o
+obj-$(CONFIG_SG_POOL) += sg_pool.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
diff --git a/lib/sg_pool.c b/lib/sg_pool.c
new file mode 100644
index 000000000000..6dd30615a201
--- /dev/null
+++ b/lib/sg_pool.c
@@ -0,0 +1,172 @@
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+
+#define SG_MEMPOOL_NR		ARRAY_SIZE(sg_pools)
+#define SG_MEMPOOL_SIZE		2
+
+struct sg_pool {
+	size_t		size;
+	char		*name;
+	struct kmem_cache	*slab;
+	mempool_t	*pool;
+};
+
+#define SP(x) { .size = x, "sgpool-" __stringify(x) }
+#if (SG_CHUNK_SIZE < 32)
+#error SG_CHUNK_SIZE is too small (must be 32 or greater)
+#endif
+static struct sg_pool sg_pools[] = {
+	SP(8),
+	SP(16),
+#if (SG_CHUNK_SIZE > 32)
+	SP(32),
+#if (SG_CHUNK_SIZE > 64)
+	SP(64),
+#if (SG_CHUNK_SIZE > 128)
+	SP(128),
+#if (SG_CHUNK_SIZE > 256)
+#error SG_CHUNK_SIZE is too large (256 MAX)
+#endif
+#endif
+#endif
+#endif
+	SP(SG_CHUNK_SIZE)
+};
+#undef SP
+
+static inline unsigned int sg_pool_index(unsigned short nents)
+{
+	unsigned int index;
+
+	BUG_ON(nents > SG_CHUNK_SIZE);
+
+	if (nents <= 8)
+		index = 0;
+	else
+		index = get_count_order(nents) - 3;
+
+	return index;
+}
+
+static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
+{
+	struct sg_pool *sgp;
+
+	sgp = sg_pools + sg_pool_index(nents);
+	mempool_free(sgl, sgp->pool);
+}
+
+static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
+{
+	struct sg_pool *sgp;
+
+	sgp = sg_pools + sg_pool_index(nents);
+	return mempool_alloc(sgp->pool, gfp_mask);
+}
+
+/**
+ * sg_free_table_chained - Free a previously mapped sg table
+ * @table:	The sg table header to use
+ * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained?
+ *
+ *  Description:
+ *    Free an sg table previously allocated and setup with
+ *    sg_alloc_table_chained().
+ *
+ **/
+void sg_free_table_chained(struct sg_table *table, bool first_chunk)
+{
+	if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE)
+		return;
+	__sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free);
+}
+EXPORT_SYMBOL_GPL(sg_free_table_chained);
+
+/**
+ * sg_alloc_table_chained - Allocate and chain SGLs in an sg table
+ * @table:	The sg table header to use
+ * @nents:	Number of entries in sg list
+ * @first_chunk: first SGL
+ *
+ *  Description:
+ *    Allocate and chain SGLs in an sg table. If @nents@ is larger than
+ *    SG_CHUNK_SIZE a chained sg table will be setup.
+ *
+ **/
+int sg_alloc_table_chained(struct sg_table *table, int nents,
+		struct scatterlist *first_chunk)
+{
+	int ret;
+
+	BUG_ON(!nents);
+
+	if (first_chunk) {
+		if (nents <= SG_CHUNK_SIZE) {
+			table->nents = table->orig_nents = nents;
+			sg_init_table(table->sgl, nents);
+			return 0;
+		}
+	}
+
+	ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
+			       first_chunk, GFP_ATOMIC, sg_pool_alloc);
+	if (unlikely(ret))
+		sg_free_table_chained(table, (bool)first_chunk);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sg_alloc_table_chained);
+
+static __init int sg_pool_init(void)
+{
+	int i;
+
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		int size = sgp->size * sizeof(struct scatterlist);
+
+		sgp->slab = kmem_cache_create(sgp->name, size, 0,
+				SLAB_HWCACHE_ALIGN, NULL);
+		if (!sgp->slab) {
+			printk(KERN_ERR "SG_POOL: can't init sg slab %s\n",
+					sgp->name);
+			goto cleanup_sdb;
+		}
+
+		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
+						     sgp->slab);
+		if (!sgp->pool) {
+			printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n",
+					sgp->name);
+			goto cleanup_sdb;
+		}
+	}
+
+	return 0;
+
+cleanup_sdb:
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		if (sgp->pool)
+			mempool_destroy(sgp->pool);
+		if (sgp->slab)
+			kmem_cache_destroy(sgp->slab);
+	}
+
+	return -ENOMEM;
+}
+
+static __exit void sg_pool_exit(void)
+{
+	int i;
+
+	for (i = 0; i < SG_MEMPOOL_NR; i++) {
+		struct sg_pool *sgp = sg_pools + i;
+		mempool_destroy(sgp->pool);
+		kmem_cache_destroy(sgp->slab);
+	}
+}
+
+module_init(sg_pool_init);
+module_exit(sg_pool_exit);
-- 
cgit v1.2.3


From 464f664501816ef5fbbc00b8de96f4ae5a1c9325 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@qlogic.com>
Date: Thu, 14 Apr 2016 01:38:29 -0400
Subject: qed: Add infrastructure support for tunneling

This patch adds various structure/APIs needed to configure/enable different
tunnel [VXLAN/GRE/GENEVE] parameters on the adapter.

Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: Ariel Elior <Ariel.Elior@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h              |  46 ++++
 drivers/net/ethernet/qlogic/qed/qed_dev.c          |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h      |   2 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h          |  51 ++++-
 .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c    | 127 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_l2.c           |  31 +++
 drivers/net/ethernet/qlogic/qed/qed_main.c         |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h     |  31 +++
 drivers/net/ethernet/qlogic/qed/qed_sp.h           |   7 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c  | 254 +++++++++++++++++++++
 include/linux/qed/qed_eth_if.h                     |  10 +
 11 files changed, 563 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 0f0d2d1d77e5..33e2ed60c18f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -74,6 +74,51 @@ struct qed_rt_data {
 	bool	*b_valid;
 };
 
+enum qed_tunn_mode {
+	QED_MODE_L2GENEVE_TUNN,
+	QED_MODE_IPGENEVE_TUNN,
+	QED_MODE_L2GRE_TUNN,
+	QED_MODE_IPGRE_TUNN,
+	QED_MODE_VXLAN_TUNN,
+};
+
+enum qed_tunn_clss {
+	QED_TUNN_CLSS_MAC_VLAN,
+	QED_TUNN_CLSS_MAC_VNI,
+	QED_TUNN_CLSS_INNER_MAC_VLAN,
+	QED_TUNN_CLSS_INNER_MAC_VNI,
+	MAX_QED_TUNN_CLSS,
+};
+
+struct qed_tunn_start_params {
+	unsigned long	tunn_mode;
+	u16		vxlan_udp_port;
+	u16		geneve_udp_port;
+	u8		update_vxlan_udp_port;
+	u8		update_geneve_udp_port;
+	u8		tunn_clss_vxlan;
+	u8		tunn_clss_l2geneve;
+	u8		tunn_clss_ipgeneve;
+	u8		tunn_clss_l2gre;
+	u8		tunn_clss_ipgre;
+};
+
+struct qed_tunn_update_params {
+	unsigned long	tunn_mode_update_mask;
+	unsigned long	tunn_mode;
+	u16		vxlan_udp_port;
+	u16		geneve_udp_port;
+	u8		update_rx_pf_clss;
+	u8		update_tx_pf_clss;
+	u8		update_vxlan_udp_port;
+	u8		update_geneve_udp_port;
+	u8		tunn_clss_vxlan;
+	u8		tunn_clss_l2geneve;
+	u8		tunn_clss_ipgeneve;
+	u8		tunn_clss_l2gre;
+	u8		tunn_clss_ipgre;
+};
+
 /* The PCI personality is not quite synonymous to protocol ID:
  * 1. All personalities need CORE connections
  * 2. The Ethernet personality may support also the RoCE protocol
@@ -430,6 +475,7 @@ struct qed_dev {
 	u8				num_hwfns;
 	struct qed_hwfn			hwfns[MAX_HWFNS_PER_DEVICE];
 
+	unsigned long			tunn_mode;
 	u32				drv_type;
 
 	struct qed_eth_stats		*reset_stats;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index b7d100f6bd6f..bdae5a55afa4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -558,6 +558,7 @@ static int qed_hw_init_port(struct qed_hwfn *p_hwfn,
 
 static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt,
+			  struct qed_tunn_start_params *p_tunn,
 			  int hw_mode,
 			  bool b_hw_start,
 			  enum qed_int_mode int_mode,
@@ -625,7 +626,7 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		qed_int_igu_enable(p_hwfn, p_ptt, int_mode);
 
 		/* send function start command */
-		rc = qed_sp_pf_start(p_hwfn, p_hwfn->cdev->mf_mode);
+		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode);
 		if (rc)
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
 	}
@@ -672,6 +673,7 @@ static void qed_reset_mb_shadow(struct qed_hwfn *p_hwfn,
 }
 
 int qed_hw_init(struct qed_dev *cdev,
+		struct qed_tunn_start_params *p_tunn,
 		bool b_hw_start,
 		enum qed_int_mode int_mode,
 		bool allow_npar_tx_switch,
@@ -724,7 +726,7 @@ int qed_hw_init(struct qed_dev *cdev,
 		/* Fall into */
 		case FW_MSG_CODE_DRV_LOAD_FUNCTION:
 			rc = qed_hw_init_pf(p_hwfn, p_hwfn->p_main_ptt,
-					    p_hwfn->hw_info.hw_mode,
+					    p_tunn, p_hwfn->hw_info.hw_mode,
 					    b_hw_start, int_mode,
 					    allow_npar_tx_switch);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index d6c7ddf4f4d4..6aac3f855aa1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -62,6 +62,7 @@ void qed_resc_setup(struct qed_dev *cdev);
  * @brief qed_hw_init -
  *
  * @param cdev
+ * @param p_tunn
  * @param b_hw_start
  * @param int_mode - interrupt mode [msix, inta, etc.] to use.
  * @param allow_npar_tx_switch - npar tx switching to be used
@@ -72,6 +73,7 @@ void qed_resc_setup(struct qed_dev *cdev);
  * @return int
  */
 int qed_hw_init(struct qed_dev *cdev,
+		struct qed_tunn_start_params *p_tunn,
 		bool b_hw_start,
 		enum qed_int_mode int_mode,
 		bool allow_npar_tx_switch,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index a368f5e71d95..15e02ab9be5a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -46,7 +46,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
 	COMMON_RAMROD_RESERVED,
 	COMMON_RAMROD_RESERVED2,
-	COMMON_RAMROD_RESERVED3,
+	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
 	MAX_COMMON_RAMROD_CMD_ID
 };
@@ -626,6 +626,42 @@ struct pf_start_ramrod_data {
 	u8				reserved0[4];
 };
 
+/* tunnel configuration */
+struct pf_update_tunnel_config {
+	u8	update_rx_pf_clss;
+	u8	update_tx_pf_clss;
+	u8	set_vxlan_udp_port_flg;
+	u8	set_geneve_udp_port_flg;
+	u8	tx_enable_vxlan;
+	u8	tx_enable_l2geneve;
+	u8	tx_enable_ipgeneve;
+	u8	tx_enable_l2gre;
+	u8	tx_enable_ipgre;
+	u8	tunnel_clss_vxlan;
+	u8	tunnel_clss_l2geneve;
+	u8	tunnel_clss_ipgeneve;
+	u8	tunnel_clss_l2gre;
+	u8	tunnel_clss_ipgre;
+	__le16	vxlan_udp_port;
+	__le16	geneve_udp_port;
+	__le16	reserved[3];
+};
+
+struct pf_update_ramrod_data {
+	u32				reserved[2];
+	u32				reserved_1[6];
+	struct pf_update_tunnel_config	tunnel_config;
+};
+
+/* Tunnel classification scheme */
+enum tunnel_clss {
+	TUNNEL_CLSS_MAC_VLAN = 0,
+	TUNNEL_CLSS_MAC_VNI,
+	TUNNEL_CLSS_INNER_MAC_VLAN,
+	TUNNEL_CLSS_INNER_MAC_VNI,
+	MAX_TUNNEL_CLSS
+};
+
 enum ports_mode {
 	ENGX2_PORTX1 /* 2 engines x 1 port */,
 	ENGX2_PORTX2 /* 2 engines x 2 ports */,
@@ -1603,6 +1639,19 @@ bool qed_send_qm_stop_cmd(struct qed_hwfn	*p_hwfn,
 			  u16			start_pq,
 			  u16			num_pqs);
 
+void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt  *p_ptt, u16 dest_port);
+void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, bool vxlan_enable);
+void qed_set_gre_enable(struct qed_hwfn *p_hwfn,
+			struct qed_ptt  *p_ptt, bool eth_gre_enable,
+			bool ip_gre_enable);
+void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt, u16 dest_port);
+void qed_set_geneve_enable(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, bool eth_geneve_enable,
+			   bool ip_geneve_enable);
+
 /* Ystorm flow control mode. Use enum fw_flow_ctrl_mode */
 #define YSTORM_FLOW_CONTROL_MODE_OFFSET  (IRO[0].base)
 #define YSTORM_FLOW_CONTROL_MODE_SIZE    (IRO[0].size)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
index f55ebdc3c832..1dd53248b984 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
@@ -788,3 +788,130 @@ bool qed_send_qm_stop_cmd(struct qed_hwfn *p_hwfn,
 
 	return true;
 }
+
+static void
+qed_set_tunnel_type_enable_bit(unsigned long *var, int bit, bool enable)
+{
+	if (enable)
+		set_bit(bit, var);
+	else
+		clear_bit(bit, var);
+}
+
+#define PRS_ETH_TUNN_FIC_FORMAT	-188897008
+
+void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt,
+			     u16 dest_port)
+{
+	qed_wr(p_hwfn, p_ptt, PRS_REG_VXLAN_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_VXLAN_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_VXLAN_PORT, dest_port);
+}
+
+void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt,
+			  bool vxlan_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_VXLAN_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, vxlan_enable);
+
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE);
+	shift = NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, vxlan_enable);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val);
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN,
+	       vxlan_enable ? 1 : 0);
+}
+
+void qed_set_gre_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			bool eth_gre_enable, bool ip_gre_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_gre_enable);
+
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_gre_enable);
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE);
+	shift = NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_gre_enable);
+
+	shift = NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_gre_enable);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val);
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN,
+	       eth_gre_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN,
+	       ip_gre_enable ? 1 : 0);
+}
+
+void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u16 dest_port)
+{
+	qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_PORT, dest_port);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_PORT, dest_port);
+}
+
+void qed_set_geneve_enable(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt,
+			   bool eth_geneve_enable,
+			   bool ip_geneve_enable)
+{
+	unsigned long reg_val = 0;
+	u8 shift;
+
+	reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN);
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GENEVE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, eth_geneve_enable);
+
+	shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GENEVE_ENABLE_SHIFT;
+	qed_set_tunnel_type_enable_bit(&reg_val, shift, ip_geneve_enable);
+
+	qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val);
+	if (reg_val)
+		qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0,
+		       PRS_ETH_TUNN_FIC_FORMAT);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_ETH_ENABLE,
+	       eth_geneve_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_IP_ENABLE, ip_geneve_enable ? 1 : 0);
+
+	/* comp ver */
+	reg_val = (ip_geneve_enable || eth_geneve_enable) ? 1 : 0;
+	qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_COMP_VER, reg_val);
+	qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_COMP_VER, reg_val);
+	qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_COMP_VER, reg_val);
+
+	/* EDPM with geneve tunnel not supported in BB_B0 */
+	if (QED_IS_BB_B0(p_hwfn->cdev))
+		return;
+
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN,
+	       eth_geneve_enable ? 1 : 0);
+	qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN,
+	       ip_geneve_enable ? 1 : 0);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 5005497ee23e..fb5f3b815340 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1884,6 +1884,36 @@ static int qed_stop_txq(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_tunn_configure(struct qed_dev *cdev,
+			      struct qed_tunn_params *tunn_params)
+{
+	struct qed_tunn_update_params tunn_info;
+	int i, rc;
+
+	memset(&tunn_info, 0, sizeof(tunn_info));
+	if (tunn_params->update_vxlan_port == 1) {
+		tunn_info.update_vxlan_udp_port = 1;
+		tunn_info.vxlan_udp_port = tunn_params->vxlan_port;
+	}
+
+	if (tunn_params->update_geneve_port == 1) {
+		tunn_info.update_geneve_udp_port = 1;
+		tunn_info.geneve_udp_port = tunn_params->geneve_port;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+
+		rc = qed_sp_pf_update_tunn_cfg(hwfn, &tunn_info,
+					       QED_SPQ_MODE_EBLOCK, NULL);
+
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
 static int qed_configure_filter_rx_mode(struct qed_dev *cdev,
 					enum qed_filter_rx_mode_type type)
 {
@@ -2026,6 +2056,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 	.fastpath_stop = &qed_fastpath_stop,
 	.eth_cqe_completion = &qed_fp_cqe_completion,
 	.get_vport_stats = &qed_get_vport_stats,
+	.tunn_config = &qed_tunn_configure,
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c31d485f72d6..1916992ae8b1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -776,7 +776,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	/* Start the slowpath */
 	data = cdev->firmware->data;
 
-	rc = qed_hw_init(cdev, true, cdev->int_params.out.int_mode,
+	rc = qed_hw_init(cdev, NULL, true, cdev->int_params.out.int_mode,
 			 true, data);
 	if (rc)
 		goto err2;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index c15b1622e636..55451a4dc587 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -427,4 +427,35 @@
 	0x2aae60UL
 #define PGLUE_B_REG_PF_BAR1_SIZE \
 	0x2aae64UL
+#define PRS_REG_ENCAPSULATION_TYPE_EN	0x1f0730UL
+#define PRS_REG_GRE_PROTOCOL		0x1f0734UL
+#define PRS_REG_VXLAN_PORT		0x1f0738UL
+#define PRS_REG_OUTPUT_FORMAT_4_0	0x1f099cUL
+#define NIG_REG_ENC_TYPE_ENABLE		0x501058UL
+
+#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE		(0x1 << 0)
+#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT	0
+#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE		(0x1 << 1)
+#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT	1
+#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE			(0x1 << 2)
+#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT		2
+
+#define NIG_REG_VXLAN_PORT		0x50105cUL
+#define PBF_REG_VXLAN_PORT		0xd80518UL
+#define PBF_REG_NGE_PORT		0xd8051cUL
+#define PRS_REG_NGE_PORT		0x1f086cUL
+#define NIG_REG_NGE_PORT		0x508b38UL
+
+#define DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN	0x10090cUL
+#define DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN	0x100910UL
+#define DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN	0x100914UL
+#define DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN	0x10092cUL
+#define DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN	0x100930UL
+
+#define NIG_REG_NGE_IP_ENABLE			0x508b28UL
+#define NIG_REG_NGE_ETH_ENABLE			0x508b2cUL
+#define NIG_REG_NGE_COMP_VER			0x508b30UL
+#define PBF_REG_NGE_COMP_VER			0xd80524UL
+#define PRS_REG_NGE_COMP_VER			0x1f0878UL
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index d39f914b66ee..4b91cb32f317 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -52,6 +52,7 @@ int qed_eth_cqe_completion(struct qed_hwfn *p_hwfn,
 
 union ramrod_data {
 	struct pf_start_ramrod_data pf_start;
+	struct pf_update_ramrod_data pf_update;
 	struct rx_queue_start_ramrod_data rx_queue_start;
 	struct rx_queue_update_ramrod_data rx_queue_update;
 	struct rx_queue_stop_ramrod_data rx_queue_stop;
@@ -338,12 +339,14 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  * to the internal RAM of the UStorm by the Function Start Ramrod.
  *
  * @param p_hwfn
+ * @param p_tunn
  * @param mode
  *
  * @return int
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
+		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode);
 
 /**
@@ -362,4 +365,8 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn);
 
+int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_tunn,
+			      enum spq_mode comp_mode,
+			      struct qed_spq_comp_cb *p_comp_data);
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 1c06c37d4c3d..306da7000ddc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -87,7 +87,217 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static enum tunnel_clss qed_tunn_get_clss_type(u8 type)
+{
+	switch (type) {
+	case QED_TUNN_CLSS_MAC_VLAN:
+		return TUNNEL_CLSS_MAC_VLAN;
+	case QED_TUNN_CLSS_MAC_VNI:
+		return TUNNEL_CLSS_MAC_VNI;
+	case QED_TUNN_CLSS_INNER_MAC_VLAN:
+		return TUNNEL_CLSS_INNER_MAC_VLAN;
+	case QED_TUNN_CLSS_INNER_MAC_VNI:
+		return TUNNEL_CLSS_INNER_MAC_VNI;
+	default:
+		return TUNNEL_CLSS_MAC_VLAN;
+	}
+}
+
+static void
+qed_tunn_set_pf_fix_tunn_mode(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_src,
+			      struct pf_update_tunnel_config *p_tunn_cfg)
+{
+	unsigned long cached_tunn_mode = p_hwfn->cdev->tunn_mode;
+	unsigned long update_mask = p_src->tunn_mode_update_mask;
+	unsigned long tunn_mode = p_src->tunn_mode;
+	unsigned long new_tunn_mode = 0;
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_L2GRE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_IPGRE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_VXLAN_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
+	}
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_L2GENEVE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
+	}
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &update_mask)) {
+		if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
+	} else {
+		if (test_bit(QED_MODE_IPGENEVE_TUNN, &cached_tunn_mode))
+			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
+	}
+
+	p_src->tunn_mode = new_tunn_mode;
+}
+
+static void
+qed_tunn_set_pf_update_params(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_src,
+			      struct pf_update_tunnel_config *p_tunn_cfg)
+{
+	unsigned long tunn_mode = p_src->tunn_mode;
+	enum tunnel_clss type;
+
+	qed_tunn_set_pf_fix_tunn_mode(p_hwfn, p_src, p_tunn_cfg);
+	p_tunn_cfg->update_rx_pf_clss = p_src->update_rx_pf_clss;
+	p_tunn_cfg->update_tx_pf_clss = p_src->update_tx_pf_clss;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
+	p_tunn_cfg->tunnel_clss_vxlan  = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
+	p_tunn_cfg->tunnel_clss_l2gre = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
+	p_tunn_cfg->tunnel_clss_ipgre = type;
+
+	if (p_src->update_vxlan_udp_port) {
+		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
+		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2gre = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgre = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_vxlan = 1;
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2geneve = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgeneve = 1;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
+	p_tunn_cfg->tunnel_clss_l2geneve = type;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
+	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+}
+
+static void qed_set_hw_tunn_mode(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 unsigned long tunn_mode)
+{
+	u8 l2gre_enable = 0, ipgre_enable = 0, vxlan_enable = 0;
+	u8 l2geneve_enable = 0, ipgeneve_enable = 0;
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		l2gre_enable = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		ipgre_enable = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		vxlan_enable = 1;
+
+	qed_set_gre_enable(p_hwfn, p_ptt, l2gre_enable, ipgre_enable);
+	qed_set_vxlan_enable(p_hwfn, p_ptt, vxlan_enable);
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		l2geneve_enable = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		ipgeneve_enable = 1;
+
+	qed_set_geneve_enable(p_hwfn, p_ptt, l2geneve_enable,
+			      ipgeneve_enable);
+}
+
+static void
+qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
+			     struct qed_tunn_start_params *p_src,
+			     struct pf_start_tunnel_config *p_tunn_cfg)
+{
+	unsigned long tunn_mode;
+	enum tunnel_clss type;
+
+	if (!p_src)
+		return;
+
+	tunn_mode = p_src->tunn_mode;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
+	p_tunn_cfg->tunnel_clss_vxlan = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
+	p_tunn_cfg->tunnel_clss_l2gre = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
+	p_tunn_cfg->tunnel_clss_ipgre = type;
+
+	if (p_src->update_vxlan_udp_port) {
+		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
+		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2gre = 1;
+
+	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgre = 1;
+
+	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_vxlan = 1;
+
+	if (p_src->update_geneve_udp_port) {
+		p_tunn_cfg->set_geneve_udp_port_flg = 1;
+		p_tunn_cfg->geneve_udp_port =
+				cpu_to_le16(p_src->geneve_udp_port);
+	}
+
+	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_l2geneve = 1;
+
+	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
+		p_tunn_cfg->tx_enable_ipgeneve = 1;
+
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
+	p_tunn_cfg->tunnel_clss_l2geneve = type;
+	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
+	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+}
+
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
+		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
@@ -143,6 +353,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
 		       p_hwfn->p_consq->chain.pbl.p_phys_table);
 
+	qed_tunn_set_pf_start_params(p_hwfn, NULL, NULL);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
 	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
@@ -153,6 +364,49 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+/* Set pf update ramrod command params */
+int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
+			      struct qed_tunn_update_params *p_tunn,
+			      enum spq_mode comp_mode,
+			      struct qed_spq_comp_cb *p_comp_data)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_data;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	qed_tunn_set_pf_update_params(p_hwfn, p_tunn,
+				      &p_ent->ramrod.pf_update.tunnel_config);
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	if (rc)
+		return rc;
+
+	if (p_tunn->update_vxlan_udp_port)
+		qed_set_vxlan_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					p_tunn->vxlan_udp_port);
+	if (p_tunn->update_geneve_udp_port)
+		qed_set_geneve_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					 p_tunn->geneve_udp_port);
+
+	qed_set_hw_tunn_mode(p_hwfn, p_hwfn->p_main_ptt, p_tunn->tunn_mode);
+	p_hwfn->cdev->tunn_mode = p_tunn->tunn_mode;
+
+	return rc;
+}
+
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn)
 {
 	struct qed_spq_entry *p_ent = NULL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 795c9902e02f..3a4c806be156 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -112,6 +112,13 @@ struct qed_queue_start_common_params {
 	u16 sb_idx;
 };
 
+struct qed_tunn_params {
+	u16 vxlan_port;
+	u8 update_vxlan_port;
+	u16 geneve_port;
+	u8 update_geneve_port;
+};
+
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
 };
@@ -166,6 +173,9 @@ struct qed_eth_ops {
 
 	void (*get_vport_stats)(struct qed_dev *cdev,
 				struct qed_eth_stats *stats);
+
+	int (*tunn_config)(struct qed_dev *cdev,
+			   struct qed_tunn_params *params);
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void);
-- 
cgit v1.2.3


From e1c9c62b9a3a761b56359a7437215ae2e9253821 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 11 Apr 2016 23:10:21 +0300
Subject: net/mlx5: Fix mlx5 ifc cmd_hca_cap bad offsets

All reserved fields after early_vf_enable are off by 1, since
early_vf_enable was not explicitly declared as array of size 1.

Reserved field before cqe_zip had a wrong size, it should
be 0x80 + 0x3f.

Fixes: b0844444590e ("net/mlx5_core: Introduce access function to read internal timer ")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 107 ++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c15b8a864937..c300e7491d80 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -750,21 +750,21 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         ets[0x1];
 	u8         nic_flow_table[0x1];
 	u8         eswitch_flow_table[0x1];
-	u8	   early_vf_enable;
-	u8         reserved_at_1a8[0x2];
+	u8	   early_vf_enable[0x1];
+	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         reserved_at_1af[0x6];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
-	u8         reserved_at_1bf[0x3];
+	u8         reserved_at_1c0[0x3];
 	u8         log_max_msg[0x5];
-	u8         reserved_at_1c7[0x4];
+	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1cf[0x6];
+	u8         reserved_at_1d0[0x6];
 	u8         rol_s[0x1];
 	u8         rol_g[0x1];
-	u8         reserved_at_1d7[0x1];
+	u8         reserved_at_1d8[0x1];
 	u8         wol_s[0x1];
 	u8         wol_g[0x1];
 	u8         wol_a[0x1];
@@ -774,47 +774,47 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         wol_p[0x1];
 
 	u8         stat_rate_support[0x10];
-	u8         reserved_at_1ef[0xc];
+	u8         reserved_at_1f0[0xc];
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
 	u8         reserved_at_200[0x3];
 	u8         ipoib_basic_offloads[0x1];
-	u8         reserved_at_204[0xa];
+	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
 	u8         cmdif_checksum[0x2];
 	u8         sigerr_cqe[0x1];
-	u8         reserved_at_212[0x1];
+	u8         reserved_at_213[0x1];
 	u8         wq_signature[0x1];
 	u8         sctr_data_cqe[0x1];
-	u8         reserved_at_215[0x1];
+	u8         reserved_at_216[0x1];
 	u8         sho[0x1];
 	u8         tph[0x1];
 	u8         rf[0x1];
 	u8         dct[0x1];
-	u8         reserved_at_21a[0x1];
+	u8         reserved_at_21b[0x1];
 	u8         eth_net_offloads[0x1];
 	u8         roce[0x1];
 	u8         atomic[0x1];
-	u8         reserved_at_21e[0x1];
+	u8         reserved_at_21f[0x1];
 
 	u8         cq_oi[0x1];
 	u8         cq_resize[0x1];
 	u8         cq_moderation[0x1];
-	u8         reserved_at_222[0x3];
+	u8         reserved_at_223[0x3];
 	u8         cq_eq_remap[0x1];
 	u8         pg[0x1];
 	u8         block_lb_mc[0x1];
-	u8         reserved_at_228[0x1];
+	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
 	u8         reserved_at_22a[0x1];
 	u8         cd[0x1];
-	u8         reserved_at_22c[0x1];
+	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
 	u8         reserved_at_22f[0x1];
 	u8	   imaicl[0x1];
-	u8         reserved_at_231[0x4];
+	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
 	u8         pkv[0x1];
 	u8         set_deth_sqpn[0x1];
@@ -824,98 +824,101 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uc[0x1];
 	u8         rc[0x1];
 
-	u8         reserved_at_23f[0xa];
+	u8         reserved_at_240[0xa];
 	u8         uar_sz[0x6];
-	u8         reserved_at_24f[0x8];
+	u8         reserved_at_250[0x8];
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_at_260[0x1];
+	u8         reserved_at_261[0x1];
 	u8         pad_tx_eth_packet[0x1];
-	u8         reserved_at_262[0x8];
+	u8         reserved_at_263[0x8];
 	u8         log_bf_reg_size[0x5];
-	u8         reserved_at_26f[0x10];
+	u8         reserved_at_270[0x10];
 
-	u8         reserved_at_27f[0x10];
+	u8         reserved_at_280[0x10];
 	u8         max_wqe_sz_sq[0x10];
 
-	u8         reserved_at_29f[0x10];
+	u8         reserved_at_2a0[0x10];
 	u8         max_wqe_sz_rq[0x10];
 
-	u8         reserved_at_2bf[0x10];
+	u8         reserved_at_2c0[0x10];
 	u8         max_wqe_sz_sq_dc[0x10];
 
-	u8         reserved_at_2df[0x7];
+	u8         reserved_at_2e0[0x7];
 	u8         max_qp_mcg[0x19];
 
-	u8         reserved_at_2ff[0x18];
+	u8         reserved_at_300[0x18];
 	u8         log_max_mcg[0x8];
 
-	u8         reserved_at_31f[0x3];
+	u8         reserved_at_320[0x3];
 	u8         log_max_transport_domain[0x5];
-	u8         reserved_at_327[0x3];
+	u8         reserved_at_328[0x3];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_32f[0xb];
+	u8         reserved_at_330[0xb];
 	u8         log_max_xrcd[0x5];
 
-	u8         reserved_at_33f[0x20];
+	u8         reserved_at_340[0x20];
 
-	u8         reserved_at_35f[0x3];
+	u8         reserved_at_360[0x3];
 	u8         log_max_rq[0x5];
-	u8         reserved_at_367[0x3];
+	u8         reserved_at_368[0x3];
 	u8         log_max_sq[0x5];
-	u8         reserved_at_36f[0x3];
+	u8         reserved_at_370[0x3];
 	u8         log_max_tir[0x5];
-	u8         reserved_at_377[0x3];
+	u8         reserved_at_378[0x3];
 	u8         log_max_tis[0x5];
 
 	u8         basic_cyclic_rcv_wqe[0x1];
-	u8         reserved_at_380[0x2];
+	u8         reserved_at_381[0x2];
 	u8         log_max_rmp[0x5];
-	u8         reserved_at_387[0x3];
+	u8         reserved_at_388[0x3];
 	u8         log_max_rqt[0x5];
-	u8         reserved_at_38f[0x3];
+	u8         reserved_at_390[0x3];
 	u8         log_max_rqt_size[0x5];
-	u8         reserved_at_397[0x3];
+	u8         reserved_at_398[0x3];
 	u8         log_max_tis_per_sq[0x5];
 
-	u8         reserved_at_39f[0x3];
+	u8         reserved_at_3a0[0x3];
 	u8         log_max_stride_sz_rq[0x5];
-	u8         reserved_at_3a7[0x3];
+	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-	u8         reserved_at_3af[0x3];
+	u8         reserved_at_3b0[0x3];
 	u8         log_max_stride_sz_sq[0x5];
-	u8         reserved_at_3b7[0x3];
+	u8         reserved_at_3b8[0x3];
 	u8         log_min_stride_sz_sq[0x5];
 
-	u8         reserved_at_3bf[0x1b];
+	u8         reserved_at_3c0[0x1b];
 	u8         log_max_wq_sz[0x5];
 
 	u8         nic_vport_change_event[0x1];
-	u8         reserved_at_3e0[0xa];
+	u8         reserved_at_3e1[0xa];
 	u8         log_max_vlan_list[0x5];
-	u8         reserved_at_3ef[0x3];
+	u8         reserved_at_3f0[0x3];
 	u8         log_max_current_mc_list[0x5];
-	u8         reserved_at_3f7[0x3];
+	u8         reserved_at_3f8[0x3];
 	u8         log_max_current_uc_list[0x5];
 
-	u8         reserved_at_3ff[0x80];
+	u8         reserved_at_400[0x80];
 
-	u8         reserved_at_47f[0x3];
+	u8         reserved_at_480[0x3];
 	u8         log_max_l2_table[0x5];
-	u8         reserved_at_487[0x8];
+	u8         reserved_at_488[0x8];
 	u8         log_uar_page_sz[0x10];
 
-	u8         reserved_at_49f[0x20];
+	u8         reserved_at_4a0[0x20];
 	u8         device_frequency_mhz[0x20];
 	u8         device_frequency_khz[0x20];
-	u8         reserved_at_4ff[0x5f];
+
+	u8         reserved_at_500[0x80];
+
+	u8         reserved_at_580[0x3f];
 	u8         cqe_zip[0x1];
 
 	u8         cqe_zip_timeout[0x10];
 	u8         cqe_zip_max_num[0x10];
 
-	u8         reserved_at_57f[0x220];
+	u8         reserved_at_5e0[0x220];
 };
 
 enum mlx5_flow_destination_type {
-- 
cgit v1.2.3


From 7d5e14237a551a5de3d287f2e8db2d044ee81a1a Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 11 Apr 2016 23:10:22 +0300
Subject: net/mlx5: Update mlx5_ifc hardware features

Adding the needed mlx5_ifc hardware bits and structs
for the following feature:

* Add vport to steering commands for SRIOV ACL support
* Add mlcr, pcmr and mcia registers for dump module EEPROM
* Add support for FCS, baeacon led and disable_link bits to
  hca caps
* Add CQE period mode bit in  CQ context for CQE based CQ
  moderation support
* Add umr SQ bit for fragmented memory registration
* Add needed bits and caps for Striding RQ support

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 146 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c300e7491d80..4ce4ea422a10 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -513,7 +513,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         max_lso_cap[0x5];
 	u8         reserved_at_10[0x4];
 	u8         rss_ind_tbl_cap[0x4];
-	u8         reserved_at_18[0x3];
+	u8         reg_umr_sq[0x1];
+	u8         scatter_fcs[0x1];
+	u8         reserved_at_1a[0x1];
 	u8         tunnel_lso_const_out_ip_id[0x1];
 	u8         reserved_at_1c[0x2];
 	u8         tunnel_statless_gre[0x1];
@@ -648,7 +650,7 @@ struct mlx5_ifc_vector_calc_cap_bits {
 enum {
 	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
 	MLX5_WQ_TYPE_CYCLIC       = 0x1,
-	MLX5_WQ_TYPE_STRQ         = 0x2,
+	MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2,
 };
 
 enum {
@@ -753,7 +755,11 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   early_vf_enable[0x1];
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_at_1af[0x6];
+	u8         reserved_at_1af[0x2];
+	u8         ports_check[0x1];
+	u8         reserved_at_1b2[0x1];
+	u8         disable_link_up[0x1];
+	u8         beacon_led[0x1];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
@@ -778,7 +784,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
-	u8         reserved_at_200[0x3];
+	u8         striding_rq[0x1];
+	u8         reserved_at_201[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -807,12 +814,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         block_lb_mc[0x1];
 	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
-	u8         reserved_at_22a[0x1];
+	u8         cq_period_start_from_cqe[0x1];
 	u8         cd[0x1];
 	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
-	u8         reserved_at_22f[0x1];
+	u8         umr_ptr_rlky[0x1];
 	u8	   imaicl[0x1];
 	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
@@ -913,10 +920,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_500[0x80];
 
 	u8         reserved_at_580[0x3f];
-	u8         cqe_zip[0x1];
+	u8         cqe_compression[0x1];
 
-	u8         cqe_zip_timeout[0x10];
-	u8         cqe_zip_max_num[0x10];
+	u8         cqe_compression_timeout[0x10];
+	u8         cqe_compression_max_num[0x10];
 
 	u8         reserved_at_5e0[0x220];
 };
@@ -1000,7 +1007,13 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_118[0x3];
 	u8         log_wq_sz[0x5];
 
-	u8         reserved_at_120[0x4e0];
+	u8         reserved_at_120[0x15];
+	u8         log_wqe_num_of_strides[0x3];
+	u8         two_byte_shift_en[0x1];
+	u8         reserved_at_139[0x4];
+	u8         log_wqe_stride_size[0x3];
+
+	u8         reserved_at_140[0x4c0];
 
 	struct mlx5_ifc_cmd_pas_bits pas[0];
 };
@@ -2199,7 +2212,8 @@ struct mlx5_ifc_sqc_bits {
 	u8         flush_in_error_en[0x1];
 	u8         reserved_at_4[0x4];
 	u8         state[0x4];
-	u8         reserved_at_c[0x14];
+	u8         reg_umr[0x1];
+	u8         reserved_at_d[0x13];
 
 	u8         reserved_at_20[0x8];
 	u8         user_index[0x18];
@@ -2247,7 +2261,8 @@ enum {
 
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
-	u8         reserved_at_1[0x2];
+	u8         reserved_at_1[0x1];
+	u8         scatter_fcs[0x1];
 	u8         vsd[0x1];
 	u8         mem_rq_type[0x4];
 	u8         state[0x4];
@@ -2604,6 +2619,11 @@ enum {
 	MLX5_CQC_ST_FIRED                                 = 0xa,
 };
 
+enum {
+	MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+};
+
 struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
 	u8         reserved_at_4[0x4];
@@ -2612,8 +2632,8 @@ struct mlx5_ifc_cqc_bits {
 	u8         reserved_at_c[0x1];
 	u8         scqe_break_moderation_en[0x1];
 	u8         oi[0x1];
-	u8         reserved_at_f[0x2];
-	u8         cqe_zip_en[0x1];
+	u8         cq_period_mode[0x2];
+	u8         cqe_comp_en[0x1];
 	u8         mini_cqe_res_format[0x2];
 	u8         st[0x4];
 	u8         reserved_at_18[0x8];
@@ -2987,7 +3007,11 @@ struct mlx5_ifc_set_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5181,7 +5205,11 @@ struct mlx5_ifc_destroy_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5208,7 +5236,11 @@ struct mlx5_ifc_destroy_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5349,7 +5381,11 @@ struct mlx5_ifc_delete_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5795,7 +5831,11 @@ struct mlx5_ifc_create_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5839,7 +5879,11 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -6372,6 +6416,17 @@ struct mlx5_ifc_ptys_reg_bits {
 	u8         reserved_at_1a0[0x60];
 };
 
+struct mlx5_ifc_mlcr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x20];
+
+	u8         beacon_duration[0x10];
+	u8         reserved_at_40[0x10];
+
+	u8         beacon_remain[0x10];
+};
+
 struct mlx5_ifc_ptas_reg_bits {
 	u8         reserved_at_0[0x20];
 
@@ -6781,6 +6836,16 @@ struct mlx5_ifc_pamp_reg_bits {
 	u8         index_data[18][0x10];
 };
 
+struct mlx5_ifc_pcmr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x2e];
+	u8         fcs_cap[0x1];
+	u8         reserved_at_3f[0x1f];
+	u8         fcs_chk[0x1];
+	u8         reserved_at_5f[0x1];
+};
+
 struct mlx5_ifc_lane_2_module_mapping_bits {
 	u8         reserved_at_0[0x6];
 	u8         rx_lane[0x2];
@@ -7117,6 +7182,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pspa_reg_bits pspa_reg;
 	struct mlx5_ifc_ptas_reg_bits ptas_reg;
 	struct mlx5_ifc_ptys_reg_bits ptys_reg;
+	struct mlx5_ifc_mlcr_reg_bits mlcr_reg;
 	struct mlx5_ifc_pude_reg_bits pude_reg;
 	struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
 	struct mlx5_ifc_slrg_reg_bits slrg_reg;
@@ -7150,7 +7216,11 @@ struct mlx5_ifc_set_flow_table_root_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -7181,7 +7251,9 @@ struct mlx5_ifc_modify_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x20];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x10];
 	u8         modify_field_select[0x10];
@@ -7247,4 +7319,34 @@ struct mlx5_ifc_qtct_reg_bits {
 	u8         tclass[0x3];
 };
 
+struct mlx5_ifc_mcia_reg_bits {
+	u8         l[0x1];
+	u8         reserved_at_1[0x7];
+	u8         module[0x8];
+	u8         reserved_at_10[0x8];
+	u8         status[0x8];
+
+	u8         i2c_device_address[0x8];
+	u8         page_number[0x8];
+	u8         device_address[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         size[0x10];
+
+	u8         reserved_at_60[0x20];
+
+	u8         dword_0[0x20];
+	u8         dword_1[0x20];
+	u8         dword_2[0x20];
+	u8         dword_3[0x20];
+	u8         dword_4[0x20];
+	u8         dword_5[0x20];
+	u8         dword_6[0x20];
+	u8         dword_7[0x20];
+	u8         dword_8[0x20];
+	u8         dword_9[0x20];
+	u8         dword_10[0x20];
+	u8         dword_11[0x20];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 52c52a61a39fb319c14a582f8631619e5d5f55bf Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 14 Apr 2016 15:35:30 +0800
Subject: sctp: add sctp_info dump api for sctp_diag

sctp_diag will dump some important details of sctp's assoc or ep, we use
sctp_info to describe them,  sctp_get_sctp_info to get them, and export
it to sctp_diag.ko.

v2->v3:
- we will not use list_for_each_safe in sctp_get_sctp_info, cause
  all the callers of it will use lock_sock.

- fix the holes in struct sctp_info with __reserved* field.
  because sctp_diag is a new feature, and sctp_info is just for now,
  it may be changed in the future.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h    | 67 ++++++++++++++++++++++++++++++++++++++
 include/net/sctp/sctp.h |  3 ++
 net/sctp/socket.c       | 86 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 156 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index a9414fd49dc6..dacb5e711994 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -705,4 +705,71 @@ typedef struct sctp_auth_chunk {
 	sctp_authhdr_t auth_hdr;
 } __packed sctp_auth_chunk_t;
 
+struct sctp_info {
+	__u32	sctpi_tag;
+	__u32	sctpi_state;
+	__u32	sctpi_rwnd;
+	__u16	sctpi_unackdata;
+	__u16	sctpi_penddata;
+	__u16	sctpi_instrms;
+	__u16	sctpi_outstrms;
+	__u32	sctpi_fragmentation_point;
+	__u32	sctpi_inqueue;
+	__u32	sctpi_outqueue;
+	__u32	sctpi_overall_error;
+	__u32	sctpi_max_burst;
+	__u32	sctpi_maxseg;
+	__u32	sctpi_peer_rwnd;
+	__u32	sctpi_peer_tag;
+	__u8	sctpi_peer_capable;
+	__u8	sctpi_peer_sack;
+	__u16	__reserved1;
+
+	/* assoc status info */
+	__u64	sctpi_isacks;
+	__u64	sctpi_osacks;
+	__u64	sctpi_opackets;
+	__u64	sctpi_ipackets;
+	__u64	sctpi_rtxchunks;
+	__u64	sctpi_outofseqtsns;
+	__u64	sctpi_idupchunks;
+	__u64	sctpi_gapcnt;
+	__u64	sctpi_ouodchunks;
+	__u64	sctpi_iuodchunks;
+	__u64	sctpi_oodchunks;
+	__u64	sctpi_iodchunks;
+	__u64	sctpi_octrlchunks;
+	__u64	sctpi_ictrlchunks;
+
+	/* primary transport info */
+	struct sockaddr_storage	sctpi_p_address;
+	__s32	sctpi_p_state;
+	__u32	sctpi_p_cwnd;
+	__u32	sctpi_p_srtt;
+	__u32	sctpi_p_rto;
+	__u32	sctpi_p_hbinterval;
+	__u32	sctpi_p_pathmaxrxt;
+	__u32	sctpi_p_sackdelay;
+	__u32	sctpi_p_sackfreq;
+	__u32	sctpi_p_ssthresh;
+	__u32	sctpi_p_partial_bytes_acked;
+	__u32	sctpi_p_flight_size;
+	__u16	sctpi_p_error;
+	__u16	__reserved2;
+
+	/* sctp sock info */
+	__u32	sctpi_s_autoclose;
+	__u32	sctpi_s_adaptation_ind;
+	__u32	sctpi_s_pd_point;
+	__u8	sctpi_s_nodelay;
+	__u8	sctpi_s_disable_fragments;
+	__u8	sctpi_s_v4mapped;
+	__u8	sctpi_s_frag_interleave;
+};
+
+struct sctp_infox {
+	struct sctp_info *sctpinfo;
+	struct sctp_association *asoc;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 978d5f67d5a7..268b10058ef5 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,9 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info);
+
 /*
  * sctp/primitive.c
  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf265a4bba6e..cd0fb3bb493c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4202,6 +4202,92 @@ static void sctp_shutdown(struct sock *sk, int how)
 	}
 }
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info)
+{
+	struct sctp_transport *prim;
+	struct list_head *pos;
+	int mask;
+
+	memset(info, 0, sizeof(*info));
+	if (!asoc) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		info->sctpi_s_autoclose = sp->autoclose;
+		info->sctpi_s_adaptation_ind = sp->adaptation_ind;
+		info->sctpi_s_pd_point = sp->pd_point;
+		info->sctpi_s_nodelay = sp->nodelay;
+		info->sctpi_s_disable_fragments = sp->disable_fragments;
+		info->sctpi_s_v4mapped = sp->v4mapped;
+		info->sctpi_s_frag_interleave = sp->frag_interleave;
+
+		return 0;
+	}
+
+	info->sctpi_tag = asoc->c.my_vtag;
+	info->sctpi_state = asoc->state;
+	info->sctpi_rwnd = asoc->a_rwnd;
+	info->sctpi_unackdata = asoc->unack_data;
+	info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+	info->sctpi_instrms = asoc->c.sinit_max_instreams;
+	info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+	list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
+		info->sctpi_inqueue++;
+	list_for_each(pos, &asoc->outqueue.out_chunk_list)
+		info->sctpi_outqueue++;
+	info->sctpi_overall_error = asoc->overall_error_count;
+	info->sctpi_max_burst = asoc->max_burst;
+	info->sctpi_maxseg = asoc->frag_point;
+	info->sctpi_peer_rwnd = asoc->peer.rwnd;
+	info->sctpi_peer_tag = asoc->c.peer_vtag;
+
+	mask = asoc->peer.ecn_capable << 1;
+	mask = (mask | asoc->peer.ipv4_address) << 1;
+	mask = (mask | asoc->peer.ipv6_address) << 1;
+	mask = (mask | asoc->peer.hostname_address) << 1;
+	mask = (mask | asoc->peer.asconf_capable) << 1;
+	mask = (mask | asoc->peer.prsctp_capable) << 1;
+	mask = (mask | asoc->peer.auth_capable);
+	info->sctpi_peer_capable = mask;
+	mask = asoc->peer.sack_needed << 1;
+	mask = (mask | asoc->peer.sack_generation) << 1;
+	mask = (mask | asoc->peer.zero_window_announced);
+	info->sctpi_peer_sack = mask;
+
+	info->sctpi_isacks = asoc->stats.isacks;
+	info->sctpi_osacks = asoc->stats.osacks;
+	info->sctpi_opackets = asoc->stats.opackets;
+	info->sctpi_ipackets = asoc->stats.ipackets;
+	info->sctpi_rtxchunks = asoc->stats.rtxchunks;
+	info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
+	info->sctpi_idupchunks = asoc->stats.idupchunks;
+	info->sctpi_gapcnt = asoc->stats.gapcnt;
+	info->sctpi_ouodchunks = asoc->stats.ouodchunks;
+	info->sctpi_iuodchunks = asoc->stats.iuodchunks;
+	info->sctpi_oodchunks = asoc->stats.oodchunks;
+	info->sctpi_iodchunks = asoc->stats.iodchunks;
+	info->sctpi_octrlchunks = asoc->stats.octrlchunks;
+	info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
+
+	prim = asoc->peer.primary_path;
+	memcpy(&info->sctpi_p_address, &prim->ipaddr,
+	       sizeof(struct sockaddr_storage));
+	info->sctpi_p_state = prim->state;
+	info->sctpi_p_cwnd = prim->cwnd;
+	info->sctpi_p_srtt = prim->srtt;
+	info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
+	info->sctpi_p_hbinterval = prim->hbinterval;
+	info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
+	info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
+	info->sctpi_p_ssthresh = prim->ssthresh;
+	info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
+	info->sctpi_p_flight_size = prim->flight_size;
+	info->sctpi_p_error = prim->error_count;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
+
 /* 7.2.1 Association Status (SCTP_STATUS)
 
  * Applications can retrieve current status information about an
-- 
cgit v1.2.3


From c5cc2a0bc930f1ae00b198aeb752acc3bdd4d5a7 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Wed, 16 Mar 2016 21:54:55 +0200
Subject: clk: ti: dpll: add support for specifying max rate for DPLLs

DPLLs typically have a maximum rate they can support, and this varies
from DPLL to DPLL. Add support of the maximum rate value to the DPLL
data struct, and also add check for this in the DPLL round_rate function.

Signed-off-by: Tero Kristo <t-kristo@ti.com>
Reviewed-by: Nishanth Menon <nm@ti.com>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: Lokesh Vutla <lokeshvutla@ti.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/ti/clkt_dpll.c | 3 +++
 include/linux/clk/ti.h     | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c
index 032c658a5f5e..b919fdfe8256 100644
--- a/drivers/clk/ti/clkt_dpll.c
+++ b/drivers/clk/ti/clkt_dpll.c
@@ -301,6 +301,9 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
 
 	dd = clk->dpll_data;
 
+	if (dd->max_rate && target_rate > dd->max_rate)
+		target_rate = dd->max_rate;
+
 	ref_rate = clk_hw_get_rate(dd->clk_ref);
 	clk_name = clk_hw_get_name(hw);
 	pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n",
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index dc5164a6df29..6110fe09ed18 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -37,6 +37,7 @@
  * @last_rounded_n: cache of the last N result of omap2_dpll_round_rate()
  * @min_divider: minimum valid non-bypass divider value (actual)
  * @max_divider: maximum valid non-bypass divider value (actual)
+ * @max_rate: maximum clock rate for the DPLL
  * @modes: possible values of @enable_mask
  * @autoidle_reg: register containing the DPLL autoidle mode bitfield
  * @idlest_reg: register containing the DPLL idle status bitfield
@@ -81,6 +82,7 @@ struct dpll_data {
 	u8			last_rounded_n;
 	u8			min_divider;
 	u16			max_divider;
+	unsigned long		max_rate;
 	u8			modes;
 	void __iomem		*autoidle_reg;
 	void __iomem		*idlest_reg;
-- 
cgit v1.2.3


From af8a41271b56f6d79cb4d7c7f3ca688a2d97a801 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Fri, 15 Apr 2016 16:59:38 +0200
Subject: iio:adis: Add support for manual self-test flag clear

Some variants of the devices from the ADIS family don't auto-clear the
self-test bit after the self-test has completed. Instead we have to
manually clear. Add support for this to the ADIS library.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/imu/adis.c       | 7 ++++++-
 include/linux/iio/imu/adis.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iio/imu/adis.c b/drivers/iio/imu/adis.c
index 911255d41c1a..ad6f91d06185 100644
--- a/drivers/iio/imu/adis.c
+++ b/drivers/iio/imu/adis.c
@@ -324,7 +324,12 @@ static int adis_self_test(struct adis *adis)
 
 	msleep(adis->data->startup_delay);
 
-	return adis_check_status(adis);
+	ret = adis_check_status(adis);
+
+	if (adis->data->self_test_no_autoclear)
+		adis_write_reg_16(adis, adis->data->msc_ctrl_reg, 0x00);
+
+	return ret;
 }
 
 /**
diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h
index fa2d01ef8f55..360da7d18a3d 100644
--- a/include/linux/iio/imu/adis.h
+++ b/include/linux/iio/imu/adis.h
@@ -41,6 +41,7 @@ struct adis_data {
 	unsigned int diag_stat_reg;
 
 	unsigned int self_test_mask;
+	bool self_test_no_autoclear;
 	unsigned int startup_delay;
 
 	const char * const *status_error_msgs;
-- 
cgit v1.2.3


From 756ca874417695f77941948a77e9b8562635cc0a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Thu, 14 Apr 2016 17:04:34 -0400
Subject: netdev_features: Add NETIF_F_TSO_MANGLEID to NETIF_F_ALL_TSO

I realized that when I added NETIF_F_TSO_MANGLEID as a TSO type I forgot to
add it to NETIF_F_ALL_TSO.  This patch corrects that so the flag will be
included correctly.

The result should be minor as it was only used by a few drivers and in a
few specific cases such as when NETIF_F_SG was not supported on a device so
the TSO flags were cleared.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9fc79df0e561..15eb0b12fff9 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -164,7 +164,8 @@ enum {
 #define NETIF_F_CSUM_MASK	(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
 				 NETIF_F_HW_CSUM)
 
-#define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | \
+				 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)
 
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
-- 
cgit v1.2.3


From 6d62b4d5fac620ee0ca65dc6d99b0306d96bc541 Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Fri, 15 Apr 2016 00:34:59 +0200
Subject: net: ethtool: export conversion function between u32 and link mode

The function convert_legacy_u32_to_link_mode and
convert_link_mode_to_legacy_u32 may be used outside
of ethtool.c. We rename them to ethtool_convert_...
and export them, so we could use them in others
drivers and modules.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  7 +++++++
 net/core/ethtool.c      | 21 ++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index e2b7bf27c03e..9ded8c6d8176 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -150,6 +150,13 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+					     u32 legacy_u32);
+
+/* return false if src had higher bits set. lower bits always updated. */
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+				     const unsigned long *src);
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e0cf20a3b3dd..bdb4013581b1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -391,15 +391,17 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
-static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+					     u32 legacy_u32)
 {
 	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
 	dst[0] = legacy_u32;
 }
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
 
 /* return false if src had higher bits set. lower bits always updated. */
-static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
-					    const unsigned long *src)
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					     const unsigned long *src)
 {
 	bool retval = true;
 
@@ -419,6 +421,7 @@ static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 	*legacy_u32 = src[0];
 	return retval;
 }
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
 
 /* return false if legacy contained non-0 deprecated fields
  * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
@@ -441,13 +444,13 @@ convert_legacy_settings_to_link_ksettings(
 	    legacy_settings->maxrxpkt)
 		retval = false;
 
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.supported,
 		legacy_settings->supported);
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.advertising,
 		legacy_settings->advertising);
-	convert_legacy_u32_to_link_mode(
+	ethtool_convert_legacy_u32_to_link_mode(
 		link_ksettings->link_modes.lp_advertising,
 		legacy_settings->lp_advertising);
 	link_ksettings->base.speed
@@ -486,13 +489,13 @@ convert_link_ksettings_to_legacy_settings(
 	 * __u32	maxrxpkt;
 	 */
 
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->supported,
 		link_ksettings->link_modes.supported);
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->advertising,
 		link_ksettings->link_modes.advertising);
-	retval &= convert_link_mode_to_legacy_u32(
+	retval &= ethtool_convert_link_mode_to_legacy_u32(
 		&legacy_settings->lp_advertising,
 		link_ksettings->link_modes.lp_advertising);
 	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
-- 
cgit v1.2.3


From 2d55173e71b06c5a369489852d972304e14189fd Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Fri, 15 Apr 2016 00:35:00 +0200
Subject: phy: add generic function to support ksetting support

The old ethtool api (get_setting and set_setting) has
generic phy functions phy_ethtool_sset and phy_ethtool_gset.
To supprt the new ethtool api (get_link_ksettings and
set_link_ksettings), we add generic phy function
phy_ethtool_ksettings_get and phy_ethtool_ksettings_set.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h   |  4 +++
 2 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 5590b9c182c9..6f221c8c2a7f 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -362,6 +362,60 @@ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(phy_ethtool_sset);
 
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+			      const struct ethtool_link_ksettings *cmd)
+{
+	u8 autoneg = cmd->base.autoneg;
+	u8 duplex = cmd->base.duplex;
+	u32 speed = cmd->base.speed;
+	u32 advertising;
+
+	if (cmd->base.phy_address != phydev->mdio.addr)
+		return -EINVAL;
+
+	ethtool_convert_link_mode_to_legacy_u32(&advertising,
+						cmd->link_modes.advertising);
+
+	/* We make sure that we don't pass unsupported values in to the PHY */
+	advertising &= phydev->supported;
+
+	/* Verify the settings we care about. */
+	if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE)
+		return -EINVAL;
+
+	if (autoneg == AUTONEG_ENABLE && advertising == 0)
+		return -EINVAL;
+
+	if (autoneg == AUTONEG_DISABLE &&
+	    ((speed != SPEED_1000 &&
+	      speed != SPEED_100 &&
+	      speed != SPEED_10) ||
+	     (duplex != DUPLEX_HALF &&
+	      duplex != DUPLEX_FULL)))
+		return -EINVAL;
+
+	phydev->autoneg = autoneg;
+
+	phydev->speed = speed;
+
+	phydev->advertising = advertising;
+
+	if (autoneg == AUTONEG_ENABLE)
+		phydev->advertising |= ADVERTISED_Autoneg;
+	else
+		phydev->advertising &= ~ADVERTISED_Autoneg;
+
+	phydev->duplex = duplex;
+
+	phydev->mdix = cmd->base.eth_tp_mdix_ctrl;
+
+	/* Restart the PHY */
+	phy_start_aneg(phydev);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_set);
+
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 {
 	cmd->supported = phydev->supported;
@@ -385,6 +439,33 @@ int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(phy_ethtool_gset);
 
+int phy_ethtool_ksettings_get(struct phy_device *phydev,
+			      struct ethtool_link_ksettings *cmd)
+{
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+						phydev->supported);
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+						phydev->advertising);
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising,
+						phydev->lp_advertising);
+
+	cmd->base.speed = phydev->speed;
+	cmd->base.duplex = phydev->duplex;
+	if (phydev->interface == PHY_INTERFACE_MODE_MOCA)
+		cmd->base.port = PORT_BNC;
+	else
+		cmd->base.port = PORT_MII;
+
+	cmd->base.phy_address = phydev->mdio.addr;
+	cmd->base.autoneg = phydev->autoneg;
+	cmd->base.eth_tp_mdix_ctrl = phydev->mdix;
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_ksettings_get);
+
 /**
  * phy_mii_ioctl - generic PHY MII ioctl interface
  * @phydev: the phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 2abd7918f64f..be3f83bbdc0b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -805,6 +805,10 @@ void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
+int phy_ethtool_ksettings_get(struct phy_device *phydev,
+			      struct ethtool_link_ksettings *cmd);
+int phy_ethtool_ksettings_set(struct phy_device *phydev,
+			      const struct ethtool_link_ksettings *cmd);
 int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
-- 
cgit v1.2.3


From 20147f0d4f50f6f0d1fbe1815fe3d4d0a6444a70 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Tue, 29 Mar 2016 17:22:26 +0800
Subject: mfd: axp20x: Add support for AXP809 PMIC

The X-Powers AXP809 is a new PMIC that is paired with Allwinner's A80
SoC, along with a slave AXP806 PMIC.

This PMIC is quite similar to the earlier AXP223, though the interrupts
and regulator have changed a bit.

This patch adds support for the interrupts and power button of the PMIC.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x-rsb.c   |  1 +
 drivers/mfd/axp20x.c       | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h | 59 ++++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c
index 28c20247c112..a407527bcd09 100644
--- a/drivers/mfd/axp20x-rsb.c
+++ b/drivers/mfd/axp20x-rsb.c
@@ -61,6 +61,7 @@ static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
 
 static const struct of_device_id axp20x_rsb_of_match[] = {
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
+	{ .compatible = "x-powers,axp809", .data = (void *)AXP809_ID },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, axp20x_rsb_of_match);
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index a57d6e940610..1ce923277cc8 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -37,6 +37,7 @@ static const char * const axp20x_model_names[] = {
 	"AXP221",
 	"AXP223",
 	"AXP288",
+	"AXP809",
 };
 
 static const struct regmap_range axp152_writeable_ranges[] = {
@@ -85,6 +86,7 @@ static const struct regmap_access_table axp20x_volatile_table = {
 	.n_yes_ranges	= ARRAY_SIZE(axp20x_volatile_ranges),
 };
 
+/* AXP22x ranges are shared with the AXP809, as they cover the same range */
 static const struct regmap_range axp22x_writeable_ranges[] = {
 	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE),
 	regmap_reg_range(AXP20X_DCDC_MODE, AXP22X_BATLOW_THRES1),
@@ -211,6 +213,20 @@ static struct resource axp288_fuel_gauge_resources[] = {
 	},
 };
 
+static struct resource axp809_pek_resources[] = {
+	{
+		.name   = "PEK_DBR",
+		.start  = AXP809_IRQ_PEK_RIS_EDGE,
+		.end    = AXP809_IRQ_PEK_RIS_EDGE,
+		.flags  = IORESOURCE_IRQ,
+	}, {
+		.name   = "PEK_DBF",
+		.start  = AXP809_IRQ_PEK_FAL_EDGE,
+		.end    = AXP809_IRQ_PEK_FAL_EDGE,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
 static const struct regmap_config axp152_regmap_config = {
 	.reg_bits	= 8,
 	.val_bits	= 8,
@@ -378,6 +394,41 @@ static const struct regmap_irq axp288_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP288, BC_USB_CHNG,            5, 1),
 };
 
+static const struct regmap_irq axp809_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP809, ACIN_OVER_V,		0, 7),
+	INIT_REGMAP_IRQ(AXP809, ACIN_PLUGIN,		0, 6),
+	INIT_REGMAP_IRQ(AXP809, ACIN_REMOVAL,	        0, 5),
+	INIT_REGMAP_IRQ(AXP809, VBUS_OVER_V,		0, 4),
+	INIT_REGMAP_IRQ(AXP809, VBUS_PLUGIN,		0, 3),
+	INIT_REGMAP_IRQ(AXP809, VBUS_REMOVAL,	        0, 2),
+	INIT_REGMAP_IRQ(AXP809, VBUS_V_LOW,		0, 1),
+	INIT_REGMAP_IRQ(AXP809, BATT_PLUGIN,		1, 7),
+	INIT_REGMAP_IRQ(AXP809, BATT_REMOVAL,	        1, 6),
+	INIT_REGMAP_IRQ(AXP809, BATT_ENT_ACT_MODE,	1, 5),
+	INIT_REGMAP_IRQ(AXP809, BATT_EXIT_ACT_MODE,	1, 4),
+	INIT_REGMAP_IRQ(AXP809, CHARG,		        1, 3),
+	INIT_REGMAP_IRQ(AXP809, CHARG_DONE,		1, 2),
+	INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH,	2, 7),
+	INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH_END,	2, 6),
+	INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW,	2, 5),
+	INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW_END,	2, 4),
+	INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH,	2, 3),
+	INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH_END,	2, 2),
+	INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW,	2, 1),
+	INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW_END,	2, 0),
+	INIT_REGMAP_IRQ(AXP809, DIE_TEMP_HIGH,	        3, 7),
+	INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL1,	        3, 1),
+	INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL2,	        3, 0),
+	INIT_REGMAP_IRQ(AXP809, TIMER,		        4, 7),
+	INIT_REGMAP_IRQ(AXP809, PEK_RIS_EDGE,	        4, 6),
+	INIT_REGMAP_IRQ(AXP809, PEK_FAL_EDGE,	        4, 5),
+	INIT_REGMAP_IRQ(AXP809, PEK_SHORT,		4, 4),
+	INIT_REGMAP_IRQ(AXP809, PEK_LONG,		4, 3),
+	INIT_REGMAP_IRQ(AXP809, PEK_OVER_OFF,		4, 2),
+	INIT_REGMAP_IRQ(AXP809, GPIO1_INPUT,		4, 1),
+	INIT_REGMAP_IRQ(AXP809, GPIO0_INPUT,		4, 0),
+};
+
 static const struct regmap_irq_chip axp152_regmap_irq_chip = {
 	.name			= "axp152_irq_chip",
 	.status_base		= AXP152_IRQ1_STATE,
@@ -428,6 +479,18 @@ static const struct regmap_irq_chip axp288_regmap_irq_chip = {
 
 };
 
+static const struct regmap_irq_chip axp809_regmap_irq_chip = {
+	.name			= "axp809",
+	.status_base		= AXP20X_IRQ1_STATE,
+	.ack_base		= AXP20X_IRQ1_STATE,
+	.mask_base		= AXP20X_IRQ1_EN,
+	.mask_invert		= true,
+	.init_ack_masked	= true,
+	.irqs			= axp809_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp809_regmap_irqs),
+	.num_regs		= 5,
+};
+
 static struct mfd_cell axp20x_cells[] = {
 	{
 		.name		= "axp20x-pek",
@@ -572,6 +635,16 @@ static struct mfd_cell axp288_cells[] = {
 	},
 };
 
+static struct mfd_cell axp809_cells[] = {
+	{
+		.name			= "axp20x-pek",
+		.num_resources		= ARRAY_SIZE(axp809_pek_resources),
+		.resources		= axp809_pek_resources,
+	}, {
+		.name			= "axp20x-regulator",
+	},
+};
+
 static struct axp20x_dev *axp20x_pm_power_off;
 static void axp20x_power_off(void)
 {
@@ -631,6 +704,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->regmap_cfg = &axp288_regmap_config;
 		axp20x->regmap_irq_chip = &axp288_regmap_irq_chip;
 		break;
+	case AXP809_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp809_cells);
+		axp20x->cells = axp809_cells;
+		axp20x->regmap_cfg = &axp22x_regmap_config;
+		axp20x->regmap_irq_chip = &axp809_regmap_irq_chip;
+		break;
 	default:
 		dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant);
 		return -EINVAL;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index d82e7d51372b..0be4982f08fe 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -20,6 +20,7 @@ enum {
 	AXP221_ID,
 	AXP223_ID,
 	AXP288_ID,
+	AXP809_ID,
 	NR_AXP20X_VARIANTS,
 };
 
@@ -264,6 +265,29 @@ enum {
 	AXP22X_REG_ID_MAX,
 };
 
+enum {
+	AXP809_DCDC1 = 0,
+	AXP809_DCDC2,
+	AXP809_DCDC3,
+	AXP809_DCDC4,
+	AXP809_DCDC5,
+	AXP809_DC1SW,
+	AXP809_DC5LDO,
+	AXP809_ALDO1,
+	AXP809_ALDO2,
+	AXP809_ALDO3,
+	AXP809_ELDO1,
+	AXP809_ELDO2,
+	AXP809_ELDO3,
+	AXP809_DLDO1,
+	AXP809_DLDO2,
+	AXP809_RTC_LDO,
+	AXP809_LDO_IO0,
+	AXP809_LDO_IO1,
+	AXP809_SW,
+	AXP809_REG_ID_MAX,
+};
+
 /* IRQs */
 enum {
 	AXP152_IRQ_LDO0IN_CONNECT = 1,
@@ -390,6 +414,41 @@ enum axp288_irqs {
 	AXP288_IRQ_BC_USB_CHNG,
 };
 
+enum axp809_irqs {
+	AXP809_IRQ_ACIN_OVER_V = 1,
+	AXP809_IRQ_ACIN_PLUGIN,
+	AXP809_IRQ_ACIN_REMOVAL,
+	AXP809_IRQ_VBUS_OVER_V,
+	AXP809_IRQ_VBUS_PLUGIN,
+	AXP809_IRQ_VBUS_REMOVAL,
+	AXP809_IRQ_VBUS_V_LOW,
+	AXP809_IRQ_BATT_PLUGIN,
+	AXP809_IRQ_BATT_REMOVAL,
+	AXP809_IRQ_BATT_ENT_ACT_MODE,
+	AXP809_IRQ_BATT_EXIT_ACT_MODE,
+	AXP809_IRQ_CHARG,
+	AXP809_IRQ_CHARG_DONE,
+	AXP809_IRQ_BATT_CHG_TEMP_HIGH,
+	AXP809_IRQ_BATT_CHG_TEMP_HIGH_END,
+	AXP809_IRQ_BATT_CHG_TEMP_LOW,
+	AXP809_IRQ_BATT_CHG_TEMP_LOW_END,
+	AXP809_IRQ_BATT_ACT_TEMP_HIGH,
+	AXP809_IRQ_BATT_ACT_TEMP_HIGH_END,
+	AXP809_IRQ_BATT_ACT_TEMP_LOW,
+	AXP809_IRQ_BATT_ACT_TEMP_LOW_END,
+	AXP809_IRQ_DIE_TEMP_HIGH,
+	AXP809_IRQ_LOW_PWR_LVL1,
+	AXP809_IRQ_LOW_PWR_LVL2,
+	AXP809_IRQ_TIMER,
+	AXP809_IRQ_PEK_RIS_EDGE,
+	AXP809_IRQ_PEK_FAL_EDGE,
+	AXP809_IRQ_PEK_SHORT,
+	AXP809_IRQ_PEK_LONG,
+	AXP809_IRQ_PEK_OVER_OFF,
+	AXP809_IRQ_GPIO1_INPUT,
+	AXP809_IRQ_GPIO0_INPUT,
+};
+
 #define AXP288_TS_ADC_H		0x58
 #define AXP288_TS_ADC_L		0x59
 #define AXP288_GP_ADC_H		0x5a
-- 
cgit v1.2.3


From a8f447be8056d9ce17bf7757d6de79426700bb8b Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Fri, 8 Apr 2016 00:12:55 +0530
Subject: mfd: Add resource managed APIs for mfd_add_devices

Add resource managed API devm_mfd_add_devices() for the mfd_add_devices().

This helps in reducing code in error path as it is not required
to call mfd_remove_devices() explicitly to remove all child-devices.
In some cases, it also helps not to implement .remove() callback
which get called during driver unbind.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mfd-core.c   | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/core.h |  4 ++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index 409da01effcd..4b4c1d4f3280 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -334,6 +334,44 @@ void mfd_remove_devices(struct device *parent)
 }
 EXPORT_SYMBOL(mfd_remove_devices);
 
+static void devm_mfd_dev_release(struct device *dev, void *res)
+{
+	mfd_remove_devices(dev);
+}
+
+/**
+ * devm_mfd_add_devices - Resource managed version of mfd_add_devices()
+ *
+ * Returns 0 on success or an appropriate negative error number on failure.
+ * All child-devices of the MFD will automatically be removed when it gets
+ * unbinded.
+ */
+int devm_mfd_add_devices(struct device *dev, int id,
+			 const struct mfd_cell *cells, int n_devs,
+			 struct resource *mem_base,
+			 int irq_base, struct irq_domain *domain)
+{
+	struct device **ptr;
+	int ret;
+
+	ptr = devres_alloc(devm_mfd_dev_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	ret = mfd_add_devices(dev, id, cells, n_devs, mem_base,
+			      irq_base, domain);
+	if (ret < 0) {
+		devres_free(ptr);
+		return ret;
+	}
+
+	*ptr = dev;
+	devres_add(dev, ptr);
+
+	return ret;
+}
+EXPORT_SYMBOL(devm_mfd_add_devices);
+
 int mfd_clone_cell(const char *cell, const char **clones, size_t n_clones)
 {
 	struct mfd_cell cell_entry;
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index bc6f7e00fb3d..4a0268afe546 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -131,4 +131,8 @@ static inline int mfd_add_hotplug_devices(struct device *parent,
 
 extern void mfd_remove_devices(struct device *parent);
 
+extern int devm_mfd_add_devices(struct device *dev, int id,
+				const struct mfd_cell *cells, int n_devs,
+				struct resource *mem_base,
+				int irq_base, struct irq_domain *irq_domain);
 #endif
-- 
cgit v1.2.3


From 679ca39fc670a5a95c2b40d2cc8cf2cee2486f7a Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Mon, 18 Apr 2016 16:53:39 +0900
Subject: usb: gadget: udc: core: add usb_gadget_{un}map_request_by_dev()

If the following environment, the first argument of DMA API should
be set to a DMAC's device structure, not a udc controller's one.
 - A udc controller needs an external DMAC device (like a DMA Engine).
 - The external DMAC enables IOMMU.

So, this patch add usb_gadget_{un}map_request_by_dev() API to set
a DMAC's device structure by a udc controller driver.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/usb/gadget/udc/udc-core.c | 24 ++++++++++++++++++------
 include/linux/usb/gadget.h        |  4 ++++
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/udc-core.c b/drivers/usb/gadget/udc/udc-core.c
index c6e76465065a..6e8300d6a737 100644
--- a/drivers/usb/gadget/udc/udc-core.c
+++ b/drivers/usb/gadget/udc/udc-core.c
@@ -61,11 +61,9 @@ static int udc_bind_to_driver(struct usb_udc *udc,
 
 #ifdef	CONFIG_HAS_DMA
 
-int usb_gadget_map_request(struct usb_gadget *gadget,
+int usb_gadget_map_request_by_dev(struct device *dev,
 		struct usb_request *req, int is_in)
 {
-	struct device *dev = gadget->dev.parent;
-
 	if (req->length == 0)
 		return 0;
 
@@ -92,24 +90,38 @@ int usb_gadget_map_request(struct usb_gadget *gadget,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev);
+
+int usb_gadget_map_request(struct usb_gadget *gadget,
+		struct usb_request *req, int is_in)
+{
+	return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in);
+}
 EXPORT_SYMBOL_GPL(usb_gadget_map_request);
 
-void usb_gadget_unmap_request(struct usb_gadget *gadget,
+void usb_gadget_unmap_request_by_dev(struct device *dev,
 		struct usb_request *req, int is_in)
 {
 	if (req->length == 0)
 		return;
 
 	if (req->num_mapped_sgs) {
-		dma_unmap_sg(gadget->dev.parent, req->sg, req->num_mapped_sgs,
+		dma_unmap_sg(dev, req->sg, req->num_mapped_sgs,
 				is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
 		req->num_mapped_sgs = 0;
 	} else {
-		dma_unmap_single(gadget->dev.parent, req->dma, req->length,
+		dma_unmap_single(dev, req->dma, req->length,
 				is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	}
 }
+EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev);
+
+void usb_gadget_unmap_request(struct usb_gadget *gadget,
+		struct usb_request *req, int is_in)
+{
+	usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in);
+}
 EXPORT_SYMBOL_GPL(usb_gadget_unmap_request);
 
 #endif	/* CONFIG_HAS_DMA */
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 5d4e151c49bf..457651bf45b0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -1223,9 +1223,13 @@ int usb_otg_descriptor_init(struct usb_gadget *gadget,
 
 /* utility to simplify map/unmap of usb_requests to/from DMA */
 
+extern int usb_gadget_map_request_by_dev(struct device *dev,
+		struct usb_request *req, int is_in);
 extern int usb_gadget_map_request(struct usb_gadget *gadget,
 		struct usb_request *req, int is_in);
 
+extern void usb_gadget_unmap_request_by_dev(struct device *dev,
+		struct usb_request *req, int is_in);
 extern void usb_gadget_unmap_request(struct usb_gadget *gadget,
 		struct usb_request *req, int is_in);
 
-- 
cgit v1.2.3


From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:42 -0600
Subject: tracing: Add 'hist' event trigger command

'hist' triggers allow users to continually aggregate trace events,
which can then be viewed afterwards by simply reading a 'hist' file
containing the aggregation in a human-readable format.

The basic idea is very simple and boils down to a mechanism whereby
trace events, rather than being exhaustively dumped in raw form and
viewed directly, are automatically 'compressed' into meaningful tables
completely defined by the user.

This is done strictly via single-line command-line commands and
without the aid of any kind of programming language or interpreter.

A surprising number of typical use cases can be accomplished by users
via this simple mechanism.  In fact, a large number of the tasks that
users typically do using the more complicated script-based tracing
tools, at least during the initial stages of an investigation, can be
accomplished by simply specifying a set of keys and values to be used
in the creation of a hash table.

The Linux kernel trace event subsystem happens to provide an extensive
list of keys and values ready-made for such a purpose in the form of
the event format files associated with each trace event.  By simply
consulting the format file for field names of interest and by plugging
them into the hist trigger command, users can create an endless number
of useful aggregations to help with investigating various properties
of the system.  See Documentation/trace/events.txt for examples.

hist triggers are implemented on top of the existing event trigger
infrastructure, and as such are consistent with the existing triggers
from a user's perspective as well.

The basic syntax follows the existing trigger syntax.  Users start an
aggregation by writing a 'hist' trigger to the event of interest's
trigger file:

  # echo hist:keys=xxx [ if filter] > event/trigger

Once a hist trigger has been set up, by default it continually
aggregates every matching event into a hash table using the event key
and a value field named 'hitcount'.

To view the aggregation at any point in time, simply read the 'hist'
file in the same directory as the 'trigger' file:

  # cat event/hist

The detailed syntax provides additional options for user control, and
is described exhaustively in Documentation/trace/events.txt and in the
virtual tracing/README file in the tracing subsystem.

Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |   1 +
 kernel/trace/Kconfig                |  16 +
 kernel/trace/Makefile               |   1 +
 kernel/trace/trace.c                |  17 +
 kernel/trace/trace.h                |   7 +
 kernel/trace/trace_events.c         |   4 +
 kernel/trace/trace_events_hist.c    | 849 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |   1 +
 8 files changed, 896 insertions(+)
 create mode 100644 kernel/trace/trace_events_hist.c

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..404603720650 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -407,6 +407,7 @@ enum event_trigger_type {
 	ETT_SNAPSHOT		= (1 << 1),
 	ETT_STACKTRACE		= (1 << 2),
 	ETT_EVENT_ENABLE	= (1 << 3),
+	ETT_EVENT_HIST		= (1 << 4),
 };
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d39556fd863a..fafeaf803bd0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -538,6 +538,22 @@ config TRACING_MAP
 	  generally used outside of that context, and is normally
 	  selected by tracers that use it.
 
+config HIST_TRIGGERS
+	bool "Histogram triggers"
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select TRACING_MAP
+	default n
+	help
+	  Hist triggers allow one or more arbitrary trace event fields
+	  to be aggregated into hash tables and dumped to stdout by
+	  reading a debugfs/tracefs file.  They're useful for
+	  gathering quick and dirty (though precise) summaries of
+	  event activity as an initial guide for further investigation
+	  using more advanced tools.
+
+	  See Documentation/trace/events.txt.
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4255c4057aaa..979e7bfbde7a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
 obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0d12dbde8399..6cf8fd03b028 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3812,6 +3812,9 @@ static const char readme_msg[] =
 #endif
 #ifdef CONFIG_TRACER_SNAPSHOT
 	"\t\t    snapshot\n"
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t\t    hist (see below)\n"
 #endif
 	"\t   example: echo traceoff > events/block/block_unplug/trigger\n"
 	"\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
@@ -3828,6 +3831,20 @@ static const char readme_msg[] =
 	"\t   To remove a trigger with a count:\n"
 	"\t     echo '!<trigger>:0 > <system>/<event>/trigger\n"
 	"\t   Filters can be ignored when removing a trigger.\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
+	"\t    Format: hist:keys=<field1>\n"
+	"\t            [:size=#entries]\n"
+	"\t            [if <filter>]\n\n"
+	"\t    When a matching event is hit, an entry is added to a hash\n"
+	"\t    table using the key named, and the value of a sum called\n"
+	"\t    'hitcount' is incremented.  Keys correspond to fields in the\n"
+	"\t    event's format description.  Keys can be any field.  The\n"
+	"\t    'size' parameter can be  used to specify more or fewer than\n"
+	"\t    the default 2048 entries for the hashtable size.\n\n"
+	"\t    Reading the 'hist' file for the event will dump the hash\n"
+	"\t    table in its entirety to stdout."
+#endif
 ;
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2525042760e6..505f8a45f426 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1162,6 +1162,13 @@ extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_hist_fops;
+
+#ifdef CONFIG_HIST_TRIGGERS
+extern int register_trigger_hist_cmd(void);
+#else
+static inline int register_trigger_hist_cmd(void) { return 0; }
+#endif
 
 extern int register_trigger_cmds(void);
 extern void clear_event_triggers(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index add81dff7520..e7cb983ee93c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	trace_create_file("trigger", 0644, file->dir, file,
 			  &event_trigger_fops);
 
+#ifdef CONFIG_HIST_TRIGGERS
+	trace_create_file("hist", 0444, file->dir, file,
+			  &event_hist_fops);
+#endif
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
new file mode 100644
index 000000000000..23b45e462117
--- /dev/null
+++ b/kernel/trace/trace_events_hist.c
@@ -0,0 +1,849 @@
+/*
+ * trace_events_hist - trace event hist triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+struct hist_field;
+
+typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+
+struct hist_field {
+	struct ftrace_event_field	*field;
+	unsigned long			flags;
+	hist_field_fn_t			fn;
+	unsigned int			size;
+};
+
+static u64 hist_field_counter(struct hist_field *field, void *event)
+{
+	return 1;
+}
+
+static u64 hist_field_string(struct hist_field *hist_field, void *event)
+{
+	char *addr = (char *)(event + hist_field->field->offset);
+
+	return (u64)(unsigned long)addr;
+}
+
+#define DEFINE_HIST_FIELD_FN(type)					\
+static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+{									\
+	type *addr = (type *)(event + hist_field->field->offset);	\
+									\
+	return (u64)*addr;						\
+}
+
+DEFINE_HIST_FIELD_FN(s64);
+DEFINE_HIST_FIELD_FN(u64);
+DEFINE_HIST_FIELD_FN(s32);
+DEFINE_HIST_FIELD_FN(u32);
+DEFINE_HIST_FIELD_FN(s16);
+DEFINE_HIST_FIELD_FN(u16);
+DEFINE_HIST_FIELD_FN(s8);
+DEFINE_HIST_FIELD_FN(u8);
+
+#define for_each_hist_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_fields; (i)++)
+
+#define for_each_hist_val_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_vals; (i)++)
+
+#define for_each_hist_key_field(i, hist_data)	\
+	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
+
+#define HITCOUNT_IDX		0
+#define HIST_KEY_MAX		1
+#define HIST_KEY_SIZE_MAX	MAX_FILTER_STR_VAL
+
+enum hist_field_flags {
+	HIST_FIELD_FL_HITCOUNT	= 1,
+	HIST_FIELD_FL_KEY	= 2,
+	HIST_FIELD_FL_STRING	= 4,
+};
+
+struct hist_trigger_attrs {
+	char		*keys_str;
+	unsigned int	map_bits;
+};
+
+struct hist_trigger_data {
+	struct hist_field               *fields[TRACING_MAP_FIELDS_MAX];
+	unsigned int			n_vals;
+	unsigned int			n_keys;
+	unsigned int			n_fields;
+	unsigned int			key_size;
+	struct tracing_map_sort_key	sort_keys[TRACING_MAP_SORT_KEYS_MAX];
+	unsigned int			n_sort_keys;
+	struct trace_event_file		*event_file;
+	struct hist_trigger_attrs	*attrs;
+	struct tracing_map		*map;
+};
+
+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+{
+	hist_field_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = hist_field_s64;
+		else
+			fn = hist_field_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = hist_field_s32;
+		else
+			fn = hist_field_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = hist_field_s16;
+		else
+			fn = hist_field_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = hist_field_s8;
+		else
+			fn = hist_field_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int parse_map_size(char *str)
+{
+	unsigned long size, map_bits;
+	int ret;
+
+	strsep(&str, "=");
+	if (!str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kstrtoul(str, 0, &size);
+	if (ret)
+		goto out;
+
+	map_bits = ilog2(roundup_pow_of_two(size));
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		ret = -EINVAL;
+	else
+		ret = map_bits;
+ out:
+	return ret;
+}
+
+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
+{
+	if (!attrs)
+		return;
+
+	kfree(attrs->keys_str);
+	kfree(attrs);
+}
+
+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+{
+	struct hist_trigger_attrs *attrs;
+	int ret = 0;
+
+	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return ERR_PTR(-ENOMEM);
+
+	while (trigger_str) {
+		char *str = strsep(&trigger_str, ":");
+
+		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+		    (strncmp(str, "keys=", strlen("keys=")) == 0))
+			attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "size=", strlen("size=")) == 0) {
+			int map_bits = parse_map_size(str);
+
+			if (map_bits < 0) {
+				ret = map_bits;
+				goto free;
+			}
+			attrs->map_bits = map_bits;
+		} else {
+			ret = -EINVAL;
+			goto free;
+		}
+	}
+
+	if (!attrs->keys_str) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	return attrs;
+ free:
+	destroy_hist_trigger_attrs(attrs);
+
+	return ERR_PTR(ret);
+}
+
+static void destroy_hist_field(struct hist_field *hist_field)
+{
+	kfree(hist_field);
+}
+
+static struct hist_field *create_hist_field(struct ftrace_event_field *field,
+					    unsigned long flags)
+{
+	struct hist_field *hist_field;
+
+	if (field && is_function_field(field))
+		return NULL;
+
+	hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+	if (!hist_field)
+		return NULL;
+
+	if (flags & HIST_FIELD_FL_HITCOUNT) {
+		hist_field->fn = hist_field_counter;
+		goto out;
+	}
+
+	if (is_string_field(field)) {
+		flags |= HIST_FIELD_FL_STRING;
+		hist_field->fn = hist_field_string;
+	} else {
+		hist_field->fn = select_value_fn(field->size,
+						 field->is_signed);
+		if (!hist_field->fn) {
+			destroy_hist_field(hist_field);
+			return NULL;
+		}
+	}
+ out:
+	hist_field->field = field;
+	hist_field->flags = flags;
+
+	return hist_field;
+}
+
+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+		if (hist_data->fields[i]) {
+			destroy_hist_field(hist_data->fields[i]);
+			hist_data->fields[i] = NULL;
+		}
+	}
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+	hist_data->fields[HITCOUNT_IDX] =
+		create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
+	if (!hist_data->fields[HITCOUNT_IDX])
+		return -ENOMEM;
+
+	hist_data->n_vals++;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int create_val_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_hitcount_val(hist_data);
+
+	return ret;
+}
+
+static int create_key_field(struct hist_trigger_data *hist_data,
+			    unsigned int key_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	struct ftrace_event_field *field = NULL;
+	unsigned long flags = 0;
+	unsigned int key_size;
+	int ret = 0;
+
+	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+		return -EINVAL;
+
+	flags |= HIST_FIELD_FL_KEY;
+
+	field = trace_find_event_field(file->event_call, field_str);
+	if (!field) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key_size = field->size;
+
+	hist_data->fields[key_idx] = create_hist_field(field, flags);
+	if (!hist_data->fields[key_idx]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key_size = ALIGN(key_size, sizeof(u64));
+	hist_data->fields[key_idx]->size = key_size;
+	hist_data->key_size = key_size;
+	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hist_data->n_keys++;
+
+	if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
+		return -EINVAL;
+
+	ret = key_size;
+ out:
+	return ret;
+}
+
+static int create_key_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	unsigned int i, n_vals = hist_data->n_vals;
+	char *fields_str, *field_str;
+	int ret = -EINVAL;
+
+	fields_str = hist_data->attrs->keys_str;
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str)
+		goto out;
+
+	for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) {
+		field_str = strsep(&fields_str, ",");
+		if (!field_str)
+			break;
+		ret = create_key_field(hist_data, i, file, field_str);
+		if (ret < 0)
+			goto out;
+	}
+	if (fields_str) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+
+static int create_hist_fields(struct hist_trigger_data *hist_data,
+			      struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_val_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	ret = create_key_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ out:
+	return ret;
+}
+
+static int create_sort_keys(struct hist_trigger_data *hist_data)
+{
+	int ret = 0;
+
+	hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */
+
+	return ret;
+}
+
+static void destroy_hist_data(struct hist_trigger_data *hist_data)
+{
+	destroy_hist_trigger_attrs(hist_data->attrs);
+	destroy_hist_fields(hist_data);
+	tracing_map_destroy(hist_data->map);
+	kfree(hist_data);
+}
+
+static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
+{
+	struct tracing_map *map = hist_data->map;
+	struct ftrace_event_field *field;
+	struct hist_field *hist_field;
+	unsigned int i, idx;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field->flags & HIST_FIELD_FL_KEY) {
+			tracing_map_cmp_fn_t cmp_fn;
+
+			field = hist_field->field;
+
+			if (is_string_field(field))
+				cmp_fn = tracing_map_cmp_string;
+			else
+				cmp_fn = tracing_map_cmp_num(field->size,
+							     field->is_signed);
+			idx = tracing_map_add_key_field(map, 0, cmp_fn);
+		} else
+			idx = tracing_map_add_sum_field(map);
+
+		if (idx < 0)
+			return idx;
+	}
+
+	return 0;
+}
+
+static struct hist_trigger_data *
+create_hist_data(unsigned int map_bits,
+		 struct hist_trigger_attrs *attrs,
+		 struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data;
+	int ret = 0;
+
+	hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL);
+	if (!hist_data)
+		return ERR_PTR(-ENOMEM);
+
+	hist_data->attrs = attrs;
+
+	ret = create_hist_fields(hist_data, file);
+	if (ret)
+		goto free;
+
+	ret = create_sort_keys(hist_data);
+	if (ret)
+		goto free;
+
+	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
+					    NULL, hist_data);
+	if (IS_ERR(hist_data->map)) {
+		ret = PTR_ERR(hist_data->map);
+		hist_data->map = NULL;
+		goto free;
+	}
+
+	ret = create_tracing_map_fields(hist_data);
+	if (ret)
+		goto free;
+
+	ret = tracing_map_init(hist_data->map);
+	if (ret)
+		goto free;
+
+	hist_data->event_file = file;
+ out:
+	return hist_data;
+ free:
+	hist_data->attrs = NULL;
+
+	destroy_hist_data(hist_data);
+
+	hist_data = ERR_PTR(ret);
+
+	goto out;
+}
+
+static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
+				    struct tracing_map_elt *elt,
+				    void *rec)
+{
+	struct hist_field *hist_field;
+	unsigned int i;
+	u64 hist_val;
+
+	for_each_hist_val_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		hist_val = hist_field->fn(hist_field, rec);
+		tracing_map_update_sum(elt, i, hist_val);
+	}
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	struct tracing_map_elt *elt;
+	u64 field_contents;
+	void *key = NULL;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		field_contents = key_field->fn(key_field, rec);
+		if (key_field->flags & HIST_FIELD_FL_STRING)
+			key = (void *)(unsigned long)field_contents;
+		else
+			key = (void *)&field_contents;
+	}
+
+	elt = tracing_map_insert(hist_data->map, key);
+	if (elt)
+		hist_trigger_elt_update(hist_data, elt, rec);
+}
+
+static void
+hist_trigger_entry_print(struct seq_file *m,
+			 struct hist_trigger_data *hist_data, void *key,
+			 struct tracing_map_elt *elt)
+{
+	struct hist_field *key_field;
+	unsigned int i;
+	u64 uval;
+
+	seq_puts(m, "{ ");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ", ");
+
+		if (key_field->flags & HIST_FIELD_FL_STRING) {
+			seq_printf(m, "%s: %-50s", key_field->field->name,
+				   (char *)key);
+		} else {
+			uval = *(u64 *)key;
+			seq_printf(m, "%s: %10llu",
+				   key_field->field->name, uval);
+		}
+	}
+
+	seq_puts(m, " }");
+
+	seq_printf(m, " hitcount: %10llu",
+		   tracing_map_read_sum(elt, HITCOUNT_IDX));
+
+	seq_puts(m, "\n");
+}
+
+static int print_entries(struct seq_file *m,
+			 struct hist_trigger_data *hist_data)
+{
+	struct tracing_map_sort_entry **sort_entries = NULL;
+	struct tracing_map *map = hist_data->map;
+	unsigned int i, n_entries;
+
+	n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
+					     hist_data->n_sort_keys,
+					     &sort_entries);
+	if (n_entries < 0)
+		return n_entries;
+
+	for (i = 0; i < n_entries; i++)
+		hist_trigger_entry_print(m, hist_data,
+					 sort_entries[i]->key,
+					 sort_entries[i]->elt);
+
+	tracing_map_destroy_sort_entries(sort_entries, n_entries);
+
+	return n_entries;
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+	struct event_trigger_data *test, *data = NULL;
+	struct trace_event_file *event_file;
+	struct hist_trigger_data *hist_data;
+	int n_entries, ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(test, &event_file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			data = test;
+			break;
+		}
+	}
+	if (!data)
+		goto out_unlock;
+
+	seq_puts(m, "# event histogram\n#\n# trigger info: ");
+	data->ops->print(m, data->ops, data);
+	seq_puts(m, "\n");
+
+	hist_data = data->private_data;
+	n_entries = print_entries(m, hist_data);
+	if (n_entries < 0) {
+		ret = n_entries;
+		n_entries = 0;
+	}
+
+	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
+		   (u64)atomic64_read(&hist_data->map->hits),
+		   n_entries, (u64)atomic64_read(&hist_data->map->drops));
+ out_unlock:
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int event_hist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, hist_show, file);
+}
+
+const struct file_operations event_hist_fops = {
+	.open = event_hist_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
+{
+	seq_printf(m, "%s", hist_field->field->name);
+}
+
+static int event_hist_trigger_print(struct seq_file *m,
+				    struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	unsigned int i;
+
+	seq_puts(m, "hist:keys=");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ",");
+
+		hist_field_print(m, key_field);
+	}
+
+	seq_puts(m, ":vals=");
+	seq_puts(m, "hitcount");
+
+	seq_puts(m, ":sort=");
+	seq_puts(m, "hitcount");
+
+	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+
+	if (data->filter_str)
+		seq_printf(m, " if %s", data->filter_str);
+
+	seq_puts(m, " [active]");
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static void event_hist_trigger_free(struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	data->ref--;
+	if (!data->ref) {
+		trigger_data_free(data);
+		destroy_hist_data(hist_data);
+	}
+}
+
+static struct event_trigger_ops event_hist_trigger_ops = {
+	.func			= event_hist_trigger,
+	.print			= event_hist_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_hist_trigger_free,
+};
+
+static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+							    char *param)
+{
+	return &event_hist_trigger_ops;
+}
+
+static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+				 struct event_trigger_data *data,
+				 struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+	int ret = 0;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	if (data->ops->init) {
+		ret = data->ops->init(data->ops, data);
+		if (ret < 0)
+			goto out;
+	}
+
+	list_add_rcu(&data->list, &file->triggers);
+	ret++;
+
+	update_cond_flag(file);
+	if (trace_event_trigger_enable_disable(file, 1) < 0) {
+		list_del_rcu(&data->list);
+		update_cond_flag(file);
+		ret--;
+	}
+ out:
+	return ret;
+}
+
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+				   struct trace_event_file *file,
+				   char *glob, char *cmd, char *param)
+{
+	unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
+	struct event_trigger_data *trigger_data;
+	struct hist_trigger_attrs *attrs;
+	struct event_trigger_ops *trigger_ops;
+	struct hist_trigger_data *hist_data;
+	char *trigger;
+	int ret = 0;
+
+	if (!param)
+		return -EINVAL;
+
+	/* separate the trigger from the filter (k:v [if filter]) */
+	trigger = strsep(&param, " \t");
+	if (!trigger)
+		return -EINVAL;
+
+	attrs = parse_hist_trigger_attrs(trigger);
+	if (IS_ERR(attrs))
+		return PTR_ERR(attrs);
+
+	if (attrs->map_bits)
+		hist_trigger_bits = attrs->map_bits;
+
+	hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+	if (IS_ERR(hist_data)) {
+		destroy_hist_trigger_attrs(attrs);
+		return PTR_ERR(hist_data);
+	}
+
+	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+	ret = -ENOMEM;
+	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+	if (!trigger_data)
+		goto out_free;
+
+	trigger_data->count = -1;
+	trigger_data->ops = trigger_ops;
+	trigger_data->cmd_ops = cmd_ops;
+
+	INIT_LIST_HEAD(&trigger_data->list);
+	RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+	trigger_data->private_data = hist_data;
+
+	if (glob[0] == '!') {
+		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+		ret = 0;
+		goto out_free;
+	}
+
+	if (!param) /* if param is non-empty, it's supposed to be a filter */
+		goto out_reg;
+
+	if (!cmd_ops->set_filter)
+		goto out_reg;
+
+	ret = cmd_ops->set_filter(param, trigger_data, file);
+	if (ret < 0)
+		goto out_free;
+ out_reg:
+	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+	/*
+	 * The above returns on success the # of triggers registered,
+	 * but if it didn't register any it returns zero.  Consider no
+	 * triggers registered a failure too.
+	 */
+	if (!ret) {
+		ret = -ENOENT;
+		goto out_free;
+	} else if (ret < 0)
+		goto out_free;
+	/* Just return zero, not the number of registered triggers */
+	ret = 0;
+ out:
+	return ret;
+ out_free:
+	if (cmd_ops->set_filter)
+		cmd_ops->set_filter(NULL, trigger_data, NULL);
+
+	kfree(trigger_data);
+
+	destroy_hist_data(hist_data);
+	goto out;
+}
+
+static struct event_command trigger_hist_cmd = {
+	.name			= "hist",
+	.trigger_type		= ETT_EVENT_HIST,
+	.flags			= EVENT_CMD_FL_NEEDS_REC,
+	.func			= event_hist_trigger_func,
+	.reg			= hist_register_trigger,
+	.unreg			= unregister_trigger,
+	.get_trigger_ops	= event_hist_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+__init int register_trigger_hist_cmd(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_cmd);
+	WARN_ON(ret < 0);
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d67992f3bb0e..d29092afe005 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_cmd();
 
 	return 0;
 }
-- 
cgit v1.2.3


From 97865fe41322d83dac4373fe0a0de5b1a1b318c5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 24 Mar 2016 14:18:05 +0100
Subject: iio: st_sensors: verify interrupt event to status

This makes all ST sensor drivers check that they actually have
new data available for the requested channel(s) before claiming
an IRQ, by reading the status register (which is conveniently
the same for all ST sensors) and check that the channel has new
data before proceeding to read it and fill the buffer.

This way sensors can share an interrupt line: it can be flaged
as shared and then the sensor that did not fire will return
NO_IRQ, and the sensor that fired will handle the IRQ and
return IRQ_HANDLED.

Cc: Giuseppe Barba <giuseppe.barba@st.com>
Cc: Denis Ciocca <denis.ciocca@st.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/accel/st_accel_core.c                 |  5 +++++
 drivers/iio/common/st_sensors/st_sensors_buffer.c | 18 ++++++++++++++++++
 drivers/iio/gyro/st_gyro_core.c                   |  3 +++
 drivers/iio/magnetometer/st_magn_core.c           |  1 +
 drivers/iio/pressure/st_pressure_core.c           |  2 ++
 include/linux/iio/common/st_sensors.h             |  3 +++
 6 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index fee32e3d7a05..9fb6d35fce5b 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c
@@ -332,6 +332,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_1_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_1_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_1_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_1_MULTIREAD_BIT,
 		.bootime = 2,
@@ -397,6 +398,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_2_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_2_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_2_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_2_MULTIREAD_BIT,
 		.bootime = 2,
@@ -474,6 +476,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_3_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_3_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_3_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 			.ig1 = {
 				.en_addr = ST_ACCEL_3_IG1_EN_ADDR,
 				.en_mask = ST_ACCEL_3_IG1_EN_MASK,
@@ -532,6 +535,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 		.drdy_irq = {
 			.addr = ST_ACCEL_4_DRDY_IRQ_ADDR,
 			.mask_int1 = ST_ACCEL_4_DRDY_IRQ_INT1_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_4_MULTIREAD_BIT,
 		.bootime = 2, /* guess */
@@ -583,6 +587,7 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_5_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_5_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_5_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_5_MULTIREAD_BIT,
 		.bootime = 2, /* guess */
diff --git a/drivers/iio/common/st_sensors/st_sensors_buffer.c b/drivers/iio/common/st_sensors/st_sensors_buffer.c
index 2ce0d2a3f855..c55898543a47 100644
--- a/drivers/iio/common/st_sensors/st_sensors_buffer.c
+++ b/drivers/iio/common/st_sensors/st_sensors_buffer.c
@@ -58,6 +58,24 @@ irqreturn_t st_sensors_trigger_handler(int irq, void *p)
 	struct iio_dev *indio_dev = pf->indio_dev;
 	struct st_sensor_data *sdata = iio_priv(indio_dev);
 
+	/* If we have a status register, check if this IRQ came from us */
+	if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) {
+		u8 status;
+
+		len = sdata->tf->read_byte(&sdata->tb, sdata->dev,
+			   sdata->sensor_settings->drdy_irq.addr_stat_drdy,
+			   &status);
+		if (len < 0)
+			dev_err(sdata->dev, "could not read channel status\n");
+
+		/*
+		 * If this was not caused by any channels on this sensor,
+		 * return IRQ_NONE
+		 */
+		if (!(status & (u8)indio_dev->active_scan_mask[0]))
+			return IRQ_NONE;
+	}
+
 	len = st_sensors_get_buffer_element(indio_dev, sdata->buffer_data);
 	if (len < 0)
 		goto st_sensors_get_buffer_element_error;
diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c
index 110f95b6e52f..be9057e89dc3 100644
--- a/drivers/iio/gyro/st_gyro_core.c
+++ b/drivers/iio/gyro/st_gyro_core.c
@@ -190,6 +190,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 			 * drain settings, but only for INT1 and not
 			 * for the DRDY line on INT2.
 			 */
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_GYRO_1_MULTIREAD_BIT,
 		.bootime = 2,
@@ -258,6 +259,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 			 * drain settings, but only for INT1 and not
 			 * for the DRDY line on INT2.
 			 */
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_GYRO_2_MULTIREAD_BIT,
 		.bootime = 2,
@@ -322,6 +324,7 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = {
 			 * drain settings, but only for INT1 and not
 			 * for the DRDY line on INT2.
 			 */
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_GYRO_3_MULTIREAD_BIT,
 		.bootime = 2,
diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c
index 501f858df413..62036d2a9956 100644
--- a/drivers/iio/magnetometer/st_magn_core.c
+++ b/drivers/iio/magnetometer/st_magn_core.c
@@ -484,6 +484,7 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = {
 			.mask_int1 = ST_MAGN_3_DRDY_INT_MASK,
 			.addr_ihl = ST_MAGN_3_IHL_IRQ_ADDR,
 			.mask_ihl = ST_MAGN_3_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_MAGN_3_MULTIREAD_BIT,
 		.bootime = 2,
diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c
index 172393ad34af..1cd37eaa4a57 100644
--- a/drivers/iio/pressure/st_pressure_core.c
+++ b/drivers/iio/pressure/st_pressure_core.c
@@ -226,6 +226,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 			.mask_int2 = ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_PRESS_LPS331AP_IHL_IRQ_ADDR,
 			.mask_ihl = ST_PRESS_LPS331AP_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_PRESS_LPS331AP_MULTIREAD_BIT,
 		.bootime = 2,
@@ -312,6 +313,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 			.mask_int2 = ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_PRESS_LPS25H_IHL_IRQ_ADDR,
 			.mask_ihl = ST_PRESS_LPS25H_IHL_IRQ_MASK,
+			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_PRESS_LPS25H_MULTIREAD_BIT,
 		.bootime = 2,
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 6670c3d25c58..d8da075bfda0 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -37,6 +37,7 @@
 #define ST_SENSORS_DEFAULT_AXIS_ADDR		0x20
 #define ST_SENSORS_DEFAULT_AXIS_MASK		0x07
 #define ST_SENSORS_DEFAULT_AXIS_N_BIT		3
+#define ST_SENSORS_DEFAULT_STAT_ADDR		0x27
 
 #define ST_SENSORS_MAX_NAME			17
 #define ST_SENSORS_MAX_4WAI			7
@@ -121,6 +122,7 @@ struct st_sensor_bdu {
  * @mask_int2: mask to enable/disable IRQ on INT2 pin.
  * @addr_ihl: address to enable/disable active low on the INT lines.
  * @mask_ihl: mask to enable/disable active low on the INT lines.
+ * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt
  * struct ig1 - represents the Interrupt Generator 1 of sensors.
  * @en_addr: address of the enable ig1 register.
  * @en_mask: mask to write the on/off value for enable.
@@ -131,6 +133,7 @@ struct st_sensor_data_ready_irq {
 	u8 mask_int2;
 	u8 addr_ihl;
 	u8 mask_ihl;
+	u8 addr_stat_drdy;
 	struct {
 		u8 en_addr;
 		u8 en_mask;
-- 
cgit v1.2.3


From 0e6f6871a1591f4bb0971809c45bc91a991f1967 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 14 Apr 2016 10:45:21 +0200
Subject: iio: st_sensors: support open drain mode

Some types of ST Sensors can be connected to the same IRQ line
as other peripherals using open drain. Add a device tree binding
and a sensor data property to flip the right bit in the interrupt
control register to enable open drain mode on the INT line.

If the line is set to be open drain, also tag on IRQF_SHARED
to the IRQ flags when requesting the interrupt, as the whole
point of using open drain interrupt lines is to share them with
more than one peripheral (wire-or).

Cc: devicetree@vger.kernel.org
Cc: Giuseppe Barba <giuseppe.barba@st.com>
Cc: Denis Ciocca <denis.ciocca@st.com>
Acked-by: Rob Herring <rob@kernel.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/devicetree/bindings/iio/st-sensors.txt |  4 ++++
 drivers/iio/accel/st_accel_core.c                    |  8 ++++++++
 drivers/iio/common/st_sensors/st_sensors_core.c      | 20 ++++++++++++++++++++
 drivers/iio/common/st_sensors/st_sensors_trigger.c   | 13 +++++++++++++
 drivers/iio/pressure/st_pressure_core.c              |  8 ++++++++
 include/linux/iio/common/st_sensors.h                |  6 ++++++
 include/linux/platform_data/st_sensors_pdata.h       |  2 ++
 7 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/iio/st-sensors.txt b/Documentation/devicetree/bindings/iio/st-sensors.txt
index 71b7bdff21cd..637e283f4a8b 100644
--- a/Documentation/devicetree/bindings/iio/st-sensors.txt
+++ b/Documentation/devicetree/bindings/iio/st-sensors.txt
@@ -16,6 +16,10 @@ Optional properties:
 - st,drdy-int-pin: the pin on the package that will be used to signal
   "data ready" (valid values: 1 or 2). This property is not configurable
   on all sensors.
+- drive-open-drain: the interrupt/data ready line will be configured
+  as open drain, which is useful if several sensors share the same
+  interrupt line. (This binding is taken from pinctrl/pinctrl-bindings.txt)
+  This is a boolean property.
 
 Sensors may also have applicable pin control settings, those use the
 standard bindings from pinctrl/pinctrl-bindings.txt.
diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index 9fb6d35fce5b..dc73f2d85e6d 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c
@@ -99,6 +99,8 @@
 #define ST_ACCEL_2_DRDY_IRQ_INT2_MASK		0x10
 #define ST_ACCEL_2_IHL_IRQ_ADDR			0x22
 #define ST_ACCEL_2_IHL_IRQ_MASK			0x80
+#define ST_ACCEL_2_OD_IRQ_ADDR			0x22
+#define ST_ACCEL_2_OD_IRQ_MASK			0x40
 #define ST_ACCEL_2_MULTIREAD_BIT		true
 
 /* CUSTOM VALUES FOR SENSOR 3 */
@@ -180,6 +182,8 @@
 #define ST_ACCEL_5_DRDY_IRQ_INT2_MASK		0x20
 #define ST_ACCEL_5_IHL_IRQ_ADDR			0x22
 #define ST_ACCEL_5_IHL_IRQ_MASK			0x80
+#define ST_ACCEL_5_OD_IRQ_ADDR			0x22
+#define ST_ACCEL_5_OD_IRQ_MASK			0x40
 #define ST_ACCEL_5_IG1_EN_ADDR			0x21
 #define ST_ACCEL_5_IG1_EN_MASK			0x08
 #define ST_ACCEL_5_MULTIREAD_BIT		false
@@ -398,6 +402,8 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_2_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_2_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_2_IHL_IRQ_MASK,
+			.addr_od = ST_ACCEL_2_OD_IRQ_ADDR,
+			.mask_od = ST_ACCEL_2_OD_IRQ_MASK,
 			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_2_MULTIREAD_BIT,
@@ -587,6 +593,8 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 			.mask_int2 = ST_ACCEL_5_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_ACCEL_5_IHL_IRQ_ADDR,
 			.mask_ihl = ST_ACCEL_5_IHL_IRQ_MASK,
+			.addr_od = ST_ACCEL_5_OD_IRQ_ADDR,
+			.mask_od = ST_ACCEL_5_OD_IRQ_MASK,
 			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_ACCEL_5_MULTIREAD_BIT,
diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c
index f5a2d445d0c0..dffe00692169 100644
--- a/drivers/iio/common/st_sensors/st_sensors_core.c
+++ b/drivers/iio/common/st_sensors/st_sensors_core.c
@@ -301,6 +301,14 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev,
 		return -EINVAL;
 	}
 
+	if (pdata->open_drain) {
+		if (!sdata->sensor_settings->drdy_irq.addr_od)
+			dev_err(&indio_dev->dev,
+				"open drain requested but unsupported.\n");
+		else
+			sdata->int_pin_open_drain = true;
+	}
+
 	return 0;
 }
 
@@ -321,6 +329,8 @@ static struct st_sensors_platform_data *st_sensors_of_probe(struct device *dev,
 	else
 		pdata->drdy_int_pin = defdata ? defdata->drdy_int_pin : 0;
 
+	pdata->open_drain = of_property_read_bool(np, "drive-open-drain");
+
 	return pdata;
 }
 #else
@@ -374,6 +384,16 @@ int st_sensors_init_sensor(struct iio_dev *indio_dev,
 			return err;
 	}
 
+	if (sdata->int_pin_open_drain) {
+		dev_info(&indio_dev->dev,
+			 "set interrupt line to open drain mode\n");
+		err = st_sensors_write_data_with_mask(indio_dev,
+				sdata->sensor_settings->drdy_irq.addr_od,
+				sdata->sensor_settings->drdy_irq.mask_od, 1);
+		if (err < 0)
+			return err;
+	}
+
 	err = st_sensors_set_axis_enable(indio_dev, ST_SENSORS_ENABLE_ALL_AXIS);
 
 	return err;
diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c
index 6a8c98327945..da72279fcf99 100644
--- a/drivers/iio/common/st_sensors/st_sensors_trigger.c
+++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c
@@ -64,6 +64,19 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev,
 			"rising edge\n", irq_trig);
 		irq_trig = IRQF_TRIGGER_RISING;
 	}
+
+	/*
+	 * If the interrupt pin is Open Drain, by definition this
+	 * means that the interrupt line may be shared with other
+	 * peripherals. But to do this we also need to have a status
+	 * register and mask to figure out if this sensor was firing
+	 * the IRQ or not, so we can tell the interrupt handle that
+	 * it was "our" interrupt.
+	 */
+	if (sdata->int_pin_open_drain &&
+	    sdata->sensor_settings->drdy_irq.addr_stat_drdy)
+		irq_trig |= IRQF_SHARED;
+
 	err = request_threaded_irq(irq,
 			iio_trigger_generic_data_rdy_poll,
 			NULL,
diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c
index 1cd37eaa4a57..9e9b72a8f18f 100644
--- a/drivers/iio/pressure/st_pressure_core.c
+++ b/drivers/iio/pressure/st_pressure_core.c
@@ -64,6 +64,8 @@
 #define ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK	0x20
 #define ST_PRESS_LPS331AP_IHL_IRQ_ADDR		0x22
 #define ST_PRESS_LPS331AP_IHL_IRQ_MASK		0x80
+#define ST_PRESS_LPS331AP_OD_IRQ_ADDR		0x22
+#define ST_PRESS_LPS331AP_OD_IRQ_MASK		0x40
 #define ST_PRESS_LPS331AP_MULTIREAD_BIT		true
 #define ST_PRESS_LPS331AP_TEMP_OFFSET		42500
 
@@ -104,6 +106,8 @@
 #define ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK	0x10
 #define ST_PRESS_LPS25H_IHL_IRQ_ADDR		0x22
 #define ST_PRESS_LPS25H_IHL_IRQ_MASK		0x80
+#define ST_PRESS_LPS25H_OD_IRQ_ADDR		0x22
+#define ST_PRESS_LPS25H_OD_IRQ_MASK		0x40
 #define ST_PRESS_LPS25H_MULTIREAD_BIT		true
 #define ST_PRESS_LPS25H_TEMP_OFFSET		42500
 #define ST_PRESS_LPS25H_OUT_XL_ADDR		0x28
@@ -226,6 +230,8 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 			.mask_int2 = ST_PRESS_LPS331AP_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_PRESS_LPS331AP_IHL_IRQ_ADDR,
 			.mask_ihl = ST_PRESS_LPS331AP_IHL_IRQ_MASK,
+			.addr_od = ST_PRESS_LPS331AP_OD_IRQ_ADDR,
+			.mask_od = ST_PRESS_LPS331AP_OD_IRQ_MASK,
 			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_PRESS_LPS331AP_MULTIREAD_BIT,
@@ -313,6 +319,8 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
 			.mask_int2 = ST_PRESS_LPS25H_DRDY_IRQ_INT2_MASK,
 			.addr_ihl = ST_PRESS_LPS25H_IHL_IRQ_ADDR,
 			.mask_ihl = ST_PRESS_LPS25H_IHL_IRQ_MASK,
+			.addr_od = ST_PRESS_LPS25H_OD_IRQ_ADDR,
+			.mask_od = ST_PRESS_LPS25H_OD_IRQ_MASK,
 			.addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
 		},
 		.multi_read_bit = ST_PRESS_LPS25H_MULTIREAD_BIT,
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index d8da075bfda0..d029ffac0d69 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -122,6 +122,8 @@ struct st_sensor_bdu {
  * @mask_int2: mask to enable/disable IRQ on INT2 pin.
  * @addr_ihl: address to enable/disable active low on the INT lines.
  * @mask_ihl: mask to enable/disable active low on the INT lines.
+ * @addr_od: address to enable/disable Open Drain on the INT lines.
+ * @mask_od: mask to enable/disable Open Drain on the INT lines.
  * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt
  * struct ig1 - represents the Interrupt Generator 1 of sensors.
  * @en_addr: address of the enable ig1 register.
@@ -133,6 +135,8 @@ struct st_sensor_data_ready_irq {
 	u8 mask_int2;
 	u8 addr_ihl;
 	u8 mask_ihl;
+	u8 addr_od;
+	u8 mask_od;
 	u8 addr_stat_drdy;
 	struct {
 		u8 en_addr;
@@ -215,6 +219,7 @@ struct st_sensor_settings {
  * @odr: Output data rate of the sensor [Hz].
  * num_data_channels: Number of data channels used in buffer.
  * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2).
+ * @int_pin_open_drain: Set the interrupt/DRDY to open drain.
  * @get_irq_data_ready: Function to get the IRQ used for data ready signal.
  * @tf: Transfer function structure used by I/O operations.
  * @tb: Transfer buffers and mutex used by I/O operations.
@@ -236,6 +241,7 @@ struct st_sensor_data {
 	unsigned int num_data_channels;
 
 	u8 drdy_int_pin;
+	bool int_pin_open_drain;
 
 	unsigned int (*get_irq_data_ready) (struct iio_dev *indio_dev);
 
diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h
index 753839187ba0..79b0e4cdb814 100644
--- a/include/linux/platform_data/st_sensors_pdata.h
+++ b/include/linux/platform_data/st_sensors_pdata.h
@@ -16,9 +16,11 @@
  * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2).
  *	Available only for accelerometer and pressure sensors.
  *	Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet).
+ * @open_drain: set the interrupt line to be open drain if possible.
  */
 struct st_sensors_platform_data {
 	u8 drdy_int_pin;
+	bool open_drain;
 };
 
 #endif /* ST_SENSORS_PDATA_H */
-- 
cgit v1.2.3


From 8bf872d8d261feefcdf67027522e3f717cad2bfe Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 6 Apr 2016 16:01:06 +0530
Subject: iio: core: Add devm_ APIs for iio_channel_{get,release}

Some of kernel driver uses the IIO framework to get the sensor
value via ADC or IIO HW driver. The client driver get iio channel
by iio_channel_get() and release it by calling iio_channel_release().

Add resource managed version (devm_*) of these APIs so that if client
calls the devm_iio_channel_get() then it need not to release it explicitly,
it can be done by managed device framework when driver get un-binded.

This reduces the code in error path and also need of .remove callback in
some cases.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/inkern.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iio/consumer.h | 27 +++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 2fc7928f401d..9fd8934c1887 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -356,6 +356,54 @@ void iio_channel_release(struct iio_channel *channel)
 }
 EXPORT_SYMBOL_GPL(iio_channel_release);
 
+static void devm_iio_channel_free(struct device *dev, void *res)
+{
+	struct iio_channel *channel = *(struct iio_channel **)res;
+
+	iio_channel_release(channel);
+}
+
+static int devm_iio_channel_match(struct device *dev, void *res, void *data)
+{
+	struct iio_channel **r = res;
+
+	if (!r || !*r) {
+		WARN_ON(!r || !*r);
+		return 0;
+	}
+
+	return *r == data;
+}
+
+struct iio_channel *devm_iio_channel_get(struct device *dev,
+					 const char *channel_name)
+{
+	struct iio_channel **ptr, *channel;
+
+	ptr = devres_alloc(devm_iio_channel_free, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	channel = iio_channel_get(dev, channel_name);
+	if (IS_ERR(channel)) {
+		devres_free(ptr);
+		return channel;
+	}
+
+	*ptr = channel;
+	devres_add(dev, ptr);
+
+	return channel;
+}
+EXPORT_SYMBOL_GPL(devm_iio_channel_get);
+
+void devm_iio_channel_release(struct device *dev, struct iio_channel *channel)
+{
+	WARN_ON(devres_release(dev, devm_iio_channel_free,
+			       devm_iio_channel_match, channel));
+}
+EXPORT_SYMBOL_GPL(devm_iio_channel_release);
+
 struct iio_channel *iio_channel_get_all(struct device *dev)
 {
 	const char *name;
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index fad58671c49e..e1e033d6a81f 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -48,6 +48,33 @@ struct iio_channel *iio_channel_get(struct device *dev,
  */
 void iio_channel_release(struct iio_channel *chan);
 
+/**
+ * devm_iio_channel_get() - Resource managed version of iio_channel_get().
+ * @dev:		Pointer to consumer device. Device name must match
+ *			the name of the device as provided in the iio_map
+ *			with which the desired provider to consumer mapping
+ *			was registered.
+ * @consumer_channel:	Unique name to identify the channel on the consumer
+ *			side. This typically describes the channels use within
+ *			the consumer. E.g. 'battery_voltage'
+ *
+ * Returns a pointer to negative errno if it is not able to get the iio channel
+ * otherwise returns valid pointer for iio channel.
+ *
+ * The allocated iio channel is automatically released when the device is
+ * unbound.
+ */
+struct iio_channel *devm_iio_channel_get(struct device *dev,
+					 const char *consumer_channel);
+/**
+ * devm_iio_channel_release() - Resource managed version of
+ *				iio_channel_release().
+ * @dev:		Pointer to consumer device for which resource
+ *			is allocared.
+ * @chan:		The channel to be released.
+ */
+void devm_iio_channel_release(struct device *dev, struct iio_channel *chan);
+
 /**
  * iio_channel_get_all() - get all channels associated with a client
  * @dev:		Pointer to consumer device.
-- 
cgit v1.2.3


From efc2c0133f198bc65593a67015af358919b0c48f Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 6 Apr 2016 16:01:07 +0530
Subject: iio: core: Add devm_ APIs for iio_channel_{get,release}_all

Some of kernel driver uses the IIO framework to get the sensor
value via ADC or IIO HW driver. The client driver get iio channel
by iio_channel_get_all() and release it by calling
iio_channel_release_all().

Add resource managed version (devm_*) of these APIs so that if client
calls the devm_iio_channel_get_all() then it need not to release it
explicitly, it can be done by managed device framework when driver
get un-binded.

This reduces the code in error path and also need of .remove callback in
some cases.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/inkern.c         | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/iio/consumer.h | 26 ++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 9fd8934c1887..c4757e6367e7 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -489,6 +489,42 @@ void iio_channel_release_all(struct iio_channel *channels)
 }
 EXPORT_SYMBOL_GPL(iio_channel_release_all);
 
+static void devm_iio_channel_free_all(struct device *dev, void *res)
+{
+	struct iio_channel *channels = *(struct iio_channel **)res;
+
+	iio_channel_release_all(channels);
+}
+
+struct iio_channel *devm_iio_channel_get_all(struct device *dev)
+{
+	struct iio_channel **ptr, *channels;
+
+	ptr = devres_alloc(devm_iio_channel_free_all, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	channels = iio_channel_get_all(dev);
+	if (IS_ERR(channels)) {
+		devres_free(ptr);
+		return channels;
+	}
+
+	*ptr = channels;
+	devres_add(dev, ptr);
+
+	return channels;
+}
+EXPORT_SYMBOL_GPL(devm_iio_channel_get_all);
+
+void devm_iio_channel_release_all(struct device *dev,
+				  struct iio_channel *channels)
+{
+	WARN_ON(devres_release(dev, devm_iio_channel_free_all,
+			       devm_iio_channel_match, channels));
+}
+EXPORT_SYMBOL_GPL(devm_iio_channel_release_all);
+
 static int iio_channel_read(struct iio_channel *chan, int *val, int *val2,
 	enum iio_chan_info_enum info)
 {
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index e1e033d6a81f..3d672f72e7ec 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -92,6 +92,32 @@ struct iio_channel *iio_channel_get_all(struct device *dev);
  */
 void iio_channel_release_all(struct iio_channel *chan);
 
+/**
+ * devm_iio_channel_get_all() - Resource managed version of
+ *				iio_channel_get_all().
+ * @dev: Pointer to consumer device.
+ *
+ * Returns a pointer to negative errno if it is not able to get the iio channel
+ * otherwise returns an array of iio_channel structures terminated with one with
+ * null iio_dev pointer.
+ *
+ * This function is used by fairly generic consumers to get all the
+ * channels registered as having this consumer.
+ *
+ * The allocated iio channels are automatically released when the device is
+ * unbounded.
+ */
+struct iio_channel *devm_iio_channel_get_all(struct device *dev);
+
+/**
+ * devm_iio_channel_release_all() - Resource managed version of
+ *				    iio_channel_release_all().
+ * @dev:		Pointer to consumer device for which resource
+ *			is allocared.
+ * @chan:		Array channel to be released.
+ */
+void devm_iio_channel_release_all(struct device *dev, struct iio_channel *chan);
+
 struct iio_cb_buffer;
 /**
  * iio_channel_get_all_cb() - register callback for triggered capture
-- 
cgit v1.2.3


From b0fcd8ab7b3c89b5da7fff5224d06ed73e7a33cc Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Wed, 23 Mar 2016 11:19:00 +0100
Subject: mtd: nand: add new enum for storing ECC algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our nand_ecc_modes_t is already a bit abused by value NAND_ECC_SOFT_BCH.
This enum should store ECC mode only and putting algorithm details there
is a bad idea. It would result in too many values impossible to support
in a sane way.

To solve this problem let's add a new enum. We'll have to modify all
drivers to set it properly but once it's done it'll be possible to drop
NAND_ECC_SOFT_BCH. That will result in a cleaner design and more
possibilities like setting ECC algorithm for hardware ECC mode.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 include/linux/mtd/nand.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 56574ba36555..1b673e19667c 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -119,6 +119,12 @@ typedef enum {
 	NAND_ECC_SOFT_BCH,
 } nand_ecc_modes_t;
 
+enum nand_ecc_algo {
+	NAND_ECC_UNKNOWN,
+	NAND_ECC_HAMMING,
+	NAND_ECC_BCH,
+};
+
 /*
  * Constants for Hardware ECC
  */
@@ -458,6 +464,7 @@ struct nand_hw_control {
 /**
  * struct nand_ecc_ctrl - Control structure for ECC
  * @mode:	ECC mode
+ * @algo:	ECC algorithm
  * @steps:	number of ECC steps per page
  * @size:	data bytes per ECC step
  * @bytes:	ECC bytes per step
@@ -508,6 +515,7 @@ struct nand_hw_control {
  */
 struct nand_ecc_ctrl {
 	nand_ecc_modes_t mode;
+	enum nand_ecc_algo algo;
 	int steps;
 	int size;
 	int bytes;
-- 
cgit v1.2.3


From dd2dcc004230b9d8fa809102cd326e3ee4bbdb2a Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Wed, 23 Mar 2016 11:19:01 +0100
Subject: of: mtd: prepare helper reading NAND ECC algo from DT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NAND subsystem is being slightly reworked to store ECC details in
separated fields. In future we'll want to add support for more DT
properties as specifying every possible setup with a single
"nand-ecc-mode" is a pretty bad idea.
To allow this let's add a helper that will support something like
"nand-ecc-algo" in future. Right now we use it for keeping backward
compatibility.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/of/of_mtd.c    | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/of_mtd.h |  6 ++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
index b7361ed70537..15d056e181d2 100644
--- a/drivers/of/of_mtd.c
+++ b/drivers/of/of_mtd.c
@@ -49,6 +49,42 @@ int of_get_nand_ecc_mode(struct device_node *np)
 }
 EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode);
 
+/**
+ * of_get_nand_ecc_algo - Get nand ecc algorithm for given device_node
+ * @np:	Pointer to the given device_node
+ *
+ * The function gets ecc algorithm and returns its enum value, or errno in error
+ * case.
+ */
+int of_get_nand_ecc_algo(struct device_node *np)
+{
+	const char *pm;
+	int err;
+
+	/*
+	 * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo.
+	 * It's not implemented yet as currently NAND subsystem ignores
+	 * algorithm explicitly set this way. Once it's handled we should
+	 * document & support new property.
+	 */
+
+	/*
+	 * For backward compatibility we also read "nand-ecc-mode" checking
+	 * for some obsoleted values that were specifying ECC algorithm.
+	 */
+	err = of_property_read_string(np, "nand-ecc-mode", &pm);
+	if (err < 0)
+		return err;
+
+	if (!strcasecmp(pm, "soft"))
+		return NAND_ECC_HAMMING;
+	else if (!strcasecmp(pm, "soft_bch"))
+		return NAND_ECC_BCH;
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(of_get_nand_ecc_algo);
+
 /**
  * of_get_nand_ecc_step_size - Get ECC step size associated to
  * the required ECC strength (see below).
diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h
index e266caa36402..0f6aca5c6f2f 100644
--- a/include/linux/of_mtd.h
+++ b/include/linux/of_mtd.h
@@ -13,6 +13,7 @@
 
 #include <linux/of.h>
 int of_get_nand_ecc_mode(struct device_node *np);
+int of_get_nand_ecc_algo(struct device_node *np);
 int of_get_nand_ecc_step_size(struct device_node *np);
 int of_get_nand_ecc_strength(struct device_node *np);
 int of_get_nand_bus_width(struct device_node *np);
@@ -25,6 +26,11 @@ static inline int of_get_nand_ecc_mode(struct device_node *np)
 	return -ENOSYS;
 }
 
+static inline int of_get_nand_ecc_algo(struct device_node *np)
+{
+	return -ENOSYS;
+}
+
 static inline int of_get_nand_ecc_step_size(struct device_node *np)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From 7a654172161c8c9c7d59cbd0054d9e63c7411219 Mon Sep 17 00:00:00 2001
From: Raghav Dogra <raghav.dogra@nxp.com>
Date: Wed, 17 Feb 2016 16:54:18 +0530
Subject: mtd/ifc: Add support for IFC controller version 2.0

The new IFC controller version 2.0 has a different memory map page.
Upto IFC 1.4 PAGE size is 4 KB and from IFC2.0 PAGE size is 64KB.
This patch segregates the IFC global and runtime registers to appropriate
PAGE sizes.

Signed-off-by: Jaiprakash Singh <b44839@freescale.com>
Signed-off-by: Raghav Dogra <raghav@freescale.com>
Acked-by: Li Yang <leoyang.li@nxp.com>
Signed-off-by: Raghav Dogra <raghav.dogra@nxp.com>
Acked-by: Scott Wood <oss@buserror.net>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/memory/fsl_ifc.c        | 36 ++++++++++-----------
 drivers/mtd/nand/fsl_ifc_nand.c | 72 ++++++++++++++++++++++-------------------
 include/linux/fsl_ifc.h         | 45 +++++++++++++++++---------
 3 files changed, 87 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c
index 2a691da8c1c7..904b4af5f142 100644
--- a/drivers/memory/fsl_ifc.c
+++ b/drivers/memory/fsl_ifc.c
@@ -59,11 +59,11 @@ int fsl_ifc_find(phys_addr_t addr_base)
 {
 	int i = 0;
 
-	if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+	if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->gregs)
 		return -ENODEV;
 
 	for (i = 0; i < fsl_ifc_ctrl_dev->banks; i++) {
-		u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr);
+		u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->gregs->cspr_cs[i].cspr);
 		if (cspr & CSPR_V && (cspr & CSPR_BA) ==
 				convert_ifc_address(addr_base))
 			return i;
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(fsl_ifc_find);
 
 static int fsl_ifc_ctrl_init(struct fsl_ifc_ctrl *ctrl)
 {
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
 
 	/*
 	 * Clear all the common status and event registers
@@ -104,7 +104,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev)
 	irq_dispose_mapping(ctrl->nand_irq);
 	irq_dispose_mapping(ctrl->irq);
 
-	iounmap(ctrl->regs);
+	iounmap(ctrl->gregs);
 
 	dev_set_drvdata(&dev->dev, NULL);
 	kfree(ctrl);
@@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(nand_irq_lock);
 
 static u32 check_nand_stat(struct fsl_ifc_ctrl *ctrl)
 {
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 	unsigned long flags;
 	u32 stat;
 
@@ -157,7 +157,7 @@ static irqreturn_t fsl_ifc_nand_irq(int irqno, void *data)
 static irqreturn_t fsl_ifc_ctrl_irq(int irqno, void *data)
 {
 	struct fsl_ifc_ctrl *ctrl = data;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
 	u32 err_axiid, err_srcid, status, cs_err, err_addr;
 	irqreturn_t ret = IRQ_NONE;
 
@@ -215,6 +215,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 {
 	int ret = 0;
 	int version, banks;
+	void __iomem *addr;
 
 	dev_info(&dev->dev, "Freescale Integrated Flash Controller\n");
 
@@ -225,22 +226,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 	dev_set_drvdata(&dev->dev, fsl_ifc_ctrl_dev);
 
 	/* IOMAP the entire IFC region */
-	fsl_ifc_ctrl_dev->regs = of_iomap(dev->dev.of_node, 0);
-	if (!fsl_ifc_ctrl_dev->regs) {
+	fsl_ifc_ctrl_dev->gregs = of_iomap(dev->dev.of_node, 0);
+	if (!fsl_ifc_ctrl_dev->gregs) {
 		dev_err(&dev->dev, "failed to get memory region\n");
 		ret = -ENODEV;
 		goto err;
 	}
 
-	version = ifc_in32(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
-			FSL_IFC_VERSION_MASK;
-	banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
-	dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
-		version >> 24, (version >> 16) & 0xf, banks);
-
-	fsl_ifc_ctrl_dev->version = version;
-	fsl_ifc_ctrl_dev->banks = banks;
-
 	if (of_property_read_bool(dev->dev.of_node, "little-endian")) {
 		fsl_ifc_ctrl_dev->little_endian = true;
 		dev_dbg(&dev->dev, "IFC REGISTERS are LITTLE endian\n");
@@ -249,8 +241,9 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 		dev_dbg(&dev->dev, "IFC REGISTERS are BIG endian\n");
 	}
 
-	version = ioread32be(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
+	version = ifc_in32(&fsl_ifc_ctrl_dev->gregs->ifc_rev) &
 			FSL_IFC_VERSION_MASK;
+
 	banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
 	dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
 		version >> 24, (version >> 16) & 0xf, banks);
@@ -258,6 +251,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 	fsl_ifc_ctrl_dev->version = version;
 	fsl_ifc_ctrl_dev->banks = banks;
 
+	addr = fsl_ifc_ctrl_dev->gregs;
+	if (version >= FSL_IFC_VERSION_2_0_0)
+		addr += PGOFFSET_64K;
+	else
+		addr += PGOFFSET_4K;
+	fsl_ifc_ctrl_dev->rregs = addr;
+
 	/* get the Controller level irq */
 	fsl_ifc_ctrl_dev->irq = irq_of_parse_and_map(dev->dev.of_node, 0);
 	if (fsl_ifc_ctrl_dev->irq == 0) {
diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index 43f5a3a4873f..f8a016f038cd 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -232,7 +232,7 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
 	struct nand_chip *chip = mtd_to_nand(mtd);
 	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 	int buf_num;
 
 	ifc_nand_ctrl->page = page_addr;
@@ -295,7 +295,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
 	struct fsl_ifc_nand_ctrl *nctrl = ifc_nand_ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 	u32 eccstat[4];
 	int i;
 
@@ -371,7 +371,7 @@ static void fsl_ifc_do_read(struct nand_chip *chip,
 {
 	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
 	/* Program FIR/IFC_NAND_FCR0 for Small/Large page */
 	if (mtd->writesize > 512) {
@@ -411,7 +411,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
 	struct nand_chip *chip = mtd_to_nand(mtd);
 	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
 	/* clear the read buffer */
 	ifc_nand_ctrl->read_bytes = 0;
@@ -723,7 +723,7 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
 {
 	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 	u32 nand_fsr;
 
 	/* Use READ_STATUS command, but wait for the device to be ready */
@@ -825,39 +825,42 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd)
 static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 {
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
+	struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
 	uint32_t csor = 0, csor_8k = 0, csor_ext = 0;
 	uint32_t cs = priv->bank;
 
 	/* Save CSOR and CSOR_ext */
-	csor = ifc_in32(&ifc->csor_cs[cs].csor);
-	csor_ext = ifc_in32(&ifc->csor_cs[cs].csor_ext);
+	csor = ifc_in32(&ifc_global->csor_cs[cs].csor);
+	csor_ext = ifc_in32(&ifc_global->csor_cs[cs].csor_ext);
 
 	/* chage PageSize 8K and SpareSize 1K*/
 	csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000;
-	ifc_out32(csor_8k, &ifc->csor_cs[cs].csor);
-	ifc_out32(0x0000400, &ifc->csor_cs[cs].csor_ext);
+	ifc_out32(csor_8k, &ifc_global->csor_cs[cs].csor);
+	ifc_out32(0x0000400, &ifc_global->csor_cs[cs].csor_ext);
 
 	/* READID */
 	ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-		  (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
-		  (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
-		  &ifc->ifc_nand.nand_fir0);
+		    (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
+		    (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
+		    &ifc_runtime->ifc_nand.nand_fir0);
 	ifc_out32(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
-		  &ifc->ifc_nand.nand_fcr0);
-	ifc_out32(0x0, &ifc->ifc_nand.row3);
+		    &ifc_runtime->ifc_nand.nand_fcr0);
+	ifc_out32(0x0, &ifc_runtime->ifc_nand.row3);
 
-	ifc_out32(0x0, &ifc->ifc_nand.nand_fbcr);
+	ifc_out32(0x0, &ifc_runtime->ifc_nand.nand_fbcr);
 
 	/* Program ROW0/COL0 */
-	ifc_out32(0x0, &ifc->ifc_nand.row0);
-	ifc_out32(0x0, &ifc->ifc_nand.col0);
+	ifc_out32(0x0, &ifc_runtime->ifc_nand.row0);
+	ifc_out32(0x0, &ifc_runtime->ifc_nand.col0);
 
 	/* set the chip select for NAND Transaction */
-	ifc_out32(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
+	ifc_out32(cs << IFC_NAND_CSEL_SHIFT,
+		&ifc_runtime->ifc_nand.nand_csel);
 
 	/* start read seq */
-	ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
+	ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT,
+		&ifc_runtime->ifc_nand.nandseq_strt);
 
 	/* wait for command complete flag or timeout */
 	wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -867,14 +870,15 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 		printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n");
 
 	/* Restore CSOR and CSOR_ext */
-	ifc_out32(csor, &ifc->csor_cs[cs].csor);
-	ifc_out32(csor_ext, &ifc->csor_cs[cs].csor_ext);
+	ifc_out32(csor, &ifc_global->csor_cs[cs].csor);
+	ifc_out32(csor_ext, &ifc_global->csor_cs[cs].csor_ext);
 }
 
 static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 {
 	struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-	struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+	struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
+	struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
 	struct nand_chip *chip = &priv->chip;
 	struct mtd_info *mtd = nand_to_mtd(&priv->chip);
 	struct nand_ecclayout *layout;
@@ -886,7 +890,8 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 
 	/* fill in nand_chip structure */
 	/* set up function call table */
-	if ((ifc_in32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
+	if ((ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr))
+		& CSPR_PORT_SIZE_16)
 		chip->read_byte = fsl_ifc_read_byte16;
 	else
 		chip->read_byte = fsl_ifc_read_byte;
@@ -900,13 +905,14 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 	chip->bbt_td = &bbt_main_descr;
 	chip->bbt_md = &bbt_mirror_descr;
 
-	ifc_out32(0x0, &ifc->ifc_nand.ncfgr);
+	ifc_out32(0x0, &ifc_runtime->ifc_nand.ncfgr);
 
 	/* set up nand options */
 	chip->bbt_options = NAND_BBT_USE_FLASH;
 	chip->options = NAND_NO_SUBPAGE_WRITE;
 
-	if (ifc_in32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
+	if (ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr)
+		& CSPR_PORT_SIZE_16) {
 		chip->read_byte = fsl_ifc_read_byte16;
 		chip->options |= NAND_BUSWIDTH_16;
 	} else {
@@ -919,7 +925,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 	chip->ecc.read_page = fsl_ifc_read_page;
 	chip->ecc.write_page = fsl_ifc_write_page;
 
-	csor = ifc_in32(&ifc->csor_cs[priv->bank].csor);
+	csor = ifc_in32(&ifc_global->csor_cs[priv->bank].csor);
 
 	/* Hardware generates ECC per 512 Bytes */
 	chip->ecc.size = 512;
@@ -1007,10 +1013,10 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv)
 	return 0;
 }
 
-static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank,
+static int match_bank(struct fsl_ifc_global __iomem *ifc_global, int bank,
 		      phys_addr_t addr)
 {
-	u32 cspr = ifc_in32(&ifc->cspr_cs[bank].cspr);
+	u32 cspr = ifc_in32(&ifc_global->cspr_cs[bank].cspr);
 
 	if (!(cspr & CSPR_V))
 		return 0;
@@ -1024,7 +1030,7 @@ static DEFINE_MUTEX(fsl_ifc_nand_mutex);
 
 static int fsl_ifc_nand_probe(struct platform_device *dev)
 {
-	struct fsl_ifc_regs __iomem *ifc;
+	struct fsl_ifc_runtime __iomem *ifc;
 	struct fsl_ifc_mtd *priv;
 	struct resource res;
 	static const char *part_probe_types[]
@@ -1034,9 +1040,9 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 	struct device_node *node = dev->dev.of_node;
 	struct mtd_info *mtd;
 
-	if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+	if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->rregs)
 		return -ENODEV;
-	ifc = fsl_ifc_ctrl_dev->regs;
+	ifc = fsl_ifc_ctrl_dev->rregs;
 
 	/* get, allocate and map the memory resource */
 	ret = of_address_to_resource(node, 0, &res);
@@ -1047,7 +1053,7 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 
 	/* find which chip select it is connected to */
 	for (bank = 0; bank < fsl_ifc_ctrl_dev->banks; bank++) {
-		if (match_bank(ifc, bank, res.start))
+		if (match_bank(fsl_ifc_ctrl_dev->gregs, bank, res.start))
 			break;
 	}
 
diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h
index 0023088b253b..3f9778cbc79d 100644
--- a/include/linux/fsl_ifc.h
+++ b/include/linux/fsl_ifc.h
@@ -39,6 +39,10 @@
 #define FSL_IFC_VERSION_MASK	0x0F0F0000
 #define FSL_IFC_VERSION_1_0_0	0x01000000
 #define FSL_IFC_VERSION_1_1_0	0x01010000
+#define FSL_IFC_VERSION_2_0_0	0x02000000
+
+#define PGOFFSET_64K	(64*1024)
+#define PGOFFSET_4K	(4*1024)
 
 /*
  * CSPR - Chip Select Property Register
@@ -723,20 +727,26 @@ struct fsl_ifc_nand {
 	__be32 nand_evter_en;
 	u32 res17[0x2];
 	__be32 nand_evter_intr_en;
-	u32 res18[0x2];
+	__be32 nand_vol_addr_stat;
+	u32 res18;
 	__be32 nand_erattr0;
 	__be32 nand_erattr1;
 	u32 res19[0x10];
 	__be32 nand_fsr;
-	u32 res20;
-	__be32 nand_eccstat[4];
-	u32 res21[0x20];
+	u32 res20[0x3];
+	__be32 nand_eccstat[6];
+	u32 res21[0x1c];
 	__be32 nanndcr;
 	u32 res22[0x2];
 	__be32 nand_autoboot_trgr;
 	u32 res23;
 	__be32 nand_mdr;
-	u32 res24[0x5C];
+	u32 res24[0x1C];
+	__be32 nand_dll_lowcfg0;
+	__be32 nand_dll_lowcfg1;
+	u32 res25;
+	__be32 nand_dll_lowstat;
+	u32 res26[0x3c];
 };
 
 /*
@@ -771,13 +781,12 @@ struct fsl_ifc_gpcm {
 	__be32 gpcm_erattr1;
 	__be32 gpcm_erattr2;
 	__be32 gpcm_stat;
-	u32 res4[0x1F3];
 };
 
 /*
  * IFC Controller Registers
  */
-struct fsl_ifc_regs {
+struct fsl_ifc_global {
 	__be32 ifc_rev;
 	u32 res1[0x2];
 	struct {
@@ -803,21 +812,26 @@ struct fsl_ifc_regs {
 	} ftim_cs[FSL_IFC_BANK_COUNT];
 	u32 res9[0x30];
 	__be32 rb_stat;
-	u32 res10[0x2];
+	__be32 rb_map;
+	__be32 wb_map;
 	__be32 ifc_gcr;
-	u32 res11[0x2];
+	u32 res10[0x2];
 	__be32 cm_evter_stat;
-	u32 res12[0x2];
+	u32 res11[0x2];
 	__be32 cm_evter_en;
-	u32 res13[0x2];
+	u32 res12[0x2];
 	__be32 cm_evter_intr_en;
-	u32 res14[0x2];
+	u32 res13[0x2];
 	__be32 cm_erattr0;
 	__be32 cm_erattr1;
-	u32 res15[0x2];
+	u32 res14[0x2];
 	__be32 ifc_ccr;
 	__be32 ifc_csr;
-	u32 res16[0x2EB];
+	__be32 ddr_ccr_low;
+};
+
+
+struct fsl_ifc_runtime {
 	struct fsl_ifc_nand ifc_nand;
 	struct fsl_ifc_nor ifc_nor;
 	struct fsl_ifc_gpcm ifc_gpcm;
@@ -831,7 +845,8 @@ extern int fsl_ifc_find(phys_addr_t addr_base);
 struct fsl_ifc_ctrl {
 	/* device info */
 	struct device			*dev;
-	struct fsl_ifc_regs __iomem	*regs;
+	struct fsl_ifc_global __iomem	*gregs;
+	struct fsl_ifc_runtime __iomem	*rregs;
 	int				irq;
 	int				nand_irq;
 	spinlock_t			lock;
-- 
cgit v1.2.3


From 9d02fc2a5129449581c3108c260e96377cf35f7e Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 26 Aug 2015 16:08:12 +0200
Subject: mtd: nand: export default read/write oob functions

Export the default read/write oob functions (for the standard and syndrome
scheme), so that drivers can use them for their raw implementation and
implement their own functions for the normal oob operation.

This is required if your ECC engine is capable of fixing some of the OOB
data. In this case you have to overload the ->read_oob() and ->write_oob(),
but if you don't specify the ->read/write_oob_raw() functions they are
assigned to the ->read/write_oob() implementation, which is not what you
want.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_base.c | 18 ++++++++++--------
 include/linux/mtd/nand.h     | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0f0c5b190316..13fcddc8a10e 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1893,13 +1893,13 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-			     int page)
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
 	chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
 	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
 	return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_std);
 
 /**
  * nand_read_oob_syndrome - [REPLACEABLE] OOB data read function for HW ECC
@@ -1908,8 +1908,8 @@ static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-				  int page)
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+			   int page)
 {
 	int length = mtd->oobsize;
 	int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
@@ -1937,6 +1937,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 
 	return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_syndrome);
 
 /**
  * nand_write_oob_std - [REPLACEABLE] the most common OOB data write function
@@ -1944,8 +1945,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-			      int page)
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
 	int status = 0;
 	const uint8_t *buf = chip->oob_poi;
@@ -1960,6 +1960,7 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
 
 	return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_std);
 
 /**
  * nand_write_oob_syndrome - [REPLACEABLE] OOB data write function for HW ECC
@@ -1968,8 +1969,8 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_syndrome(struct mtd_info *mtd,
-				   struct nand_chip *chip, int page)
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+			    int page)
 {
 	int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
 	int eccsize = chip->ecc.size, length = mtd->oobsize;
@@ -2019,6 +2020,7 @@ static int nand_write_oob_syndrome(struct mtd_info *mtd,
 
 	return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_syndrome);
 
 /**
  * nand_do_read_oob - [INTERN] NAND read out-of-band
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 1b673e19667c..7e06afb8552c 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1078,4 +1078,18 @@ int nand_check_erased_ecc_chunk(void *data, int datalen,
 				void *ecc, int ecclen,
 				void *extraoob, int extraooblen,
 				int threshold);
+
+/* Default write_oob implementation */
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default write_oob syndrome implementation */
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+			    int page);
+
+/* Default read_oob implementation */
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default read_oob syndrome implementation */
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+			   int page);
 #endif /* __LINUX_MTD_NAND_H */
-- 
cgit v1.2.3


From 75eb2cec251fda33c9bb716ecc372819abb9278a Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 4 Feb 2016 09:52:30 +0100
Subject: mtd: add mtd_ooblayout_xxx() helper functions

In order to make the ecclayout definition completely dynamic we need to
rework the way the OOB layout are defined and iterated.

Create a few mtd_ooblayout_xxx() helpers to ease OOB bytes manipulation
and hide ecclayout internals to their users.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/mtdcore.c   | 400 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/mtd.h |  33 ++++
 2 files changed, 433 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index bee180bd11e7..0290c41e44fc 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1016,6 +1016,406 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to,
 }
 EXPORT_SYMBOL_GPL(mtd_write_oob);
 
+/**
+ * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section
+ * @mtd: MTD device structure
+ * @section: ECC section. Depending on the layout you may have all the ECC
+ *	     bytes stored in a single contiguous section, or one section
+ *	     per ECC chunk (and sometime several sections for a single ECC
+ *	     ECC chunk)
+ * @oobecc: OOB region struct filled with the appropriate ECC position
+ *	    information
+ *
+ * This functions return ECC section information in the OOB area. I you want
+ * to get all the ECC bytes information, then you should call
+ * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+		      struct mtd_oob_region *oobecc)
+{
+	int eccbyte = 0, cursection = 0, length = 0, eccpos = 0;
+
+	memset(oobecc, 0, sizeof(*oobecc));
+
+	if (!mtd || section < 0)
+		return -EINVAL;
+
+	if (!mtd->ecclayout)
+		return -ENOTSUPP;
+
+	/*
+	 * This logic allows us to reuse the ->ecclayout information and
+	 * expose them as ECC regions (as done for the OOB free regions).
+	 *
+	 * TODO: this should be dropped as soon as we get rid of the
+	 * ->ecclayout field.
+	 */
+	for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) {
+		eccpos = mtd->ecclayout->eccpos[eccbyte];
+
+		if (eccbyte < mtd->ecclayout->eccbytes - 1) {
+			int neccpos = mtd->ecclayout->eccpos[eccbyte + 1];
+
+			if (eccpos + 1 == neccpos) {
+				length++;
+				continue;
+			}
+		}
+
+		if (section == cursection)
+			break;
+
+		length = 0;
+		cursection++;
+	}
+
+	if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes)
+		return -ERANGE;
+
+	oobecc->length = length + 1;
+	oobecc->offset = eccpos - length;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc);
+
+/**
+ * mtd_ooblayout_free - Get the OOB region definition of a specific free
+ *			section
+ * @mtd: MTD device structure
+ * @section: Free section you are interested in. Depending on the layout
+ *	     you may have all the free bytes stored in a single contiguous
+ *	     section, or one section per ECC chunk plus an extra section
+ *	     for the remaining bytes (or other funky layout).
+ * @oobfree: OOB region struct filled with the appropriate free position
+ *	     information
+ *
+ * This functions return free bytes position in the OOB area. I you want
+ * to get all the free bytes information, then you should call
+ * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+		       struct mtd_oob_region *oobfree)
+{
+	memset(oobfree, 0, sizeof(*oobfree));
+
+	if (!mtd || section < 0)
+		return -EINVAL;
+
+	if (!mtd->ecclayout)
+		return -ENOTSUPP;
+
+	if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE)
+		return -ERANGE;
+
+	oobfree->offset = mtd->ecclayout->oobfree[section].offset;
+	oobfree->length = mtd->ecclayout->oobfree[section].length;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_free);
+
+/**
+ * mtd_ooblayout_find_region - Find the region attached to a specific byte
+ * @mtd: mtd info structure
+ * @byte: the byte we are searching for
+ * @sectionp: pointer where the section id will be stored
+ * @oobregion: used to retrieve the ECC position
+ * @iter: iterator function. Should be either mtd_ooblayout_free or
+ *	  mtd_ooblayout_ecc depending on the region type you're searching for
+ *
+ * This functions returns the section id and oobregion information of a
+ * specific byte. For example, say you want to know where the 4th ECC byte is
+ * stored, you'll use:
+ *
+ * mtd_ooblayout_find_region(mtd, 3, &section, &oobregion, mtd_ooblayout_ecc);
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte,
+				int *sectionp, struct mtd_oob_region *oobregion,
+				int (*iter)(struct mtd_info *,
+					    int section,
+					    struct mtd_oob_region *oobregion))
+{
+	int pos = 0, ret, section = 0;
+
+	memset(oobregion, 0, sizeof(*oobregion));
+
+	while (1) {
+		ret = iter(mtd, section, oobregion);
+		if (ret)
+			return ret;
+
+		if (pos + oobregion->length > byte)
+			break;
+
+		pos += oobregion->length;
+		section++;
+	}
+
+	/*
+	 * Adjust region info to make it start at the beginning at the
+	 * 'start' ECC byte.
+	 */
+	oobregion->offset += byte - pos;
+	oobregion->length -= byte - pos;
+	*sectionp = section;
+
+	return 0;
+}
+
+/**
+ * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific
+ *				  ECC byte
+ * @mtd: mtd info structure
+ * @eccbyte: the byte we are searching for
+ * @sectionp: pointer where the section id will be stored
+ * @oobregion: OOB region information
+ *
+ * Works like mtd_ooblayout_find_region() except it searches for a specific ECC
+ * byte.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+				 int *section,
+				 struct mtd_oob_region *oobregion)
+{
+	return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion,
+					 mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion);
+
+/**
+ * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @buf: destination buffer to store OOB bytes
+ * @oobbuf: OOB buffer
+ * @start: first byte to retrieve
+ * @nbytes: number of bytes to retrieve
+ * @iter: section iterator
+ *
+ * Extract bytes attached to a specific category (ECC or free)
+ * from the OOB buffer and copy them into buf.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf,
+				const u8 *oobbuf, int start, int nbytes,
+				int (*iter)(struct mtd_info *,
+					    int section,
+					    struct mtd_oob_region *oobregion))
+{
+	struct mtd_oob_region oobregion = { };
+	int section = 0, ret;
+
+	ret = mtd_ooblayout_find_region(mtd, start, &section,
+					&oobregion, iter);
+
+	while (!ret) {
+		int cnt;
+
+		cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+		memcpy(buf, oobbuf + oobregion.offset, cnt);
+		buf += cnt;
+		nbytes -= cnt;
+
+		if (!nbytes)
+			break;
+
+		ret = iter(mtd, ++section, &oobregion);
+	}
+
+	return ret;
+}
+
+/**
+ * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @buf: source buffer to get OOB bytes from
+ * @oobbuf: OOB buffer
+ * @start: first OOB byte to set
+ * @nbytes: number of OOB bytes to set
+ * @iter: section iterator
+ *
+ * Fill the OOB buffer with data provided in buf. The category (ECC or free)
+ * is selected by passing the appropriate iterator.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf,
+				u8 *oobbuf, int start, int nbytes,
+				int (*iter)(struct mtd_info *,
+					    int section,
+					    struct mtd_oob_region *oobregion))
+{
+	struct mtd_oob_region oobregion = { };
+	int section = 0, ret;
+
+	ret = mtd_ooblayout_find_region(mtd, start, &section,
+					&oobregion, iter);
+
+	while (!ret) {
+		int cnt;
+
+		cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+		memcpy(oobbuf + oobregion.offset, buf, cnt);
+		buf += cnt;
+		nbytes -= cnt;
+
+		if (!nbytes)
+			break;
+
+		ret = iter(mtd, ++section, &oobregion);
+	}
+
+	return ret;
+}
+
+/**
+ * mtd_ooblayout_count_bytes - count the number of bytes in a OOB category
+ * @mtd: mtd info structure
+ * @iter: category iterator
+ *
+ * Count the number of bytes in a given category.
+ *
+ * Returns a positive value on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_count_bytes(struct mtd_info *mtd,
+				int (*iter)(struct mtd_info *,
+					    int section,
+					    struct mtd_oob_region *oobregion))
+{
+	struct mtd_oob_region oobregion = { };
+	int section = 0, ret, nbytes = 0;
+
+	while (1) {
+		ret = iter(mtd, section++, &oobregion);
+		if (ret) {
+			if (ret == -ERANGE)
+				ret = nbytes;
+			break;
+		}
+
+		nbytes += oobregion.length;
+	}
+
+	return ret;
+}
+
+/**
+ * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: destination buffer to store ECC bytes
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to retrieve
+ * @nbytes: number of ECC bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+			       const u8 *oobbuf, int start, int nbytes)
+{
+	return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+				       mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes);
+
+/**
+ * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: source buffer to get ECC bytes from
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to set
+ * @nbytes: number of ECC bytes to set
+ *
+ * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+			       u8 *oobbuf, int start, int nbytes)
+{
+	return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+				       mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes);
+
+/**
+ * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @databuf: destination buffer to store ECC bytes
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to retrieve
+ * @nbytes: number of ECC bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+				const u8 *oobbuf, int start, int nbytes)
+{
+	return mtd_ooblayout_get_bytes(mtd, databuf, oobbuf, start, nbytes,
+				       mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_databytes);
+
+/**
+ * mtd_ooblayout_get_eccbytes - set data bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: source buffer to get data bytes from
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to set
+ * @nbytes: number of ECC bytes to set
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+				u8 *oobbuf, int start, int nbytes)
+{
+	return mtd_ooblayout_set_bytes(mtd, databuf, oobbuf, start, nbytes,
+				       mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_databytes);
+
+/**
+ * mtd_ooblayout_count_freebytes - count the number of free bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it count free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd)
+{
+	return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_freebytes);
+
+/**
+ * mtd_ooblayout_count_freebytes - count the number of ECC bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it count ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd)
+{
+	return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes);
+
 /*
  * Method to access the protection register area, present in some flash
  * devices. The user data is one time programmable but the factory data is read
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index ef9fea4fc400..117ca1ff581d 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -108,6 +108,21 @@ struct nand_ecclayout {
 	struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE];
 };
 
+/**
+ * struct mtd_oob_region - oob region definition
+ * @offset: region offset
+ * @length: region length
+ *
+ * This structure describes a region of the OOB area, and is used
+ * to retrieve ECC or free bytes sections.
+ * Each section is defined by an offset within the OOB area and a
+ * length.
+ */
+struct mtd_oob_region {
+	u32 offset;
+	u32 length;
+};
+
 struct module;	/* only needed for owner field in mtd_info */
 
 struct mtd_info {
@@ -253,6 +268,24 @@ struct mtd_info {
 	int usecount;
 };
 
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+		      struct mtd_oob_region *oobecc);
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+				 int *section,
+				 struct mtd_oob_region *oobregion);
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+			       const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+			       u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+		       struct mtd_oob_region *oobfree);
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+				const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+				u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
+
 static inline void mtd_set_of_node(struct mtd_info *mtd,
 				   struct device_node *np)
 {
-- 
cgit v1.2.3


From 036d6543f85319ffe96afad6de73d3a220917a63 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Feb 2016 18:53:44 +0100
Subject: mtd: add mtd_set_ecclayout() helper function

Add an mtd_set_ecclayout() helper function to avoid direct accesses to the
mtd->ecclayout field. This will ease future reworks of ECC layout
definition.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 include/linux/mtd/mtd.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 117ca1ff581d..e62da8462493 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -286,6 +286,12 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
 int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
 int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
 
+static inline void mtd_set_ecclayout(struct mtd_info *mtd,
+				     struct nand_ecclayout *ecclayout)
+{
+	mtd->ecclayout = ecclayout;
+}
+
 static inline void mtd_set_of_node(struct mtd_info *mtd,
 				   struct device_node *np)
 {
-- 
cgit v1.2.3


From adbbc3bc827eb1f43a932d783f09ba55c8ec8379 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Feb 2016 19:01:31 +0100
Subject: mtd: create an mtd_ooblayout_ops struct to ease ECC layout definition

ECC layout definitions are currently exposed using the nand_ecclayout
struct which embeds oobfree and eccpos arrays with predefined size.
This approach was acceptable when NAND chips were providing relatively
small OOB regions, but MLC and TLC now provide OOB regions of several
hundreds of bytes, which implies a non negligible overhead for everybody
even those who only need to support legacy NANDs.

Create an mtd_ooblayout_ops interface providing the same functionality
(expose the ECC and oobfree layout) without the need for this huge
structure.

The mtd->ecclayout is now deprecated and should be replaced by the
equivalent mtd_ooblayout_ops. In the meantime we provide a wrapper around
the ->ecclayout field to ease migration to this new model.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/mtdchar.c   |   4 +-
 drivers/mtd/mtdconcat.c |   2 +-
 drivers/mtd/mtdcore.c   | 165 +++++++++++++++++++++++++++++++++++-------------
 drivers/mtd/mtdpart.c   |  23 ++++++-
 include/linux/mtd/mtd.h |  32 ++++++++--
 5 files changed, 174 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index cd64ab76dd7b..3fad2c7425b0 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -888,7 +888,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 	{
 		struct nand_oobinfo oi;
 
-		if (!mtd->ecclayout)
+		if (!mtd->ooblayout)
 			return -EOPNOTSUPP;
 
 		ret = get_oobinfo(mtd, &oi);
@@ -982,7 +982,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 	{
 		struct nand_ecclayout_user *usrlay;
 
-		if (!mtd->ecclayout)
+		if (!mtd->ooblayout)
 			return -EOPNOTSUPP;
 
 		usrlay = kmalloc(sizeof(*usrlay), GFP_KERNEL);
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 481565e5fbaa..d573606b91c2 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -777,7 +777,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],	/* subdevices to c
 
 	}
 
-	mtd_set_ecclayout(&concat->mtd, subdev[0]->ecclayout);
+	mtd_set_ooblayout(&concat->mtd, subdev[0]->ooblayout);
 
 	concat->num_subdev = num_devs;
 	concat->mtd.name = name;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 0290c41e44fc..134ed2f7b919 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1035,49 +1035,15 @@ EXPORT_SYMBOL_GPL(mtd_write_oob);
 int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
 		      struct mtd_oob_region *oobecc)
 {
-	int eccbyte = 0, cursection = 0, length = 0, eccpos = 0;
-
 	memset(oobecc, 0, sizeof(*oobecc));
 
 	if (!mtd || section < 0)
 		return -EINVAL;
 
-	if (!mtd->ecclayout)
+	if (!mtd->ooblayout || !mtd->ooblayout->ecc)
 		return -ENOTSUPP;
 
-	/*
-	 * This logic allows us to reuse the ->ecclayout information and
-	 * expose them as ECC regions (as done for the OOB free regions).
-	 *
-	 * TODO: this should be dropped as soon as we get rid of the
-	 * ->ecclayout field.
-	 */
-	for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) {
-		eccpos = mtd->ecclayout->eccpos[eccbyte];
-
-		if (eccbyte < mtd->ecclayout->eccbytes - 1) {
-			int neccpos = mtd->ecclayout->eccpos[eccbyte + 1];
-
-			if (eccpos + 1 == neccpos) {
-				length++;
-				continue;
-			}
-		}
-
-		if (section == cursection)
-			break;
-
-		length = 0;
-		cursection++;
-	}
-
-	if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes)
-		return -ERANGE;
-
-	oobecc->length = length + 1;
-	oobecc->offset = eccpos - length;
-
-	return 0;
+	return mtd->ooblayout->ecc(mtd, section, oobecc);
 }
 EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc);
 
@@ -1106,16 +1072,10 @@ int mtd_ooblayout_free(struct mtd_info *mtd, int section,
 	if (!mtd || section < 0)
 		return -EINVAL;
 
-	if (!mtd->ecclayout)
+	if (!mtd->ooblayout || !mtd->ooblayout->free)
 		return -ENOTSUPP;
 
-	if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE)
-		return -ERANGE;
-
-	oobfree->offset = mtd->ecclayout->oobfree[section].offset;
-	oobfree->length = mtd->ecclayout->oobfree[section].length;
-
-	return 0;
+	return mtd->ooblayout->free(mtd, section, oobfree);
 }
 EXPORT_SYMBOL_GPL(mtd_ooblayout_free);
 
@@ -1416,6 +1376,123 @@ int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd)
 }
 EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes);
 
+/**
+ * mtd_ecclayout_ecc - Default ooblayout_ecc iterator implementation
+ * @mtd: MTD device structure
+ * @section: ECC section. Depending on the layout you may have all the ECC
+ *	     bytes stored in a single contiguous section, or one section
+ *	     per ECC chunk (and sometime several sections for a single ECC
+ *	     ECC chunk)
+ * @oobecc: OOB region struct filled with the appropriate ECC position
+ *	    information
+ *
+ * This function is just a wrapper around the mtd->ecclayout field and is
+ * here to ease the transition to the mtd_ooblayout_ops approach.
+ * All it does is convert the layout->eccpos information into proper oob
+ * region definitions.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ecclayout_ecc(struct mtd_info *mtd, int section,
+			     struct mtd_oob_region *oobecc)
+{
+	int eccbyte = 0, cursection = 0, length = 0, eccpos = 0;
+
+	if (!mtd->ecclayout)
+		return -ENOTSUPP;
+
+	/*
+	 * This logic allows us to reuse the ->ecclayout information and
+	 * expose them as ECC regions (as done for the OOB free regions).
+	 *
+	 * TODO: this should be dropped as soon as we get rid of the
+	 * ->ecclayout field.
+	 */
+	for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) {
+		eccpos = mtd->ecclayout->eccpos[eccbyte];
+
+		if (eccbyte < mtd->ecclayout->eccbytes - 1) {
+			int neccpos = mtd->ecclayout->eccpos[eccbyte + 1];
+
+			if (eccpos + 1 == neccpos) {
+				length++;
+				continue;
+			}
+		}
+
+		if (section == cursection)
+			break;
+
+		length = 0;
+		cursection++;
+	}
+
+	if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes)
+		return -ERANGE;
+
+	oobecc->length = length + 1;
+	oobecc->offset = eccpos - length;
+
+	return 0;
+}
+
+/**
+ * mtd_ecclayout_ecc - Default ooblayout_free iterator implementation
+ * @mtd: MTD device structure
+ * @section: Free section. Depending on the layout you may have all the free
+ *	     bytes stored in a single contiguous section, or one section
+ *	     per ECC chunk (and sometime several sections for a single ECC
+ *	     ECC chunk)
+ * @oobfree: OOB region struct filled with the appropriate free position
+ *	     information
+ *
+ * This function is just a wrapper around the mtd->ecclayout field and is
+ * here to ease the transition to the mtd_ooblayout_ops approach.
+ * All it does is convert the layout->oobfree information into proper oob
+ * region definitions.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ecclayout_free(struct mtd_info *mtd, int section,
+			      struct mtd_oob_region *oobfree)
+{
+	struct nand_ecclayout *layout = mtd->ecclayout;
+
+	if (!layout)
+		return -ENOTSUPP;
+
+	if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE ||
+	    !layout->oobfree[section].length)
+		return -ERANGE;
+
+	oobfree->offset = layout->oobfree[section].offset;
+	oobfree->length = layout->oobfree[section].length;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops mtd_ecclayout_wrapper_ops = {
+	.ecc = mtd_ecclayout_ecc,
+	.free = mtd_ecclayout_free,
+};
+
+/**
+ * mtd_set_ecclayout - Attach an ecclayout to an MTD device
+ * @mtd: MTD device structure
+ * @ecclayout: The ecclayout to attach to the device
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout)
+{
+	if (!mtd || !ecclayout)
+		return;
+
+	mtd->ecclayout = ecclayout;
+	mtd_set_ooblayout(mtd, &mtd_ecclayout_wrapper_ops);
+}
+EXPORT_SYMBOL_GPL(mtd_set_ecclayout);
+
 /*
  * Method to access the protection register area, present in some flash
  * devices. The user data is one time programmable but the factory data is read
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index f53d9d72b23a..1f13e32556f8 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -317,6 +317,27 @@ static int part_block_markbad(struct mtd_info *mtd, loff_t ofs)
 	return res;
 }
 
+static int part_ooblayout_ecc(struct mtd_info *mtd, int section,
+			      struct mtd_oob_region *oobregion)
+{
+	struct mtd_part *part = mtd_to_part(mtd);
+
+	return mtd_ooblayout_ecc(part->master, section, oobregion);
+}
+
+static int part_ooblayout_free(struct mtd_info *mtd, int section,
+			       struct mtd_oob_region *oobregion)
+{
+	struct mtd_part *part = mtd_to_part(mtd);
+
+	return mtd_ooblayout_free(part->master, section, oobregion);
+}
+
+static const struct mtd_ooblayout_ops part_ooblayout_ops = {
+	.ecc = part_ooblayout_ecc,
+	.free = part_ooblayout_free,
+};
+
 static inline void free_partition(struct mtd_part *p)
 {
 	kfree(p->mtd.name);
@@ -533,7 +554,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
 			part->name);
 	}
 
-	mtd_set_ecclayout(&slave->mtd, master->ecclayout);
+	mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops);
 	slave->mtd.ecc_step_size = master->ecc_step_size;
 	slave->mtd.ecc_strength = master->ecc_strength;
 	slave->mtd.bitflip_threshold = master->bitflip_threshold;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index e62da8462493..177bf314ad70 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -101,6 +101,9 @@ struct mtd_oob_ops {
  * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained
  * for export to user-space via the ECCGETLAYOUT ioctl.
  * nand_ecclayout should be expandable in the future simply by the above macros.
+ *
+ * This structure is now deprecated, you should use struct nand_ecclayout_ops
+ * to describe your OOB layout.
  */
 struct nand_ecclayout {
 	__u32 eccbytes;
@@ -123,6 +126,22 @@ struct mtd_oob_region {
 	u32 length;
 };
 
+/*
+ * struct mtd_ooblayout_ops - NAND OOB layout operations
+ * @ecc: function returning an ECC region in the OOB area.
+ *	 Should return -ERANGE if %section exceeds the total number of
+ *	 ECC sections.
+ * @free: function returning a free region in the OOB area.
+ *	  Should return -ERANGE if %section exceeds the total number of
+ *	  free sections.
+ */
+struct mtd_ooblayout_ops {
+	int (*ecc)(struct mtd_info *mtd, int section,
+		   struct mtd_oob_region *oobecc);
+	int (*free)(struct mtd_info *mtd, int section,
+		    struct mtd_oob_region *oobfree);
+};
+
 struct module;	/* only needed for owner field in mtd_info */
 
 struct mtd_info {
@@ -181,9 +200,12 @@ struct mtd_info {
 	const char *name;
 	int index;
 
-	/* ECC layout structure pointer - read only! */
+	/* [Deprecated] ECC layout structure pointer - read only! */
 	struct nand_ecclayout *ecclayout;
 
+	/* OOB layout description */
+	const struct mtd_ooblayout_ops *ooblayout;
+
 	/* the ecc step size. */
 	unsigned int ecc_step_size;
 
@@ -286,10 +308,12 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
 int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
 int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
 
-static inline void mtd_set_ecclayout(struct mtd_info *mtd,
-				     struct nand_ecclayout *ecclayout)
+void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout);
+
+static inline void mtd_set_ooblayout(struct mtd_info *mtd,
+				     const struct mtd_ooblayout_ops *ooblayout)
 {
-	mtd->ecclayout = ecclayout;
+	mtd->ooblayout = ooblayout;
 }
 
 static inline void mtd_set_of_node(struct mtd_info *mtd,
-- 
cgit v1.2.3


From 41b207a70d3a86b9e2eede155e87838234c7cbd5 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Feb 2016 19:06:15 +0100
Subject: mtd: nand: implement the default mtd_ooblayout_ops

Replace the default nand_ecclayout definitions for large and small page
devices with the equivalent mtd_ooblayout_ops.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_base.c | 142 ++++++++++++++++++++++++++++---------------
 include/linux/mtd/nand.h     |   3 +
 2 files changed, 96 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index feb1448e9dd6..e8332ea45739 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -47,54 +47,96 @@
 #include <linux/mtd/partitions.h>
 #include <linux/of_mtd.h>
 
+static int nand_get_device(struct mtd_info *mtd, int new_state);
+
+static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops);
+
 /* Define default oob placement schemes for large and small page devices */
-static struct nand_ecclayout nand_oob_8 = {
-	.eccbytes = 3,
-	.eccpos = {0, 1, 2},
-	.oobfree = {
-		{.offset = 3,
-		 .length = 2},
-		{.offset = 6,
-		 .length = 2} }
-};
+static int nand_ooblayout_ecc_sp(struct mtd_info *mtd, int section,
+				 struct mtd_oob_region *oobregion)
+{
+	struct nand_chip *chip = mtd_to_nand(mtd);
+	struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static struct nand_ecclayout nand_oob_16 = {
-	.eccbytes = 6,
-	.eccpos = {0, 1, 2, 3, 6, 7},
-	.oobfree = {
-		{.offset = 8,
-		 . length = 8} }
-};
+	if (section > 1)
+		return -ERANGE;
 
-static struct nand_ecclayout nand_oob_64 = {
-	.eccbytes = 24,
-	.eccpos = {
-		   40, 41, 42, 43, 44, 45, 46, 47,
-		   48, 49, 50, 51, 52, 53, 54, 55,
-		   56, 57, 58, 59, 60, 61, 62, 63},
-	.oobfree = {
-		{.offset = 2,
-		 .length = 38} }
-};
+	if (!section) {
+		oobregion->offset = 0;
+		oobregion->length = 4;
+	} else {
+		oobregion->offset = 6;
+		oobregion->length = ecc->total - 4;
+	}
 
-static struct nand_ecclayout nand_oob_128 = {
-	.eccbytes = 48,
-	.eccpos = {
-		   80, 81, 82, 83, 84, 85, 86, 87,
-		   88, 89, 90, 91, 92, 93, 94, 95,
-		   96, 97, 98, 99, 100, 101, 102, 103,
-		   104, 105, 106, 107, 108, 109, 110, 111,
-		   112, 113, 114, 115, 116, 117, 118, 119,
-		   120, 121, 122, 123, 124, 125, 126, 127},
-	.oobfree = {
-		{.offset = 2,
-		 .length = 78} }
+	return 0;
+}
+
+static int nand_ooblayout_free_sp(struct mtd_info *mtd, int section,
+				  struct mtd_oob_region *oobregion)
+{
+	if (section > 1)
+		return -ERANGE;
+
+	if (mtd->oobsize == 16) {
+		if (section)
+			return -ERANGE;
+
+		oobregion->length = 8;
+		oobregion->offset = 8;
+	} else {
+		oobregion->length = 2;
+		if (!section)
+			oobregion->offset = 3;
+		else
+			oobregion->offset = 6;
+	}
+
+	return 0;
+}
+
+const struct mtd_ooblayout_ops nand_ooblayout_sp_ops = {
+	.ecc = nand_ooblayout_ecc_sp,
+	.free = nand_ooblayout_free_sp,
 };
+EXPORT_SYMBOL_GPL(nand_ooblayout_sp_ops);
 
-static int nand_get_device(struct mtd_info *mtd, int new_state);
+static int nand_ooblayout_ecc_lp(struct mtd_info *mtd, int section,
+				 struct mtd_oob_region *oobregion)
+{
+	struct nand_chip *chip = mtd_to_nand(mtd);
+	struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
-			     struct mtd_oob_ops *ops);
+	if (section)
+		return -ERANGE;
+
+	oobregion->length = ecc->total;
+	oobregion->offset = mtd->oobsize - oobregion->length;
+
+	return 0;
+}
+
+static int nand_ooblayout_free_lp(struct mtd_info *mtd, int section,
+				  struct mtd_oob_region *oobregion)
+{
+	struct nand_chip *chip = mtd_to_nand(mtd);
+	struct nand_ecc_ctrl *ecc = &chip->ecc;
+
+	if (section)
+		return -ERANGE;
+
+	oobregion->length = mtd->oobsize - ecc->total - 2;
+	oobregion->offset = 2;
+
+	return 0;
+}
+
+const struct mtd_ooblayout_ops nand_ooblayout_lp_ops = {
+	.ecc = nand_ooblayout_ecc_lp,
+	.free = nand_ooblayout_free_lp,
+};
+EXPORT_SYMBOL_GPL(nand_ooblayout_lp_ops);
 
 static int check_offs_len(struct mtd_info *mtd,
 					loff_t ofs, uint64_t len)
@@ -4109,22 +4151,25 @@ int nand_scan_tail(struct mtd_info *mtd)
 	/* Set the internal oob buffer location, just after the page data */
 	chip->oob_poi = chip->buffers->databuf + mtd->writesize;
 
+	/*
+	 * Set the provided ECC layout. If ecc->layout is NULL, the MTD core
+	 * will just leave mtd->ooblayout to NULL, if it's not NULL, it will
+	 * set ->ooblayout to the default ecclayout wrapper.
+	 */
+	mtd_set_ecclayout(mtd, ecc->layout);
+
 	/*
 	 * If no default placement scheme is given, select an appropriate one.
 	 */
-	if (!ecc->layout && (ecc->mode != NAND_ECC_SOFT_BCH)) {
+	if (!mtd->ooblayout && (ecc->mode != NAND_ECC_SOFT_BCH)) {
 		switch (mtd->oobsize) {
 		case 8:
-			ecc->layout = &nand_oob_8;
-			break;
 		case 16:
-			ecc->layout = &nand_oob_16;
+			mtd_set_ooblayout(mtd, &nand_ooblayout_sp_ops);
 			break;
 		case 64:
-			ecc->layout = &nand_oob_64;
-			break;
 		case 128:
-			ecc->layout = &nand_oob_128;
+			mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
 			break;
 		default:
 			WARN(1, "No oob scheme defined for oobsize %d\n",
@@ -4285,7 +4330,6 @@ int nand_scan_tail(struct mtd_info *mtd)
 		ecc->write_oob_raw = ecc->write_oob;
 
 	/* propagate ecc info to mtd_info */
-	mtd_set_ecclayout(mtd, ecc->layout);
 	mtd->ecc_strength = ecc->strength;
 	mtd->ecc_step_size = ecc->size;
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 7e06afb8552c..f2ded7b1b3b8 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -748,6 +748,9 @@ struct nand_chip {
 	void *priv;
 };
 
+extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops;
+extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops;
+
 static inline void nand_set_flash_node(struct nand_chip *chip,
 				       struct device_node *np)
 {
-- 
cgit v1.2.3


From 74e1fbb1375a3ede3e17da22911761ce9bc8f53f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 4 Apr 2016 17:49:17 +0200
Subject: of: Introduce struct of_phandle_iterator

This struct carrys all necessary information to iterate over
a list of phandles and extract the arguments. Add an
init-function for the iterator and make use of it in
__of_parse_phandle_with_args().

Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/base.c  | 99 +++++++++++++++++++++++++++++++++---------------------
 include/linux/of.h | 33 ++++++++++++++++++
 2 files changed, 93 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index b299de2b3afa..1c6f43b5737d 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1440,35 +1440,56 @@ void of_print_phandle_args(const char *msg, const struct of_phandle_args *args)
 	printk("\n");
 }
 
+int of_phandle_iterator_init(struct of_phandle_iterator *it,
+		const struct device_node *np,
+		const char *list_name,
+		const char *cells_name,
+		int cell_count)
+{
+	const __be32 *list;
+	int size;
+
+	memset(it, 0, sizeof(*it));
+
+	list = of_get_property(np, list_name, &size);
+	if (!list)
+		return -ENOENT;
+
+	it->cells_name = cells_name;
+	it->cell_count = cell_count;
+	it->parent = np;
+	it->list_end = list + size / sizeof(*list);
+	it->phandle_end = list;
+	it->cur = list;
+
+	return 0;
+}
+
 static int __of_parse_phandle_with_args(const struct device_node *np,
 					const char *list_name,
 					const char *cells_name,
 					int cell_count, int index,
 					struct of_phandle_args *out_args)
 {
-	const __be32 *list, *list_end;
-	int rc = 0, size, cur_index = 0;
-	uint32_t count = 0;
-	struct device_node *node = NULL;
-	phandle phandle;
+	struct of_phandle_iterator it;
+	int rc, cur_index = 0;
 
-	/* Retrieve the phandle list property */
-	list = of_get_property(np, list_name, &size);
-	if (!list)
-		return -ENOENT;
-	list_end = list + size / sizeof(*list);
+	rc = of_phandle_iterator_init(&it, np, list_name,
+				      cells_name, cell_count);
+	if (rc)
+		return rc;
 
 	/* Loop over the phandles until all the requested entry is found */
-	while (list < list_end) {
+	while (it.cur < it.list_end) {
 		rc = -EINVAL;
-		count = 0;
+		it.cur_count = 0;
 
 		/*
 		 * If phandle is 0, then it is an empty entry with no
 		 * arguments.  Skip forward to the next entry.
 		 */
-		phandle = be32_to_cpup(list++);
-		if (phandle) {
+		it.phandle = be32_to_cpup(it.cur++);
+		if (it.phandle) {
 			/*
 			 * Find the provider node and parse the #*-cells
 			 * property to determine the argument length.
@@ -1478,34 +1499,34 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 			 * except when we're going to return the found node
 			 * below.
 			 */
-			if (cells_name || cur_index == index) {
-				node = of_find_node_by_phandle(phandle);
-				if (!node) {
+			if (it.cells_name || cur_index == index) {
+				it.node = of_find_node_by_phandle(it.phandle);
+				if (!it.node) {
 					pr_err("%s: could not find phandle\n",
-						np->full_name);
+						it.parent->full_name);
 					goto err;
 				}
 			}
 
-			if (cells_name) {
-				if (of_property_read_u32(node, cells_name,
-							 &count)) {
+			if (it.cells_name) {
+				if (of_property_read_u32(it.node, it.cells_name,
+							 &it.cur_count)) {
 					pr_err("%s: could not get %s for %s\n",
-						np->full_name, cells_name,
-						node->full_name);
+						it.parent->full_name, it.cells_name,
+						it.node->full_name);
 					goto err;
 				}
 			} else {
-				count = cell_count;
+				it.cur_count = it.cell_count;
 			}
 
 			/*
 			 * Make sure that the arguments actually fit in the
 			 * remaining property data length
 			 */
-			if (list + count > list_end) {
+			if (it.cur + it.cur_count > it.list_end) {
 				pr_err("%s: arguments longer than property\n",
-					 np->full_name);
+					 it.parent->full_name);
 				goto err;
 			}
 		}
@@ -1518,28 +1539,28 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 		 */
 		rc = -ENOENT;
 		if (cur_index == index) {
-			if (!phandle)
+			if (!it.phandle)
 				goto err;
 
 			if (out_args) {
 				int i;
-				if (WARN_ON(count > MAX_PHANDLE_ARGS))
-					count = MAX_PHANDLE_ARGS;
-				out_args->np = node;
-				out_args->args_count = count;
-				for (i = 0; i < count; i++)
-					out_args->args[i] = be32_to_cpup(list++);
+				if (WARN_ON(it.cur_count > MAX_PHANDLE_ARGS))
+					it.cur_count = MAX_PHANDLE_ARGS;
+				out_args->np = it.node;
+				out_args->args_count = it.cur_count;
+				for (i = 0; i < it.cur_count; i++)
+					out_args->args[i] = be32_to_cpup(it.cur++);
 			} else {
-				of_node_put(node);
+				of_node_put(it.node);
 			}
 
 			/* Found it! return success */
 			return 0;
 		}
 
-		of_node_put(node);
-		node = NULL;
-		list += count;
+		of_node_put(it.node);
+		it.node = NULL;
+		it.cur += it.cur_count;
 		cur_index++;
 	}
 
@@ -1551,8 +1572,8 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 	 */
 	rc = index < 0 ? cur_index : -ENOENT;
  err:
-	if (node)
-		of_node_put(node);
+	if (it.node)
+		of_node_put(it.node);
 	return rc;
 }
 
diff --git a/include/linux/of.h b/include/linux/of.h
index 7fcb681baadf..0f187dbb890b 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -75,6 +75,23 @@ struct of_phandle_args {
 	uint32_t args[MAX_PHANDLE_ARGS];
 };
 
+struct of_phandle_iterator {
+	/* Common iterator information */
+	const char *cells_name;
+	int cell_count;
+	const struct device_node *parent;
+
+	/* List size information */
+	const __be32 *list_end;
+	const __be32 *phandle_end;
+
+	/* Current position state */
+	const __be32 *cur;
+	uint32_t cur_count;
+	phandle phandle;
+	struct device_node *node;
+};
+
 struct of_reconfig_data {
 	struct device_node	*dn;
 	struct property		*prop;
@@ -334,6 +351,13 @@ extern int of_parse_phandle_with_fixed_args(const struct device_node *np,
 extern int of_count_phandle_with_args(const struct device_node *np,
 	const char *list_name, const char *cells_name);
 
+/* phandle iterator functions */
+extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
+				    const struct device_node *np,
+				    const char *list_name,
+				    const char *cells_name,
+				    int cell_count);
+
 extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align));
 extern int of_alias_get_id(struct device_node *np, const char *stem);
 extern int of_alias_get_highest_id(const char *stem);
@@ -608,6 +632,15 @@ static inline int of_count_phandle_with_args(struct device_node *np,
 	return -ENOSYS;
 }
 
+static inline int of_phandle_iterator_init(struct of_phandle_iterator *it,
+					   const struct device_node *np,
+					   const char *list_name,
+					   const char *cells_name,
+					   int cell_count)
+{
+	return -ENOSYS;
+}
+
 static inline int of_alias_get_id(struct device_node *np, const char *stem)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From cd209b412c8a5d632b51af1e45576f0d00b8105f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 4 Apr 2016 17:49:18 +0200
Subject: of: Move phandle walking to of_phandle_iterator_next()

Move the code to walk over the phandles out of the loop in
__of_parse_phandle_with_args() to a separate function that
just works with the iterator handle: of_phandle_iterator_next().

Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/base.c  | 130 ++++++++++++++++++++++++++++++-----------------------
 include/linux/of.h |   7 +++
 2 files changed, 81 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 1c6f43b5737d..69286ec206f7 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1465,6 +1465,75 @@ int of_phandle_iterator_init(struct of_phandle_iterator *it,
 	return 0;
 }
 
+int of_phandle_iterator_next(struct of_phandle_iterator *it)
+{
+	uint32_t count = 0;
+
+	if (it->node) {
+		of_node_put(it->node);
+		it->node = NULL;
+	}
+
+	if (!it->cur || it->phandle_end >= it->list_end)
+		return -ENOENT;
+
+	it->cur = it->phandle_end;
+
+	/* If phandle is 0, then it is an empty entry with no arguments. */
+	it->phandle = be32_to_cpup(it->cur++);
+
+	if (it->phandle) {
+
+		/*
+		 * Find the provider node and parse the #*-cells property to
+		 * determine the argument length.
+		 */
+		it->node = of_find_node_by_phandle(it->phandle);
+
+		if (it->cells_name) {
+			if (!it->node) {
+				pr_err("%s: could not find phandle\n",
+				       it->parent->full_name);
+				goto err;
+			}
+
+			if (of_property_read_u32(it->node, it->cells_name,
+						 &count)) {
+				pr_err("%s: could not get %s for %s\n",
+				       it->parent->full_name,
+				       it->cells_name,
+				       it->node->full_name);
+				goto err;
+			}
+		} else {
+			count = it->cell_count;
+		}
+
+		/*
+		 * Make sure that the arguments actually fit in the remaining
+		 * property data length
+		 */
+		if (it->cur + count > it->list_end) {
+			pr_err("%s: arguments longer than property\n",
+			       it->parent->full_name);
+			goto err;
+		}
+	}
+
+	it->phandle_end = it->cur + count;
+	it->cur_count = count;
+
+	return 0;
+
+err:
+	if (it->node) {
+		of_node_put(it->node);
+		it->node = NULL;
+	}
+
+	return -EINVAL;
+}
+
 static int __of_parse_phandle_with_args(const struct device_node *np,
 					const char *list_name,
 					const char *cells_name,
@@ -1480,59 +1549,9 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 		return rc;
 
 	/* Loop over the phandles until all the requested entry is found */
-	while (it.cur < it.list_end) {
-		rc = -EINVAL;
-		it.cur_count = 0;
-
-		/*
-		 * If phandle is 0, then it is an empty entry with no
-		 * arguments.  Skip forward to the next entry.
-		 */
-		it.phandle = be32_to_cpup(it.cur++);
-		if (it.phandle) {
-			/*
-			 * Find the provider node and parse the #*-cells
-			 * property to determine the argument length.
-			 *
-			 * This is not needed if the cell count is hard-coded
-			 * (i.e. cells_name not set, but cell_count is set),
-			 * except when we're going to return the found node
-			 * below.
-			 */
-			if (it.cells_name || cur_index == index) {
-				it.node = of_find_node_by_phandle(it.phandle);
-				if (!it.node) {
-					pr_err("%s: could not find phandle\n",
-						it.parent->full_name);
-					goto err;
-				}
-			}
-
-			if (it.cells_name) {
-				if (of_property_read_u32(it.node, it.cells_name,
-							 &it.cur_count)) {
-					pr_err("%s: could not get %s for %s\n",
-						it.parent->full_name, it.cells_name,
-						it.node->full_name);
-					goto err;
-				}
-			} else {
-				it.cur_count = it.cell_count;
-			}
-
-			/*
-			 * Make sure that the arguments actually fit in the
-			 * remaining property data length
-			 */
-			if (it.cur + it.cur_count > it.list_end) {
-				pr_err("%s: arguments longer than property\n",
-					 it.parent->full_name);
-				goto err;
-			}
-		}
-
+	while ((rc = of_phandle_iterator_next(&it)) == 0) {
 		/*
-		 * All of the error cases above bail out of the loop, so at
+		 * All of the error cases bail out of the loop, so at
 		 * this point, the parsing is successful. If the requested
 		 * index matches, then fill the out_args structure and return,
 		 * or return -ENOENT for an empty entry.
@@ -1558,9 +1577,6 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 			return 0;
 		}
 
-		of_node_put(it.node);
-		it.node = NULL;
-		it.cur += it.cur_count;
 		cur_index++;
 	}
 
@@ -1570,7 +1586,9 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 	 * -EINVAL : parsing error on data
 	 * [1..n]  : Number of phandle (count mode; when index = -1)
 	 */
-	rc = index < 0 ? cur_index : -ENOENT;
+	if (rc == -ENOENT && index < 0)
+		rc = cur_index;
+
  err:
 	if (it.node)
 		of_node_put(it.node);
diff --git a/include/linux/of.h b/include/linux/of.h
index 0f187dbb890b..1f5e108f6716 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -358,6 +358,8 @@ extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
 				    const char *cells_name,
 				    int cell_count);
 
+extern int of_phandle_iterator_next(struct of_phandle_iterator *it);
+
 extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align));
 extern int of_alias_get_id(struct device_node *np, const char *stem);
 extern int of_alias_get_highest_id(const char *stem);
@@ -641,6 +643,11 @@ static inline int of_phandle_iterator_init(struct of_phandle_iterator *it,
 	return -ENOSYS;
 }
 
+static inline int of_phandle_iterator_next(struct of_phandle_iterator *it)
+{
+	return -ENOSYS;
+}
+
 static inline int of_alias_get_id(struct device_node *np, const char *stem)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From f623ce95a51baee6a6638f0b025efc0229a9ac0d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 4 Apr 2016 17:49:20 +0200
Subject: of: Introduce of_for_each_phandle() helper macro

With this macro any user can easily iterate over a list of
phandles. The patch also converts __of_parse_phandle_with_args()
to make use of the macro.

The of_count_phandle_with_args() function is not converted,
because the macro hides the return value of of_phandle_iterator_init(),
which is needed in there.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/base.c  | 7 +------
 include/linux/of.h | 6 ++++++
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index fcff2b62ec10..ea5a13d3c5a5 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1543,13 +1543,8 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 	struct of_phandle_iterator it;
 	int rc, cur_index = 0;
 
-	rc = of_phandle_iterator_init(&it, np, list_name,
-				      cells_name, cell_count);
-	if (rc)
-		return rc;
-
 	/* Loop over the phandles until all the requested entry is found */
-	while ((rc = of_phandle_iterator_next(&it)) == 0) {
+	of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) {
 		/*
 		 * All of the error cases bail out of the loop, so at
 		 * this point, the parsing is successful. If the requested
diff --git a/include/linux/of.h b/include/linux/of.h
index 1f5e108f6716..b0b80716fbfb 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -908,6 +908,12 @@ static inline int of_property_read_s32(const struct device_node *np,
 	return of_property_read_u32(np, propname, (u32*) out_value);
 }
 
+#define of_for_each_phandle(it, err, np, ln, cn, cc)			\
+	for (of_phandle_iterator_init((it), (np), (ln), (cn), (cc)),	\
+	     err = of_phandle_iterator_next(it);			\
+	     err == 0;							\
+	     err = of_phandle_iterator_next(it))
+
 #define of_property_for_each_u32(np, propname, prop, p, u)	\
 	for (prop = of_find_property(np, propname, NULL),	\
 		p = of_prop_next_u32(prop, NULL, &u);		\
-- 
cgit v1.2.3


From abdaa77b18480361f3565d958a2acffad268c39c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 4 Apr 2016 17:49:21 +0200
Subject: of: Introduce of_phandle_iterator_args()

This helper function can be used to copy the arguments of a
phandle to an array.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/base.c  | 29 +++++++++++++++++++++++------
 include/linux/of.h | 10 ++++++++++
 2 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index ea5a13d3c5a5..e87e21df19d8 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1534,6 +1534,23 @@ err:
 	return -EINVAL;
 }
 
+int of_phandle_iterator_args(struct of_phandle_iterator *it,
+			     uint32_t *args,
+			     int size)
+{
+	int i, count;
+
+	count = it->cur_count;
+
+	if (WARN_ON(size < count))
+		count = size;
+
+	for (i = 0; i < count; i++)
+		args[i] = be32_to_cpup(it->cur++);
+
+	return count;
+}
+
 static int __of_parse_phandle_with_args(const struct device_node *np,
 					const char *list_name,
 					const char *cells_name,
@@ -1557,13 +1574,13 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
 				goto err;
 
 			if (out_args) {
-				int i;
-				if (WARN_ON(it.cur_count > MAX_PHANDLE_ARGS))
-					it.cur_count = MAX_PHANDLE_ARGS;
+				int c;
+
+				c = of_phandle_iterator_args(&it,
+							     out_args->args,
+							     MAX_PHANDLE_ARGS);
 				out_args->np = it.node;
-				out_args->args_count = it.cur_count;
-				for (i = 0; i < it.cur_count; i++)
-					out_args->args[i] = be32_to_cpup(it.cur++);
+				out_args->args_count = c;
 			} else {
 				of_node_put(it.node);
 			}
diff --git a/include/linux/of.h b/include/linux/of.h
index b0b80716fbfb..71e1c35a5960 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -359,6 +359,9 @@ extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
 				    int cell_count);
 
 extern int of_phandle_iterator_next(struct of_phandle_iterator *it);
+extern int of_phandle_iterator_args(struct of_phandle_iterator *it,
+				    uint32_t *args,
+				    int size);
 
 extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align));
 extern int of_alias_get_id(struct device_node *np, const char *stem);
@@ -648,6 +651,13 @@ static inline int of_phandle_iterator_next(struct of_phandle_iterator *it)
 	return -ENOSYS;
 }
 
+static inline int of_phandle_iterator_args(struct of_phandle_iterator *it,
+					   uint32_t *args,
+					   int size)
+{
+	return 0;
+}
+
 static inline int of_alias_get_id(struct device_node *np, const char *stem)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:55 -0600
Subject: tracing: Add enable_hist/disable_hist triggers

Similar to enable_event/disable_event triggers, these triggers enable
and disable the aggregation of events into maps rather than enabling
and disabling their writing into the trace buffer.

They can be used to automatically start and stop hist triggers based
on a matching filter condition.

If there's a paused hist trigger on system:event, the following would
start it when the filter condition was hit:

  # echo enable_hist:system:event [ if filter] > event/trigger

And the following would disable a running system:event hist trigger:

  # echo disable_hist:system:event [ if filter] > event/trigger

See Documentation/trace/events.txt for real examples.

Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |   1 +
 kernel/trace/trace.c                |   8 +++
 kernel/trace/trace.h                |  32 ++++++++++
 kernel/trace/trace_events_hist.c    | 115 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |  71 ++++++++++++----------
 5 files changed, 196 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 404603720650..5f89a5b0c7e6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -408,6 +408,7 @@ enum event_trigger_type {
 	ETT_STACKTRACE		= (1 << 2),
 	ETT_EVENT_ENABLE	= (1 << 3),
 	ETT_EVENT_HIST		= (1 << 4),
+	ETT_HIST_ENABLE		= (1 << 5),
 };
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2238bfde799b..8430145bea12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3807,6 +3807,10 @@ static const char readme_msg[] =
 	"\t   trigger: traceon, traceoff\n"
 	"\t            enable_event:<system>:<event>\n"
 	"\t            disable_event:<system>:<event>\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t            enable_hist:<system>:<event>\n"
+	"\t            disable_hist:<system>:<event>\n"
+#endif
 #ifdef CONFIG_STACKTRACE
 	"\t\t    stacktrace\n"
 #endif
@@ -3867,6 +3871,10 @@ static const char readme_msg[] =
 	"\t    The 'clear' parameter will clear the contents of a running\n"
 	"\t    hist trigger and leave its current paused/active state\n"
 	"\t    unchanged.\n\n"
+	"\t    The enable_hist and disable_hist triggers can be used to\n"
+	"\t    have one event conditionally start and stop another event's\n"
+	"\t    already-attached hist trigger.  The syntax is analagous to\n"
+	"\t    the enable_event and disable_event triggers.\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 505f8a45f426..cab1f4bfe85b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops;
 
 #ifdef CONFIG_HIST_TRIGGERS
 extern int register_trigger_hist_cmd(void);
+extern int register_trigger_hist_enable_disable_cmds(void);
 #else
 static inline int register_trigger_hist_cmd(void) { return 0; }
+static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
 #endif
 
 extern int register_trigger_cmds(void);
@@ -1185,6 +1187,34 @@ struct event_trigger_data {
 	struct list_head		list;
 };
 
+/* Avoid typos */
+#define ENABLE_EVENT_STR	"enable_event"
+#define DISABLE_EVENT_STR	"disable_event"
+#define ENABLE_HIST_STR		"enable_hist"
+#define DISABLE_HIST_STR	"disable_hist"
+
+struct enable_trigger_data {
+	struct trace_event_file		*file;
+	bool				enable;
+	bool				hist;
+};
+
+extern int event_enable_trigger_print(struct seq_file *m,
+				      struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern int event_enable_trigger_func(struct event_command *cmd_ops,
+				     struct trace_event_file *file,
+				     char *glob, char *cmd, char *param);
+extern int event_enable_register_trigger(char *glob,
+					 struct event_trigger_ops *ops,
+					 struct event_trigger_data *data,
+					 struct trace_event_file *file);
+extern void event_enable_unregister_trigger(char *glob,
+					    struct event_trigger_ops *ops,
+					    struct event_trigger_data *test,
+					    struct trace_event_file *file);
 extern void trigger_data_free(struct event_trigger_data *data);
 extern int event_trigger_init(struct event_trigger_ops *ops,
 			      struct event_trigger_data *data);
@@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
 			      struct trace_event_file *file);
 extern int register_event_command(struct event_command *cmd);
+extern int unregister_event_command(struct event_command *cmd);
+extern int register_trigger_hist_enable_disable_cmds(void);
 
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4f4041d76926..5d4f02792440 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void)
 
 	return ret;
 }
+
+static void
+hist_enable_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct enable_trigger_data *enable_data = data->private_data;
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (enable_data->enable)
+				test->paused = false;
+			else
+				test->paused = true;
+			break;
+		}
+	}
+}
+
+static void
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+{
+	if (!data->count)
+		return;
+
+	if (data->count != -1)
+		(data->count)--;
+
+	hist_enable_trigger(data, rec);
+}
+
+static struct event_trigger_ops hist_enable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_enable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops *
+hist_enable_get_trigger_ops(char *cmd, char *param)
+{
+	struct event_trigger_ops *ops;
+	bool enable;
+
+	enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
+
+	if (enable)
+		ops = param ? &hist_enable_count_trigger_ops :
+			&hist_enable_trigger_ops;
+	else
+		ops = param ? &hist_disable_count_trigger_ops :
+			&hist_disable_trigger_ops;
+
+	return ops;
+}
+
+static struct event_command trigger_hist_enable_cmd = {
+	.name			= ENABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static struct event_command trigger_hist_disable_cmd = {
+	.name			= DISABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static __init void unregister_trigger_hist_enable_disable_cmds(void)
+{
+	unregister_event_command(&trigger_hist_enable_cmd);
+	unregister_event_command(&trigger_hist_disable_cmd);
+}
+
+__init int register_trigger_hist_enable_disable_cmds(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_enable_cmd);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = register_event_command(&trigger_hist_disable_cmd);
+	if (WARN_ON(ret < 0))
+		unregister_trigger_hist_enable_disable_cmds();
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d29092afe005..d133f2094566 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd)
  * Currently we only unregister event commands from __init, so mark
  * this __init too.
  */
-static __init int unregister_event_command(struct event_command *cmd)
+__init int unregister_event_command(struct event_command *cmd)
 {
 	struct event_command *p, *n;
 	int ret = -ENODEV;
@@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 	unregister_event_command(&trigger_traceoff_cmd);
 }
 
-/* Avoid typos */
-#define ENABLE_EVENT_STR	"enable_event"
-#define DISABLE_EVENT_STR	"disable_event"
-
-struct enable_trigger_data {
-	struct trace_event_file		*file;
-	bool				enable;
-};
-
 static void
 event_enable_trigger(struct event_trigger_data *data, void *rec)
 {
@@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
 	event_enable_trigger(data, rec);
 }
 
-static int
-event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-			   struct event_trigger_data *data)
+int event_enable_trigger_print(struct seq_file *m,
+			       struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
 	seq_printf(m, "%s:%s:%s",
-		   enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+		   enable_data->hist ?
+		   (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) :
+		   (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR),
 		   enable_data->file->event_call->class->system,
 		   trace_event_name(enable_data->file->event_call));
 
@@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
 	return 0;
 }
 
-static void
-event_enable_trigger_free(struct event_trigger_ops *ops,
-			  struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 	.free			= event_enable_trigger_free,
 };
 
-static int
-event_enable_trigger_func(struct event_command *cmd_ops,
-			  struct trace_event_file *file,
-			  char *glob, char *cmd, char *param)
+int event_enable_trigger_func(struct event_command *cmd_ops,
+			      struct trace_event_file *file,
+			      char *glob, char *cmd, char *param)
 {
 	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
@@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	struct trace_array *tr = file->tr;
 	const char *system;
 	const char *event;
+	bool hist = false;
 	char *trigger;
 	char *number;
 	bool enable;
@@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	if (!event_enable_file)
 		goto out;
 
-	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#ifdef CONFIG_HIST_TRIGGERS
+	hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
+		(strcmp(cmd, DISABLE_HIST_STR) == 0));
 
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
+	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#endif
 	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
 
 	ret = -ENOMEM;
@@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	INIT_LIST_HEAD(&trigger_data->list);
 	RCU_INIT_POINTER(trigger_data->filter, NULL);
 
+	enable_data->hist = hist;
 	enable_data->enable = enable;
 	enable_data->file = event_enable_file;
 	trigger_data->private_data = enable_data;
@@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	goto out;
 }
 
-static int event_enable_register_trigger(char *glob,
-					 struct event_trigger_ops *ops,
-					 struct event_trigger_data *data,
-					 struct trace_event_file *file)
+int event_enable_register_trigger(char *glob,
+				  struct event_trigger_ops *ops,
+				  struct event_trigger_data *data,
+				  struct trace_event_file *file)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct enable_trigger_data *test_enable_data;
@@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob,
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		test_enable_data = test->private_data;
 		if (test_enable_data &&
+		    (test->cmd_ops->trigger_type ==
+		     data->cmd_ops->trigger_type) &&
 		    (test_enable_data->file == enable_data->file)) {
 			ret = -EEXIST;
 			goto out;
@@ -1343,10 +1345,10 @@ out:
 	return ret;
 }
 
-static void event_enable_unregister_trigger(char *glob,
-					    struct event_trigger_ops *ops,
-					    struct event_trigger_data *test,
-					    struct trace_event_file *file)
+void event_enable_unregister_trigger(char *glob,
+				     struct event_trigger_ops *ops,
+				     struct event_trigger_data *test,
+				     struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
 	struct enable_trigger_data *enable_data;
@@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob,
 	list_for_each_entry_rcu(data, &file->triggers, list) {
 		enable_data = data->private_data;
 		if (enable_data &&
+		    (data->cmd_ops->trigger_type ==
+		     test->cmd_ops->trigger_type) &&
 		    (enable_data->file == test_enable_data->file)) {
 			unregistered = true;
 			list_del_rcu(&data->list);
@@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param)
 	struct event_trigger_ops *ops;
 	bool enable;
 
+#ifdef CONFIG_HIST_TRIGGERS
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
 	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-
+#endif
 	if (enable)
 		ops = param ? &event_enable_count_trigger_ops :
 			&event_enable_trigger_ops;
@@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_enable_disable_cmds();
 	register_trigger_hist_cmd();
 
 	return 0;
-- 
cgit v1.2.3


From c1d61c9bb163e696bf06850bcabbd26386554489 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 31 Mar 2016 16:34:32 -0600
Subject: PCI: Reverse standard ACS vs device-specific ACS enabling

The original thought was that if a device implemented ACS, then surely
we want to use that... well, it turns out that devices can make an ACS
capability so broken that we still need to fall back to quirks.

Reverse the order of ACS enabling to give quirks first shot at it.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c    | 10 ++++------
 drivers/pci/quirks.c |  6 ++++--
 include/linux/pci.h  |  7 +++++--
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 25e0327d4429..c98c4e2aed3c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2547,7 +2547,7 @@ void pci_request_acs(void)
  * pci_std_enable_acs - enable ACS on devices using standard ACS capabilites
  * @dev: the PCI device
  */
-static int pci_std_enable_acs(struct pci_dev *dev)
+static void pci_std_enable_acs(struct pci_dev *dev)
 {
 	int pos;
 	u16 cap;
@@ -2555,7 +2555,7 @@ static int pci_std_enable_acs(struct pci_dev *dev)
 
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS);
 	if (!pos)
-		return -ENODEV;
+		return;
 
 	pci_read_config_word(dev, pos + PCI_ACS_CAP, &cap);
 	pci_read_config_word(dev, pos + PCI_ACS_CTRL, &ctrl);
@@ -2573,8 +2573,6 @@ static int pci_std_enable_acs(struct pci_dev *dev)
 	ctrl |= (cap & PCI_ACS_UF);
 
 	pci_write_config_word(dev, pos + PCI_ACS_CTRL, ctrl);
-
-	return 0;
 }
 
 /**
@@ -2586,10 +2584,10 @@ void pci_enable_acs(struct pci_dev *dev)
 	if (!pci_acs_enable)
 		return;
 
-	if (!pci_std_enable_acs(dev))
+	if (!pci_dev_specific_enable_acs(dev))
 		return;
 
-	pci_dev_specific_enable_acs(dev);
+	pci_std_enable_acs(dev);
 }
 
 static bool pci_acs_flags_enabled(struct pci_dev *pdev, u16 acs_flags)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e248c2aad000..1f5c7898a246 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4201,7 +4201,7 @@ static const struct pci_dev_enable_acs {
 	{ 0 }
 };
 
-void pci_dev_specific_enable_acs(struct pci_dev *dev)
+int pci_dev_specific_enable_acs(struct pci_dev *dev)
 {
 	const struct pci_dev_enable_acs *i;
 	int ret;
@@ -4213,9 +4213,11 @@ void pci_dev_specific_enable_acs(struct pci_dev *dev)
 		     i->device == (u16)PCI_ANY_ID)) {
 			ret = i->enable_acs(dev);
 			if (ret >= 0)
-				return;
+				return ret;
 		}
 	}
+
+	return -ENOTTY;
 }
 
 /*
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 004b8133417d..aaec79aee805 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1663,7 +1663,7 @@ enum pci_fixup_pass {
 #ifdef CONFIG_PCI_QUIRKS
 void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
 int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
-void pci_dev_specific_enable_acs(struct pci_dev *dev);
+int pci_dev_specific_enable_acs(struct pci_dev *dev);
 #else
 static inline void pci_fixup_device(enum pci_fixup_pass pass,
 				    struct pci_dev *dev) { }
@@ -1672,7 +1672,10 @@ static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
 {
 	return -ENOTTY;
 }
-static inline void pci_dev_specific_enable_acs(struct pci_dev *dev) { }
+static inline int pci_dev_specific_enable_acs(struct pci_dev *dev)
+{
+	return -ENOTTY;
+}
 #endif
 
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
-- 
cgit v1.2.3


From a14b9e0512404ed7d4415b888dc9f1f9785a4fa3 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 5 Feb 2016 16:40:47 -0800
Subject: clkdev: Remove clk_register_clkdevs()

Now that we've converted the only caller over to another clkdev
API, remove this one.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clkdev.c   | 27 ---------------------------
 include/linux/clkdev.h |  1 -
 2 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index eb20b941154b..ae8e40a82d34 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -402,30 +402,3 @@ int clk_register_clkdev(struct clk *clk, const char *con_id,
 	return cl ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL(clk_register_clkdev);
-
-/**
- * clk_register_clkdevs - register a set of clk_lookup for a struct clk
- * @clk: struct clk to associate with all clk_lookups
- * @cl: array of clk_lookup structures with con_id and dev_id pre-initialized
- * @num: number of clk_lookup structures to register
- *
- * To make things easier for mass registration, we detect error clks
- * from a previous clk_register() call, and return the error code for
- * those.  This is to permit this function to be called immediately
- * after clk_register().
- */
-int clk_register_clkdevs(struct clk *clk, struct clk_lookup *cl, size_t num)
-{
-	unsigned i;
-
-	if (IS_ERR(clk))
-		return PTR_ERR(clk);
-
-	for (i = 0; i < num; i++, cl++) {
-		cl->clk_hw = __clk_get_hw(clk);
-		__clkdev_add(cl);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(clk_register_clkdevs);
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index c2c04f7cbe8a..e6f8eb1d585f 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -45,7 +45,6 @@ void clkdev_add_table(struct clk_lookup *, size_t);
 int clk_add_alias(const char *, const char *, const char *, struct device *);
 
 int clk_register_clkdev(struct clk *, const char *, const char *);
-int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t);
 
 #ifdef CONFIG_COMMON_CLK
 int __clk_get(struct clk *clk);
-- 
cgit v1.2.3


From 4143804c4fdef40358c654d1fb2271a1a0f1fedf Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 5 Feb 2016 17:02:52 -0800
Subject: clk: Add {devm_}clk_hw_{register,unregister}() APIs

We've largely split the clk consumer and provider APIs along
struct clk and struct clk_hw, but clk_register() still returns a
struct clk pointer for each struct clk_hw that's registered.
Eventually we'd like to only allocate struct clks when there's a
user, because struct clk is per-user now, so clk_register() needs
to change.

Let's add new APIs to register struct clk_hws, but this time
we'll hide the struct clk from the caller by returning an int
error code. Also add an unregistration API that takes the clk_hw
structure that was passed to the registration API. This way
provider drivers never have to deal with a struct clk pointer
unless they're using the clk consumer APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 Documentation/driver-model/devres.txt |  1 +
 drivers/clk/clk.c                     | 86 +++++++++++++++++++++++++++++++++++
 include/linux/clk-provider.h          |  6 +++
 3 files changed, 93 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 73b98dfbcea4..108d45553e1b 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -236,6 +236,7 @@ certainly invest a bit more effort into libata core layer).
 CLOCK
   devm_clk_get()
   devm_clk_put()
+  devm_clk_hw_register()
 
 DMA
   dmam_alloc_coherent()
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index fb74dc1f7520..0ef919666827 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -2536,6 +2536,22 @@ fail_out:
 }
 EXPORT_SYMBOL_GPL(clk_register);
 
+/**
+ * clk_hw_register - register a clk_hw and return an error code
+ * @dev: device that is registering this clock
+ * @hw: link to hardware-specific clock data
+ *
+ * clk_hw_register is the primary interface for populating the clock tree with
+ * new clock nodes. It returns an integer equal to zero indicating success or
+ * less than zero indicating failure. Drivers must test for an error code after
+ * calling clk_hw_register().
+ */
+int clk_hw_register(struct device *dev, struct clk_hw *hw)
+{
+	return PTR_ERR_OR_ZERO(clk_register(dev, hw));
+}
+EXPORT_SYMBOL_GPL(clk_hw_register);
+
 /* Free memory allocated for a clock. */
 static void __clk_release(struct kref *ref)
 {
@@ -2637,11 +2653,26 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(clk_unregister);
 
+/**
+ * clk_hw_unregister - unregister a currently registered clk_hw
+ * @hw: hardware-specific clock data to unregister
+ */
+void clk_hw_unregister(struct clk_hw *hw)
+{
+	clk_unregister(hw->clk);
+}
+EXPORT_SYMBOL_GPL(clk_hw_unregister);
+
 static void devm_clk_release(struct device *dev, void *res)
 {
 	clk_unregister(*(struct clk **)res);
 }
 
+static void devm_clk_hw_release(struct device *dev, void *res)
+{
+	clk_hw_unregister(*(struct clk_hw **)res);
+}
+
 /**
  * devm_clk_register - resource managed clk_register()
  * @dev: device that is registering this clock
@@ -2672,6 +2703,36 @@ struct clk *devm_clk_register(struct device *dev, struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(devm_clk_register);
 
+/**
+ * devm_clk_hw_register - resource managed clk_hw_register()
+ * @dev: device that is registering this clock
+ * @hw: link to hardware-specific clock data
+ *
+ * Managed clk_hw_register(). Clocks returned from this function are
+ * automatically clk_hw_unregister()ed on driver detach. See clk_hw_register()
+ * for more information.
+ */
+int devm_clk_hw_register(struct device *dev, struct clk_hw *hw)
+{
+	struct clk_hw **hwp;
+	int ret;
+
+	hwp = devres_alloc(devm_clk_hw_release, sizeof(*hwp), GFP_KERNEL);
+	if (!hwp)
+		return -ENOMEM;
+
+	ret = clk_hw_register(dev, hw);
+	if (!ret) {
+		*hwp = hw;
+		devres_add(dev, hwp);
+	} else {
+		devres_free(hwp);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_clk_hw_register);
+
 static int devm_clk_match(struct device *dev, void *res, void *data)
 {
 	struct clk *c = res;
@@ -2680,6 +2741,15 @@ static int devm_clk_match(struct device *dev, void *res, void *data)
 	return c == data;
 }
 
+static int devm_clk_hw_match(struct device *dev, void *res, void *data)
+{
+	struct clk_hw *hw = res;
+
+	if (WARN_ON(!hw))
+		return 0;
+	return hw == data;
+}
+
 /**
  * devm_clk_unregister - resource managed clk_unregister()
  * @clk: clock to unregister
@@ -2694,6 +2764,22 @@ void devm_clk_unregister(struct device *dev, struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(devm_clk_unregister);
 
+/**
+ * devm_clk_hw_unregister - resource managed clk_hw_unregister()
+ * @dev: device that is unregistering the hardware-specific clock data
+ * @hw: link to hardware-specific clock data
+ *
+ * Unregister a clk_hw registered with devm_clk_hw_register(). Normally
+ * this function will not need to be called and the resource management
+ * code will ensure that the resource is freed.
+ */
+void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw)
+{
+	WARN_ON(devres_release(dev, devm_clk_hw_release, devm_clk_hw_match,
+				hw));
+}
+EXPORT_SYMBOL_GPL(devm_clk_hw_unregister);
+
 /*
  * clkdev helpers
  */
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index da95258127aa..bc6c8de1fac1 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -655,9 +655,15 @@ struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
 struct clk *clk_register(struct device *dev, struct clk_hw *hw);
 struct clk *devm_clk_register(struct device *dev, struct clk_hw *hw);
 
+int __must_check clk_hw_register(struct device *dev, struct clk_hw *hw);
+int __must_check devm_clk_hw_register(struct device *dev, struct clk_hw *hw);
+
 void clk_unregister(struct clk *clk);
 void devm_clk_unregister(struct device *dev, struct clk *clk);
 
+void clk_hw_unregister(struct clk_hw *hw);
+void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw);
+
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
 const char *clk_hw_get_name(const struct clk_hw *hw);
-- 
cgit v1.2.3


From 0861e5b8cf80038e91942f1005c8dfce79d18c38 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 5 Feb 2016 17:38:26 -0800
Subject: clk: Add clk_hw OF clk providers

Now that we have a clk registration API that doesn't return
struct clks, we need to have some way to hand out struct clks via
the clk_get() APIs that doesn't involve associating struct clk
pointers with an OF node. Currently we ask the OF provider to
give us a struct clk pointer for some clkspec, turn that struct
clk into a struct clk_hw and then allocate a new struct clk to
return to the caller.

Let's add a clk_hw based OF provider hook that returns a struct
clk_hw directly, so that we skip the intermediate step of
converting from struct clk to struct clk_hw. Eventually when
we've converted all OF clk providers to struct clk_hw based APIs
we can remove the struct clk based ones.

It should also be noted that we change the onecell provider to
have a flex array instead of a pointer for the array of clk_hw
pointers. This allows providers to allocate one structure of the
correct length in one step instead of two.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 85 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/clk-provider.h | 30 ++++++++++++++++
 2 files changed, 111 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 0ef919666827..e813b2aabc87 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -2941,6 +2941,7 @@ struct of_clk_provider {
 
 	struct device_node *node;
 	struct clk *(*get)(struct of_phandle_args *clkspec, void *data);
+	struct clk_hw *(*get_hw)(struct of_phandle_args *clkspec, void *data);
 	void *data;
 };
 
@@ -2957,6 +2958,12 @@ struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec,
 }
 EXPORT_SYMBOL_GPL(of_clk_src_simple_get);
 
+struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, void *data)
+{
+	return data;
+}
+EXPORT_SYMBOL_GPL(of_clk_hw_simple_get);
+
 struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data)
 {
 	struct clk_onecell_data *clk_data = data;
@@ -2971,6 +2978,21 @@ struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data)
 }
 EXPORT_SYMBOL_GPL(of_clk_src_onecell_get);
 
+struct clk_hw *
+of_clk_hw_onecell_get(struct of_phandle_args *clkspec, void *data)
+{
+	struct clk_hw_onecell_data *hw_data = data;
+	unsigned int idx = clkspec->args[0];
+
+	if (idx >= hw_data->num) {
+		pr_err("%s: invalid index %u\n", __func__, idx);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return hw_data->hws[idx];
+}
+EXPORT_SYMBOL_GPL(of_clk_hw_onecell_get);
+
 /**
  * of_clk_add_provider() - Register a clock provider for a node
  * @np: Device node pointer associated with clock provider
@@ -3006,6 +3028,41 @@ int of_clk_add_provider(struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(of_clk_add_provider);
 
+/**
+ * of_clk_add_hw_provider() - Register a clock provider for a node
+ * @np: Device node pointer associated with clock provider
+ * @get: callback for decoding clk_hw
+ * @data: context pointer for @get callback.
+ */
+int of_clk_add_hw_provider(struct device_node *np,
+			   struct clk_hw *(*get)(struct of_phandle_args *clkspec,
+						 void *data),
+			   void *data)
+{
+	struct of_clk_provider *cp;
+	int ret;
+
+	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+	if (!cp)
+		return -ENOMEM;
+
+	cp->node = of_node_get(np);
+	cp->data = data;
+	cp->get_hw = get;
+
+	mutex_lock(&of_clk_mutex);
+	list_add(&cp->link, &of_clk_providers);
+	mutex_unlock(&of_clk_mutex);
+	pr_debug("Added clk_hw provider from %s\n", np->full_name);
+
+	ret = of_clk_set_defaults(np, true);
+	if (ret < 0)
+		of_clk_del_provider(np);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_clk_add_hw_provider);
+
 /**
  * of_clk_del_provider() - Remove a previously registered clock provider
  * @np: Device node pointer associated with clock provider
@@ -3027,11 +3084,32 @@ void of_clk_del_provider(struct device_node *np)
 }
 EXPORT_SYMBOL_GPL(of_clk_del_provider);
 
+static struct clk_hw *
+__of_clk_get_hw_from_provider(struct of_clk_provider *provider,
+			      struct of_phandle_args *clkspec)
+{
+	struct clk *clk;
+	struct clk_hw *hw = ERR_PTR(-EPROBE_DEFER);
+
+	if (provider->get_hw) {
+		hw = provider->get_hw(clkspec, provider->data);
+	} else if (provider->get) {
+		clk = provider->get(clkspec, provider->data);
+		if (!IS_ERR(clk))
+			hw = __clk_get_hw(clk);
+		else
+			hw = ERR_CAST(clk);
+	}
+
+	return hw;
+}
+
 struct clk *__of_clk_get_from_provider(struct of_phandle_args *clkspec,
 				       const char *dev_id, const char *con_id)
 {
 	struct of_clk_provider *provider;
 	struct clk *clk = ERR_PTR(-EPROBE_DEFER);
+	struct clk_hw *hw = ERR_PTR(-EPROBE_DEFER);
 
 	if (!clkspec)
 		return ERR_PTR(-EINVAL);
@@ -3040,10 +3118,9 @@ struct clk *__of_clk_get_from_provider(struct of_phandle_args *clkspec,
 	mutex_lock(&of_clk_mutex);
 	list_for_each_entry(provider, &of_clk_providers, link) {
 		if (provider->node == clkspec->np)
-			clk = provider->get(clkspec, provider->data);
-		if (!IS_ERR(clk)) {
-			clk = __clk_create_clk(__clk_get_hw(clk), dev_id,
-					       con_id);
+			hw = __of_clk_get_hw_from_provider(provider, clkspec);
+		if (!IS_ERR(hw)) {
+			clk = __clk_create_clk(hw, dev_id, con_id);
 
 			if (!IS_ERR(clk) && !__clk_get(clk)) {
 				__clk_free_clk(clk);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bc6c8de1fac1..bf8c8bb8c2cb 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -709,6 +709,11 @@ struct clk_onecell_data {
 	unsigned int clk_num;
 };
 
+struct clk_hw_onecell_data {
+	size_t num;
+	struct clk_hw *hws[];
+};
+
 extern struct of_device_id __clk_of_table;
 
 #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn)
@@ -718,10 +723,18 @@ int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
 						   void *data),
 			void *data);
+int of_clk_add_hw_provider(struct device_node *np,
+			   struct clk_hw *(*get)(struct of_phandle_args *clkspec,
+						 void *data),
+			   void *data);
 void of_clk_del_provider(struct device_node *np);
 struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec,
 				  void *data);
+struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec,
+				    void *data);
 struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data);
+struct clk_hw *of_clk_hw_onecell_get(struct of_phandle_args *clkspec,
+				     void *data);
 unsigned int of_clk_get_parent_count(struct device_node *np);
 int of_clk_parent_fill(struct device_node *np, const char **parents,
 		       unsigned int size);
@@ -738,17 +751,34 @@ static inline int of_clk_add_provider(struct device_node *np,
 {
 	return 0;
 }
+static inline int of_clk_add_hw_provider(struct device_node *np,
+			struct clk_hw *(*get)(struct of_phandle_args *clkspec,
+					      void *data),
+			void *data)
+{
+	return 0;
+}
 static inline void of_clk_del_provider(struct device_node *np) {}
 static inline struct clk *of_clk_src_simple_get(
 	struct of_phandle_args *clkspec, void *data)
 {
 	return ERR_PTR(-ENOENT);
 }
+static inline struct clk_hw *
+of_clk_hw_simple_get(struct of_phandle_args *clkspec, void *data)
+{
+	return ERR_PTR(-ENOENT);
+}
 static inline struct clk *of_clk_src_onecell_get(
 	struct of_phandle_args *clkspec, void *data)
 {
 	return ERR_PTR(-ENOENT);
 }
+static inline struct clk_hw *
+of_clk_hw_onecell_get(struct of_phandle_args *clkspec, void *data)
+{
+	return ERR_PTR(-ENOENT);
+}
 static inline int of_clk_get_parent_count(struct device_node *np)
 {
 	return 0;
-- 
cgit v1.2.3


From e4f1b49bda6d6aa2e13730ff7eeccbe65a6271f1 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 8 Feb 2016 14:59:49 -0800
Subject: clkdev: Add clk_hw based registration APIs

Now that we have a clk registration API that doesn't return
struct clks, we need to have some way to hand out struct clks via
the clk_get() APIs that doesn't involve associating struct clk
pointers with a struct clk_lookup. Luckily, clkdev already
operates on struct clk_hw pointers, except for the registration
facing APIs where it converts struct clk pointers into struct
clk_hw pointers almost immediately.

Let's add clk_hw based registration APIs so that we can skip the
conversion step and provide a way for clk provider drivers to
operate exclusively on clk_hw structs. This way we clearly
split the API between consumers and providers.

Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clkdev.c   | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/clkdev.h |  6 +++++
 2 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index ae8e40a82d34..89cc700fbc37 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -301,6 +301,20 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 }
 EXPORT_SYMBOL(clkdev_alloc);
 
+struct clk_lookup *
+clkdev_hw_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, ...)
+{
+	struct clk_lookup *cl;
+	va_list ap;
+
+	va_start(ap, dev_fmt);
+	cl = vclkdev_alloc(hw, con_id, dev_fmt, ap);
+	va_end(ap);
+
+	return cl;
+}
+EXPORT_SYMBOL(clkdev_hw_alloc);
+
 /**
  * clkdev_create - allocate and add a clkdev lookup structure
  * @clk: struct clk to associate with all clk_lookups
@@ -324,6 +338,29 @@ struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
 }
 EXPORT_SYMBOL_GPL(clkdev_create);
 
+/**
+ * clkdev_hw_create - allocate and add a clkdev lookup structure
+ * @hw: struct clk_hw to associate with all clk_lookups
+ * @con_id: connection ID string on device
+ * @dev_fmt: format string describing device name
+ *
+ * Returns a clk_lookup structure, which can be later unregistered and
+ * freed.
+ */
+struct clk_lookup *clkdev_hw_create(struct clk_hw *hw, const char *con_id,
+	const char *dev_fmt, ...)
+{
+	struct clk_lookup *cl;
+	va_list ap;
+
+	va_start(ap, dev_fmt);
+	cl = vclkdev_create(hw, con_id, dev_fmt, ap);
+	va_end(ap);
+
+	return cl;
+}
+EXPORT_SYMBOL_GPL(clkdev_hw_create);
+
 int clk_add_alias(const char *alias, const char *alias_dev_name,
 	const char *con_id, struct device *dev)
 {
@@ -402,3 +439,30 @@ int clk_register_clkdev(struct clk *clk, const char *con_id,
 	return cl ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL(clk_register_clkdev);
+
+/**
+ * clk_hw_register_clkdev - register one clock lookup for a struct clk_hw
+ * @hw: struct clk_hw to associate with all clk_lookups
+ * @con_id: connection ID string on device
+ * @dev_id: format string describing device name
+ *
+ * con_id or dev_id may be NULL as a wildcard, just as in the rest of
+ * clkdev.
+ */
+int clk_hw_register_clkdev(struct clk_hw *hw, const char *con_id,
+	const char *dev_id)
+{
+	struct clk_lookup *cl;
+
+	/*
+	 * Since dev_id can be NULL, and NULL is handled specially, we must
+	 * pass it as either a NULL format string, or with "%s".
+	 */
+	if (dev_id)
+		cl = __clk_register_clkdev(hw, con_id, "%s", dev_id);
+	else
+		cl = __clk_register_clkdev(hw, con_id, NULL);
+
+	return cl ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(clk_hw_register_clkdev);
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index e6f8eb1d585f..2eabc862abdb 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -15,6 +15,7 @@
 #include <asm/clkdev.h>
 
 struct clk;
+struct clk_hw;
 struct device;
 
 struct clk_lookup {
@@ -34,17 +35,22 @@ struct clk_lookup {
 
 struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
 	const char *dev_fmt, ...) __printf(3, 4);
+struct clk_lookup *clkdev_hw_alloc(struct clk_hw *hw, const char *con_id,
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
 struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
 	const char *dev_fmt, ...) __printf(3, 4);
+struct clk_lookup *clkdev_hw_create(struct clk_hw *hw, const char *con_id,
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add_table(struct clk_lookup *, size_t);
 int clk_add_alias(const char *, const char *, const char *, struct device *);
 
 int clk_register_clkdev(struct clk *, const char *, const char *);
+int clk_hw_register_clkdev(struct clk_hw *, const char *, const char *);
 
 #ifdef CONFIG_COMMON_CLK
 int __clk_get(struct clk *clk);
-- 
cgit v1.2.3


From eb7d264f3bf9ca7c093efb77bdde557c6c6e826f Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sat, 6 Feb 2016 23:26:37 -0800
Subject: clk: divider: Add hw based registration APIs

Add registration APIs in the clk divider code to return struct
clk_hw pointers instead of struct clk pointers. This way we hide
the struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-divider.c    | 91 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/clk-provider.h | 10 +++++
 2 files changed, 93 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c
index 00e035b51c69..a0f55bc1ad3d 100644
--- a/drivers/clk/clk-divider.c
+++ b/drivers/clk/clk-divider.c
@@ -426,15 +426,16 @@ const struct clk_ops clk_divider_ro_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_divider_ro_ops);
 
-static struct clk *_register_divider(struct device *dev, const char *name,
+static struct clk_hw *_register_divider(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_divider_flags, const struct clk_div_table *table,
 		spinlock_t *lock)
 {
 	struct clk_divider *div;
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init;
+	int ret;
 
 	if (clk_divider_flags & CLK_DIVIDER_HIWORD_MASK) {
 		if (width + shift > 16) {
@@ -467,12 +468,14 @@ static struct clk *_register_divider(struct device *dev, const char *name,
 	div->table = table;
 
 	/* register the clock */
-	clk = clk_register(dev, &div->hw);
-
-	if (IS_ERR(clk))
+	hw = &div->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(div);
+		hw = ERR_PTR(ret);
+	}
 
-	return clk;
+	return hw;
 }
 
 /**
@@ -492,11 +495,38 @@ struct clk *clk_register_divider(struct device *dev, const char *name,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_divider_flags, spinlock_t *lock)
 {
-	return _register_divider(dev, name, parent_name, flags, reg, shift,
+	struct clk_hw *hw;
+
+	hw =  _register_divider(dev, name, parent_name, flags, reg, shift,
 			width, clk_divider_flags, NULL, lock);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_divider);
 
+/**
+ * clk_hw_register_divider - register a divider clock with the clock framework
+ * @dev: device registering this clock
+ * @name: name of this clock
+ * @parent_name: name of clock's parent
+ * @flags: framework-specific flags
+ * @reg: register address to adjust divider
+ * @shift: number of bits to shift the bitfield
+ * @width: width of the bitfield
+ * @clk_divider_flags: divider-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+struct clk_hw *clk_hw_register_divider(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_divider_flags, spinlock_t *lock)
+{
+	return _register_divider(dev, name, parent_name, flags, reg, shift,
+			width, clk_divider_flags, NULL, lock);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_divider);
+
 /**
  * clk_register_divider_table - register a table based divider clock with
  * the clock framework
@@ -517,11 +547,41 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name,
 		u8 clk_divider_flags, const struct clk_div_table *table,
 		spinlock_t *lock)
 {
-	return _register_divider(dev, name, parent_name, flags, reg, shift,
+	struct clk_hw *hw;
+
+	hw =  _register_divider(dev, name, parent_name, flags, reg, shift,
 			width, clk_divider_flags, table, lock);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_divider_table);
 
+/**
+ * clk_hw_register_divider_table - register a table based divider clock with
+ * the clock framework
+ * @dev: device registering this clock
+ * @name: name of this clock
+ * @parent_name: name of clock's parent
+ * @flags: framework-specific flags
+ * @reg: register address to adjust divider
+ * @shift: number of bits to shift the bitfield
+ * @width: width of the bitfield
+ * @clk_divider_flags: divider-specific flags for this clock
+ * @table: array of divider/value pairs ending with a div set to 0
+ * @lock: shared register lock for this clock
+ */
+struct clk_hw *clk_hw_register_divider_table(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_divider_flags, const struct clk_div_table *table,
+		spinlock_t *lock)
+{
+	return _register_divider(dev, name, parent_name, flags, reg, shift,
+			width, clk_divider_flags, table, lock);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_divider_table);
+
 void clk_unregister_divider(struct clk *clk)
 {
 	struct clk_divider *div;
@@ -537,3 +597,18 @@ void clk_unregister_divider(struct clk *clk)
 	kfree(div);
 }
 EXPORT_SYMBOL_GPL(clk_unregister_divider);
+
+/**
+ * clk_hw_unregister_divider - unregister a clk divider
+ * @hw: hardware-specific clock data to unregister
+ */
+void clk_hw_unregister_divider(struct clk_hw *hw)
+{
+	struct clk_divider *div;
+
+	div = to_clk_divider(hw);
+
+	clk_hw_unregister(hw);
+	kfree(div);
+}
+EXPORT_SYMBOL_GPL(clk_hw_unregister_divider);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bf8c8bb8c2cb..8885d0350596 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -407,12 +407,22 @@ struct clk *clk_register_divider(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_divider_flags, spinlock_t *lock);
+struct clk_hw *clk_hw_register_divider(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_divider_flags, spinlock_t *lock);
 struct clk *clk_register_divider_table(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_divider_flags, const struct clk_div_table *table,
 		spinlock_t *lock);
+struct clk_hw *clk_hw_register_divider_table(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_divider_flags, const struct clk_div_table *table,
+		spinlock_t *lock);
 void clk_unregister_divider(struct clk *clk);
+void clk_hw_unregister_divider(struct clk_hw *hw);
 
 /**
  * struct clk_mux - multiplexer clock
-- 
cgit v1.2.3


From e270d8cb13763f58107198e879cf396511ba2867 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sat, 6 Feb 2016 23:54:45 -0800
Subject: clk: gate: Add hw based registration APIs

Add registration APIs in the clk gate code to return struct
clk_hw pointers instead of struct clk pointers. This way we hide
the struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-gate.c       | 43 ++++++++++++++++++++++++++++++++++++-------
 include/linux/clk-provider.h |  5 +++++
 2 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c
index d0d8ec8e1f1b..4e691e35483a 100644
--- a/drivers/clk/clk-gate.c
+++ b/drivers/clk/clk-gate.c
@@ -110,7 +110,7 @@ const struct clk_ops clk_gate_ops = {
 EXPORT_SYMBOL_GPL(clk_gate_ops);
 
 /**
- * clk_register_gate - register a gate clock with the clock framework
+ * clk_hw_register_gate - register a gate clock with the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
  * @parent_name: name of this clock's parent
@@ -120,14 +120,15 @@ EXPORT_SYMBOL_GPL(clk_gate_ops);
  * @clk_gate_flags: gate-specific flags for this clock
  * @lock: shared register lock for this clock
  */
-struct clk *clk_register_gate(struct device *dev, const char *name,
+struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 bit_idx,
 		u8 clk_gate_flags, spinlock_t *lock)
 {
 	struct clk_gate *gate;
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init;
+	int ret;
 
 	if (clk_gate_flags & CLK_GATE_HIWORD_MASK) {
 		if (bit_idx > 15) {
@@ -154,12 +155,29 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	gate->lock = lock;
 	gate->hw.init = &init;
 
-	clk = clk_register(dev, &gate->hw);
-
-	if (IS_ERR(clk))
+	hw = &gate->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(gate);
+		hw = ERR_PTR(ret);
+	}
 
-	return clk;
+	return hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_gate);
+
+struct clk *clk_register_gate(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 bit_idx,
+		u8 clk_gate_flags, spinlock_t *lock)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_gate(dev, name, parent_name, flags, reg,
+				  bit_idx, clk_gate_flags, lock);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_gate);
 
@@ -178,3 +196,14 @@ void clk_unregister_gate(struct clk *clk)
 	kfree(gate);
 }
 EXPORT_SYMBOL_GPL(clk_unregister_gate);
+
+void clk_hw_unregister_gate(struct clk_hw *hw)
+{
+	struct clk_gate *gate;
+
+	gate = to_clk_gate(hw);
+
+	clk_hw_unregister(hw);
+	kfree(gate);
+}
+EXPORT_SYMBOL_GPL(clk_hw_unregister_gate);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 8885d0350596..bf12050aadd5 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -326,7 +326,12 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 bit_idx,
 		u8 clk_gate_flags, spinlock_t *lock);
+struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 bit_idx,
+		u8 clk_gate_flags, spinlock_t *lock);
 void clk_unregister_gate(struct clk *clk);
+void clk_hw_unregister_gate(struct clk_hw *hw);
 
 struct clk_div_table {
 	unsigned int	val;
-- 
cgit v1.2.3


From 264b31719735eb1fcbed47cecdb20f517e804856 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:05:48 -0800
Subject: clk: mux: Add hw based registration APIs

Add registration APIs in the clk mux code to return struct clk_hw
pointers instead of struct clk pointers. This way we hide the
struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-mux.c        | 57 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/clk-provider.h | 11 +++++++++
 2 files changed, 62 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c
index 252188fd8bcd..16a3d5717f4e 100644
--- a/drivers/clk/clk-mux.c
+++ b/drivers/clk/clk-mux.c
@@ -113,16 +113,17 @@ const struct clk_ops clk_mux_ro_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_mux_ro_ops);
 
-struct clk *clk_register_mux_table(struct device *dev, const char *name,
+struct clk_hw *clk_hw_register_mux_table(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents,
 		unsigned long flags,
 		void __iomem *reg, u8 shift, u32 mask,
 		u8 clk_mux_flags, u32 *table, spinlock_t *lock)
 {
 	struct clk_mux *mux;
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init;
 	u8 width = 0;
+	int ret;
 
 	if (clk_mux_flags & CLK_MUX_HIWORD_MASK) {
 		width = fls(mask) - ffs(mask) + 1;
@@ -157,12 +158,31 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 	mux->table = table;
 	mux->hw.init = &init;
 
-	clk = clk_register(dev, &mux->hw);
-
-	if (IS_ERR(clk))
+	hw = &mux->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(mux);
+		hw = ERR_PTR(ret);
+	}
 
-	return clk;
+	return hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_mux_table);
+
+struct clk *clk_register_mux_table(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
+		void __iomem *reg, u8 shift, u32 mask,
+		u8 clk_mux_flags, u32 *table, spinlock_t *lock)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_mux_table(dev, name, parent_names, num_parents,
+				       flags, reg, shift, mask, clk_mux_flags,
+				       table, lock);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_mux_table);
 
@@ -180,6 +200,20 @@ struct clk *clk_register_mux(struct device *dev, const char *name,
 }
 EXPORT_SYMBOL_GPL(clk_register_mux);
 
+struct clk_hw *clk_hw_register_mux(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_mux_flags, spinlock_t *lock)
+{
+	u32 mask = BIT(width) - 1;
+
+	return clk_hw_register_mux_table(dev, name, parent_names, num_parents,
+				      flags, reg, shift, mask, clk_mux_flags,
+				      NULL, lock);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_mux);
+
 void clk_unregister_mux(struct clk *clk)
 {
 	struct clk_mux *mux;
@@ -195,3 +229,14 @@ void clk_unregister_mux(struct clk *clk)
 	kfree(mux);
 }
 EXPORT_SYMBOL_GPL(clk_unregister_mux);
+
+void clk_hw_unregister_mux(struct clk_hw *hw)
+{
+	struct clk_mux *mux;
+
+	mux = to_clk_mux(hw);
+
+	clk_hw_unregister(hw);
+	kfree(mux);
+}
+EXPORT_SYMBOL_GPL(clk_hw_unregister_mux);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bf12050aadd5..d690d99b9c1c 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -478,14 +478,25 @@ struct clk *clk_register_mux(struct device *dev, const char *name,
 		unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_mux_flags, spinlock_t *lock);
+struct clk_hw *clk_hw_register_mux(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
+		void __iomem *reg, u8 shift, u8 width,
+		u8 clk_mux_flags, spinlock_t *lock);
 
 struct clk *clk_register_mux_table(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents,
 		unsigned long flags,
 		void __iomem *reg, u8 shift, u32 mask,
 		u8 clk_mux_flags, u32 *table, spinlock_t *lock);
+struct clk_hw *clk_hw_register_mux_table(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
+		void __iomem *reg, u8 shift, u32 mask,
+		u8 clk_mux_flags, u32 *table, spinlock_t *lock);
 
 void clk_unregister_mux(struct clk *clk);
+void clk_hw_unregister_mux(struct clk_hw *hw);
 
 void of_fixed_factor_clk_setup(struct device_node *node);
 
-- 
cgit v1.2.3


From 0759ac8a73dc2c8cc8ac697fbe5dbd8d67348d37 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:11:06 -0800
Subject: clk: fixed-factor: Add hw based registration APIs

Add registration APIs in the clk fixed-factor code to return
struct clk_hw pointers instead of struct clk pointers. This way
we hide the struct clk pointer from providers unless they need to
use consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-fixed-factor.c | 42 +++++++++++++++++++++++++++++++++++-------
 include/linux/clk-provider.h   |  4 ++++
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index 053448e2453d..75cd6c792cb8 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -68,13 +68,14 @@ const struct clk_ops clk_fixed_factor_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_fixed_factor_ops);
 
-struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
-		const char *parent_name, unsigned long flags,
+struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
 	struct clk_fixed_factor *fix;
 	struct clk_init_data init;
-	struct clk *clk;
+	struct clk_hw *hw;
+	int ret;
 
 	fix = kmalloc(sizeof(*fix), GFP_KERNEL);
 	if (!fix)
@@ -91,12 +92,28 @@ struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
 	init.parent_names = &parent_name;
 	init.num_parents = 1;
 
-	clk = clk_register(dev, &fix->hw);
-
-	if (IS_ERR(clk))
+	hw = &fix->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(fix);
+		hw = ERR_PTR(ret);
+	}
+
+	return hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor);
+
+struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		unsigned int mult, unsigned int div)
+{
+	struct clk_hw *hw;
 
-	return clk;
+	hw = clk_hw_register_fixed_factor(dev, name, parent_name, flags, mult,
+					  div);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_fixed_factor);
 
@@ -113,6 +130,17 @@ void clk_unregister_fixed_factor(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_unregister_fixed_factor);
 
+void clk_hw_unregister_fixed_factor(struct clk_hw *hw)
+{
+	struct clk_fixed_factor *fix;
+
+	fix = to_clk_fixed_factor(hw);
+
+	clk_hw_unregister(hw);
+	kfree(fix);
+}
+EXPORT_SYMBOL_GPL(clk_hw_unregister_fixed_factor);
+
 #ifdef CONFIG_OF
 /**
  * of_fixed_factor_clk_setup() - Setup function for simple fixed factor clock
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index d690d99b9c1c..79ad1a8a6831 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -525,6 +525,10 @@ struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div);
 void clk_unregister_fixed_factor(struct clk *clk);
+struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		unsigned int mult, unsigned int div);
+void clk_hw_unregister_fixed_factor(struct clk_hw *hw);
 
 /**
  * struct clk_fractional_divider - adjustable fractional divider clock
-- 
cgit v1.2.3


From 39b44cff4ad4af6d7abd9dd2acb288b005c26503 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:15:09 -0800
Subject: clk: fractional-divider: Add hw based registration APIs

Add registration APIs in the clk fractional divider code to
return struct clk_hw pointers instead of struct clk pointers.
This way we hide the struct clk pointer from providers unless
they need to use consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-fractional-divider.c | 40 +++++++++++++++++++++++++++++++-----
 include/linux/clk-provider.h         |  5 +++++
 2 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fractional-divider.c b/drivers/clk/clk-fractional-divider.c
index 1abcd76b4993..aab904618eb6 100644
--- a/drivers/clk/clk-fractional-divider.c
+++ b/drivers/clk/clk-fractional-divider.c
@@ -116,14 +116,15 @@ const struct clk_ops clk_fractional_divider_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_fractional_divider_ops);
 
-struct clk *clk_register_fractional_divider(struct device *dev,
+struct clk_hw *clk_hw_register_fractional_divider(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth,
 		u8 clk_divider_flags, spinlock_t *lock)
 {
 	struct clk_fractional_divider *fd;
 	struct clk_init_data init;
-	struct clk *clk;
+	struct clk_hw *hw;
+	int ret;
 
 	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
 	if (!fd)
@@ -146,10 +147,39 @@ struct clk *clk_register_fractional_divider(struct device *dev,
 	fd->lock = lock;
 	fd->hw.init = &init;
 
-	clk = clk_register(dev, &fd->hw);
-	if (IS_ERR(clk))
+	hw = &fd->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(fd);
+		hw = ERR_PTR(ret);
+	}
+
+	return hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fractional_divider);
 
-	return clk;
+struct clk *clk_register_fractional_divider(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth,
+		u8 clk_divider_flags, spinlock_t *lock)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_fractional_divider(dev, name, parent_name, flags,
+			reg, mshift, mwidth, nshift, nwidth, clk_divider_flags,
+			lock);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_fractional_divider);
+
+void clk_hw_unregister_fractional_divider(struct clk_hw *hw)
+{
+	struct clk_fractional_divider *fd;
+
+	fd = to_clk_fd(hw);
+
+	clk_hw_unregister(hw);
+	kfree(fd);
+}
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 79ad1a8a6831..bcbaf6c95d52 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -563,6 +563,11 @@ struct clk *clk_register_fractional_divider(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth,
 		u8 clk_divider_flags, spinlock_t *lock);
+struct clk_hw *clk_hw_register_fractional_divider(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth,
+		u8 clk_divider_flags, spinlock_t *lock);
+void clk_hw_unregister_fractional_divider(struct clk_hw *hw);
 
 /**
  * struct clk_multiplier - adjustable multiplier clock
-- 
cgit v1.2.3


From 49cb392d36397a296dcd51ec57cf83585a89a94a Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:20:31 -0800
Subject: clk: composite: Add hw based registration APIs

Add registration APIs in the clk composite code to return struct
clk_hw pointers instead of struct clk pointers. This way we hide
the struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-composite.c  | 45 ++++++++++++++++++++++++++++++++------------
 include/linux/clk-provider.h |  7 +++++++
 2 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
index 1f903e1f86a2..463fadd5a68f 100644
--- a/drivers/clk/clk-composite.c
+++ b/drivers/clk/clk-composite.c
@@ -184,17 +184,18 @@ static void clk_composite_disable(struct clk_hw *hw)
 	gate_ops->disable(gate_hw);
 }
 
-struct clk *clk_register_composite(struct device *dev, const char *name,
+struct clk_hw *clk_hw_register_composite(struct device *dev, const char *name,
 			const char * const *parent_names, int num_parents,
 			struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
 			struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
 			struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
 			unsigned long flags)
 {
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init;
 	struct clk_composite *composite;
 	struct clk_ops *clk_composite_ops;
+	int ret;
 
 	composite = kzalloc(sizeof(*composite), GFP_KERNEL);
 	if (!composite)
@@ -204,12 +205,13 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 	init.flags = flags | CLK_IS_BASIC;
 	init.parent_names = parent_names;
 	init.num_parents = num_parents;
+	hw = &composite->hw;
 
 	clk_composite_ops = &composite->ops;
 
 	if (mux_hw && mux_ops) {
 		if (!mux_ops->get_parent) {
-			clk = ERR_PTR(-EINVAL);
+			hw = ERR_PTR(-EINVAL);
 			goto err;
 		}
 
@@ -224,7 +226,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 
 	if (rate_hw && rate_ops) {
 		if (!rate_ops->recalc_rate) {
-			clk = ERR_PTR(-EINVAL);
+			hw = ERR_PTR(-EINVAL);
 			goto err;
 		}
 		clk_composite_ops->recalc_rate = clk_composite_recalc_rate;
@@ -253,7 +255,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 	if (gate_hw && gate_ops) {
 		if (!gate_ops->is_enabled || !gate_ops->enable ||
 		    !gate_ops->disable) {
-			clk = ERR_PTR(-EINVAL);
+			hw = ERR_PTR(-EINVAL);
 			goto err;
 		}
 
@@ -267,22 +269,41 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 	init.ops = clk_composite_ops;
 	composite->hw.init = &init;
 
-	clk = clk_register(dev, &composite->hw);
-	if (IS_ERR(clk))
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
+		hw = ERR_PTR(ret);
 		goto err;
+	}
 
 	if (composite->mux_hw)
-		composite->mux_hw->clk = clk;
+		composite->mux_hw->clk = hw->clk;
 
 	if (composite->rate_hw)
-		composite->rate_hw->clk = clk;
+		composite->rate_hw->clk = hw->clk;
 
 	if (composite->gate_hw)
-		composite->gate_hw->clk = clk;
+		composite->gate_hw->clk = hw->clk;
 
-	return clk;
+	return hw;
 
 err:
 	kfree(composite);
-	return clk;
+	return hw;
+}
+
+struct clk *clk_register_composite(struct device *dev, const char *name,
+			const char * const *parent_names, int num_parents,
+			struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
+			struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
+			struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
+			unsigned long flags)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_composite(dev, name, parent_names, num_parents,
+			mux_hw, mux_ops, rate_hw, rate_ops, gate_hw, gate_ops,
+			flags);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bcbaf6c95d52..456c3ced1ac9 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -638,6 +638,13 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 		struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
 		struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
 		unsigned long flags);
+struct clk_hw *clk_hw_register_composite(struct device *dev, const char *name,
+		const char * const *parent_names, int num_parents,
+		struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
+		struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
+		struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
+		unsigned long flags);
+void clk_hw_unregister_composite(struct clk_hw *hw);
 
 /***
  * struct clk_gpio_gate - gpio gated clock
-- 
cgit v1.2.3


From b120743a64a3ec68b8c5310a6009094329b4a33b Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:27:55 -0800
Subject: clk: gpio: Add hw based registration APIs

Add registration APIs in the clk gpio code to return struct
clk_hw pointers instead of struct clk pointers. This way we hide
the struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-gpio.c       | 52 ++++++++++++++++++++++++++++++++++----------
 include/linux/clk-provider.h |  8 +++++++
 2 files changed, 49 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c
index 08f65acc5d57..86b245746a6b 100644
--- a/drivers/clk/clk-gpio.c
+++ b/drivers/clk/clk-gpio.c
@@ -94,13 +94,13 @@ const struct clk_ops clk_gpio_mux_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_gpio_mux_ops);
 
-static struct clk *clk_register_gpio(struct device *dev, const char *name,
+static struct clk_hw *clk_register_gpio(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags,
 		const struct clk_ops *clk_gpio_ops)
 {
 	struct clk_gpio *clk_gpio;
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init = {};
 	unsigned long gpio_flags;
 	int err;
@@ -141,24 +141,26 @@ static struct clk *clk_register_gpio(struct device *dev, const char *name,
 	clk_gpio->gpiod = gpio_to_desc(gpio);
 	clk_gpio->hw.init = &init;
 
+	hw = &clk_gpio->hw;
 	if (dev)
-		clk = devm_clk_register(dev, &clk_gpio->hw);
+		err = devm_clk_hw_register(dev, hw);
 	else
-		clk = clk_register(NULL, &clk_gpio->hw);
+		err = clk_hw_register(NULL, hw);
 
-	if (!IS_ERR(clk))
-		return clk;
+	if (!err)
+		return hw;
 
 	if (!dev) {
 		gpiod_put(clk_gpio->gpiod);
 		kfree(clk_gpio);
 	}
 
-	return clk;
+	return ERR_PTR(err);
 }
 
 /**
- * clk_register_gpio_gate - register a gpio clock gate with the clock framework
+ * clk_hw_register_gpio_gate - register a gpio clock gate with the clock
+ * framework
  * @dev: device that is registering this clock
  * @name: name of this clock
  * @parent_name: name of this clock's parent
@@ -166,7 +168,7 @@ static struct clk *clk_register_gpio(struct device *dev, const char *name,
  * @active_low: true if gpio should be set to 0 to enable clock
  * @flags: clock flags
  */
-struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
+struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name,
 		const char *parent_name, unsigned gpio, bool active_low,
 		unsigned long flags)
 {
@@ -175,10 +177,24 @@ struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
 			(parent_name ? 1 : 0), gpio, active_low, flags,
 			&clk_gpio_gate_ops);
 }
+EXPORT_SYMBOL_GPL(clk_hw_register_gpio_gate);
+
+struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
+		const char *parent_name, unsigned gpio, bool active_low,
+		unsigned long flags)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_gpio_gate(dev, name, parent_name, gpio, active_low,
+				       flags);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
+}
 EXPORT_SYMBOL_GPL(clk_register_gpio_gate);
 
 /**
- * clk_register_gpio_mux - register a gpio clock mux with the clock framework
+ * clk_hw_register_gpio_mux - register a gpio clock mux with the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
  * @parent_names: names of this clock's parents
@@ -187,7 +203,7 @@ EXPORT_SYMBOL_GPL(clk_register_gpio_gate);
  * @active_low: true if gpio should be set to 0 to enable clock
  * @flags: clock flags
  */
-struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
+struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags)
 {
@@ -199,6 +215,20 @@ struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
 	return clk_register_gpio(dev, name, parent_names, num_parents,
 			gpio, active_low, flags, &clk_gpio_mux_ops);
 }
+EXPORT_SYMBOL_GPL(clk_hw_register_gpio_mux);
+
+struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
+		bool active_low, unsigned long flags)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_gpio_mux(dev, name, parent_names, num_parents,
+			gpio, active_low, flags);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
+}
 EXPORT_SYMBOL_GPL(clk_register_gpio_mux);
 
 static int gpio_clk_driver_probe(struct platform_device *pdev)
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 456c3ced1ac9..6c36c5e8ccbe 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -667,6 +667,10 @@ extern const struct clk_ops clk_gpio_gate_ops;
 struct clk *clk_register_gpio_gate(struct device *dev, const char *name,
 		const char *parent_name, unsigned gpio, bool active_low,
 		unsigned long flags);
+struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name,
+		const char *parent_name, unsigned gpio, bool active_low,
+		unsigned long flags);
+void clk_hw_unregister_gpio_gate(struct clk_hw *hw);
 
 /**
  * struct clk_gpio_mux - gpio controlled clock multiplexer
@@ -682,6 +686,10 @@ extern const struct clk_ops clk_gpio_mux_ops;
 struct clk *clk_register_gpio_mux(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents, unsigned gpio,
 		bool active_low, unsigned long flags);
+struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name,
+		const char * const *parent_names, u8 num_parents, unsigned gpio,
+		bool active_low, unsigned long flags);
+void clk_hw_unregister_gpio_mux(struct clk_hw *hw);
 
 /**
  * clk_register - allocate a new clock, register it and return an opaque cookie
-- 
cgit v1.2.3


From 26ef56be9e0944a9b136169eb47140f309ce745b Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Sun, 7 Feb 2016 00:34:13 -0800
Subject: clk: fixed-rate: Add hw based registration APIs

Add registration APIs in the clk fixed-rate code to return struct
clk_hw pointers instead of struct clk pointers. This way we hide
the struct clk pointer from providers unless they need to use
consumer facing APIs.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-fixed-rate.c | 44 ++++++++++++++++++++++++++++++++++++--------
 include/linux/clk-provider.h |  7 +++++++
 2 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-rate.c b/drivers/clk/clk-fixed-rate.c
index cd9dc925b3f8..8e4453eb54e8 100644
--- a/drivers/clk/clk-fixed-rate.c
+++ b/drivers/clk/clk-fixed-rate.c
@@ -45,8 +45,8 @@ const struct clk_ops clk_fixed_rate_ops = {
 EXPORT_SYMBOL_GPL(clk_fixed_rate_ops);
 
 /**
- * clk_register_fixed_rate_with_accuracy - register fixed-rate clock with the
- *					   clock framework
+ * clk_hw_register_fixed_rate_with_accuracy - register fixed-rate clock with
+ * the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
  * @parent_name: name of clock's parent
@@ -54,13 +54,14 @@ EXPORT_SYMBOL_GPL(clk_fixed_rate_ops);
  * @fixed_rate: non-adjustable clock rate
  * @fixed_accuracy: non-adjustable clock rate
  */
-struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev,
+struct clk_hw *clk_hw_register_fixed_rate_with_accuracy(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned long fixed_rate, unsigned long fixed_accuracy)
 {
 	struct clk_fixed_rate *fixed;
-	struct clk *clk;
+	struct clk_hw *hw;
 	struct clk_init_data init;
+	int ret;
 
 	/* allocate fixed-rate clock */
 	fixed = kzalloc(sizeof(*fixed), GFP_KERNEL);
@@ -79,22 +80,49 @@ struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev,
 	fixed->hw.init = &init;
 
 	/* register the clock */
-	clk = clk_register(dev, &fixed->hw);
-	if (IS_ERR(clk))
+	hw = &fixed->hw;
+	ret = clk_hw_register(dev, hw);
+	if (ret) {
 		kfree(fixed);
+		hw = ERR_PTR(ret);
+	}
 
-	return clk;
+	return hw;
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_rate_with_accuracy);
+
+struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		unsigned long fixed_rate, unsigned long fixed_accuracy)
+{
+	struct clk_hw *hw;
+
+	hw = clk_hw_register_fixed_rate_with_accuracy(dev, name, parent_name,
+			flags, fixed_rate, fixed_accuracy);
+	if (IS_ERR(hw))
+		return ERR_CAST(hw);
+	return hw->clk;
 }
 EXPORT_SYMBOL_GPL(clk_register_fixed_rate_with_accuracy);
 
 /**
- * clk_register_fixed_rate - register fixed-rate clock with the clock framework
+ * clk_hw_register_fixed_rate - register fixed-rate clock with the clock
+ * framework
  * @dev: device that is registering this clock
  * @name: name of this clock
  * @parent_name: name of clock's parent
  * @flags: framework-specific flags
  * @fixed_rate: non-adjustable clock rate
  */
+struct clk_hw *clk_hw_register_fixed_rate(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		unsigned long fixed_rate)
+{
+	return clk_hw_register_fixed_rate_with_accuracy(dev, name, parent_name,
+						     flags, fixed_rate, 0);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_rate);
+
 struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		unsigned long fixed_rate)
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 6c36c5e8ccbe..c3fc042d517c 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -282,10 +282,17 @@ extern const struct clk_ops clk_fixed_rate_ops;
 struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		unsigned long fixed_rate);
+struct clk_hw *clk_hw_register_fixed_rate(struct device *dev, const char *name,
+		const char *parent_name, unsigned long flags,
+		unsigned long fixed_rate);
 struct clk *clk_register_fixed_rate_with_accuracy(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned long fixed_rate, unsigned long fixed_accuracy);
 void clk_unregister_fixed_rate(struct clk *clk);
+struct clk_hw *clk_hw_register_fixed_rate_with_accuracy(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		unsigned long fixed_rate, unsigned long fixed_accuracy);
+
 void of_fixed_clk_setup(struct device_node *np);
 
 /**
-- 
cgit v1.2.3


From 607ea7cda6315be0ad8be2f98bc9de6f2d656ae6 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Mon, 18 Apr 2016 14:41:10 +0300
Subject: net/ipv6/addrconf: simplify sysctl registration

Struct ctl_table_header holds pointer to sysctl table which could be used
for freeing it after unregistration. IPv4 sysctls already use that.
Remove redundant NULL assignment: ndev allocated using kzalloc.

This also saves some bytes: sysctl table could be shorter than
DEVCONF_MAX+1 if some options are disable in config.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h |  3 ++-
 net/ipv6/addrconf.c  | 43 +++++++++++++++++--------------------------
 2 files changed, 19 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7edc14fb66b6..58d6e158755f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -63,7 +63,8 @@ struct ipv6_devconf {
 	} stable_secret;
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
-	void		*sysctl;
+
+	struct ctl_table_header *sysctl_header;
 };
 
 struct ipv6_params {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a6c99275bd8c..5a42c0fe0449 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -359,7 +359,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
 
 	ndev->cnf.mtu6 = dev->mtu;
-	ndev->cnf.sysctl = NULL;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
 	if (!ndev->nd_parms) {
 		kfree(ndev);
@@ -5620,13 +5619,7 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
 	return ret;
 }
 
-static struct addrconf_sysctl_table
-{
-	struct ctl_table_header *sysctl_header;
-	struct ctl_table addrconf_vars[DEVCONF_MAX+1];
-} addrconf_sysctl __read_mostly = {
-	.sysctl_header = NULL,
-	.addrconf_vars = {
+static const struct ctl_table addrconf_sysctl[] = {
 		{
 			.procname	= "forwarding",
 			.data		= &ipv6_devconf.forwarding,
@@ -5944,52 +5937,50 @@ static struct addrconf_sysctl_table
 		{
 			/* sentinel */
 		}
-	},
 };
 
 static int __addrconf_sysctl_register(struct net *net, char *dev_name,
 		struct inet6_dev *idev, struct ipv6_devconf *p)
 {
 	int i;
-	struct addrconf_sysctl_table *t;
+	struct ctl_table *table;
 	char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
 
-	t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
-	if (!t)
+	table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL);
+	if (!table)
 		goto out;
 
-	for (i = 0; t->addrconf_vars[i].data; i++) {
-		t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
-		t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
-		t->addrconf_vars[i].extra2 = net;
+	for (i = 0; table[i].data; i++) {
+		table[i].data += (char *)p - (char *)&ipv6_devconf;
+		table[i].extra1 = idev; /* embedded; no ref */
+		table[i].extra2 = net;
 	}
 
 	snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
 
-	t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars);
-	if (!t->sysctl_header)
+	p->sysctl_header = register_net_sysctl(net, path, table);
+	if (!p->sysctl_header)
 		goto free;
 
-	p->sysctl = t;
 	return 0;
 
 free:
-	kfree(t);
+	kfree(table);
 out:
 	return -ENOBUFS;
 }
 
 static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
 {
-	struct addrconf_sysctl_table *t;
+	struct ctl_table *table;
 
-	if (!p->sysctl)
+	if (!p->sysctl_header)
 		return;
 
-	t = p->sysctl;
-	p->sysctl = NULL;
-	unregister_net_sysctl_table(t->sysctl_header);
-	kfree(t);
+	table = p->sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(p->sysctl_header);
+	p->sysctl_header = NULL;
+	kfree(table);
 }
 
 static int addrconf_sysctl_register(struct inet6_dev *idev)
-- 
cgit v1.2.3


From bd570ff970a54df653b48ed0cfb373f2ebed083d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 18 Apr 2016 21:01:24 +0200
Subject: bpf: add event output helper for notifications/sampling/logging

This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").

The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.

User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.

While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.

Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/core.c        |  7 +++++++
 kernel/trace/bpf_trace.c | 27 +++++++++++++++++++++++++++
 net/core/filter.c        |  2 ++
 4 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fb3c610fa96..f63afdc43bec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,7 +169,9 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
+const struct bpf_func_proto *bpf_get_event_output_proto(void);
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index be0abf669ced..e4248fe79513 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -764,14 +764,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
 	return NULL;
 }
 
+const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+{
+	return NULL;
+}
+
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
 	.func		= NULL,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b3cc24cb4321..780bcbe1d4de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -277,6 +277,33 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
+static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+
+static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+
+	perf_fetch_caller_regs(regs);
+
+	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+}
+
+static const struct bpf_func_proto bpf_event_output_proto = {
+	.func		= bpf_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+const struct bpf_func_proto *bpf_get_event_output_proto(void)
+{
+	return &bpf_event_output_proto;
+}
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 5d2ac2b9d1c4..218e5de8c402 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2039,6 +2039,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
 		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_perf_event_output:
+		return bpf_get_event_output_proto();
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From b853cb9628bfbcc4017da46d5f5b46e3eba9d8c6 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Mon, 28 Mar 2016 21:35:22 -0700
Subject: soc: qcom: smd: Make callback pass channel reference

By passing the smd channel reference to the callback, rather than the
smd device, we can open additional smd channels from sub-devices of smd
devices.

Also updates the two smd clients today found in mainline.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/smd-rpm.c    |  9 ++++++---
 drivers/soc/qcom/smd.c        | 22 ++++++++++++++++++----
 drivers/soc/qcom/wcnss_ctrl.c |  8 ++++----
 include/linux/soc/qcom/smd.h  |  7 +++++--
 4 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smd-rpm.c b/drivers/soc/qcom/smd-rpm.c
index 731fa066f712..6609d7e0edb0 100644
--- a/drivers/soc/qcom/smd-rpm.c
+++ b/drivers/soc/qcom/smd-rpm.c
@@ -33,6 +33,7 @@
  */
 struct qcom_smd_rpm {
 	struct qcom_smd_channel *rpm_channel;
+	struct device *dev;
 
 	struct completion ack;
 	struct mutex lock;
@@ -149,14 +150,14 @@ out:
 }
 EXPORT_SYMBOL(qcom_rpm_smd_write);
 
-static int qcom_smd_rpm_callback(struct qcom_smd_device *qsdev,
+static int qcom_smd_rpm_callback(struct qcom_smd_channel *channel,
 				 const void *data,
 				 size_t count)
 {
 	const struct qcom_rpm_header *hdr = data;
 	size_t hdr_length = le32_to_cpu(hdr->length);
 	const struct qcom_rpm_message *msg;
-	struct qcom_smd_rpm *rpm = dev_get_drvdata(&qsdev->dev);
+	struct qcom_smd_rpm *rpm = qcom_smd_get_drvdata(channel);
 	const u8 *buf = data + sizeof(struct qcom_rpm_header);
 	const u8 *end = buf + hdr_length;
 	char msgbuf[32];
@@ -165,7 +166,7 @@ static int qcom_smd_rpm_callback(struct qcom_smd_device *qsdev,
 
 	if (le32_to_cpu(hdr->service_type) != RPM_SERVICE_TYPE_REQUEST ||
 	    hdr_length < sizeof(struct qcom_rpm_message)) {
-		dev_err(&qsdev->dev, "invalid request\n");
+		dev_err(rpm->dev, "invalid request\n");
 		return 0;
 	}
 
@@ -206,7 +207,9 @@ static int qcom_smd_rpm_probe(struct qcom_smd_device *sdev)
 	mutex_init(&rpm->lock);
 	init_completion(&rpm->ack);
 
+	rpm->dev = &sdev->dev;
 	rpm->rpm_channel = sdev->channel;
+	qcom_smd_set_drvdata(sdev->channel, rpm);
 
 	dev_set_drvdata(&sdev->dev, rpm);
 
diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index b6434c4be86a..ac1957dfdf24 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -194,6 +194,8 @@ struct qcom_smd_channel {
 
 	int pkt_size;
 
+	void *drvdata;
+
 	struct list_head list;
 	struct list_head dev_list;
 };
@@ -513,7 +515,6 @@ static void qcom_smd_channel_advance(struct qcom_smd_channel *channel,
  */
 static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel)
 {
-	struct qcom_smd_device *qsdev = channel->qsdev;
 	unsigned tail;
 	size_t len;
 	void *ptr;
@@ -533,7 +534,7 @@ static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel)
 		len = channel->pkt_size;
 	}
 
-	ret = channel->cb(qsdev, ptr, len);
+	ret = channel->cb(channel, ptr, len);
 	if (ret < 0)
 		return ret;
 
@@ -1034,6 +1035,18 @@ int qcom_smd_driver_register(struct qcom_smd_driver *qsdrv)
 }
 EXPORT_SYMBOL(qcom_smd_driver_register);
 
+void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel)
+{
+	return channel->drvdata;
+}
+EXPORT_SYMBOL(qcom_smd_get_drvdata);
+
+void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data)
+{
+	channel->drvdata = data;
+}
+EXPORT_SYMBOL(qcom_smd_set_drvdata);
+
 /**
  * qcom_smd_driver_unregister - unregister a smd driver
  * @qsdrv:	qcom_smd_driver struct
@@ -1079,12 +1092,13 @@ qcom_smd_find_channel(struct qcom_smd_edge *edge, const char *name)
  * Returns a channel handle on success, or -EPROBE_DEFER if the channel isn't
  * ready.
  */
-struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *parent,
 					       const char *name,
 					       qcom_smd_cb_t cb)
 {
 	struct qcom_smd_channel *channel;
-	struct qcom_smd_edge *edge = sdev->channel->edge;
+	struct qcom_smd_device *sdev = parent->qsdev;
+	struct qcom_smd_edge *edge = parent->edge;
 	int ret;
 
 	/* Wait up to HZ for the channel to appear */
diff --git a/drivers/soc/qcom/wcnss_ctrl.c b/drivers/soc/qcom/wcnss_ctrl.c
index 7a986f881d5c..c544f3d2c6ee 100644
--- a/drivers/soc/qcom/wcnss_ctrl.c
+++ b/drivers/soc/qcom/wcnss_ctrl.c
@@ -100,17 +100,17 @@ struct wcnss_download_nv_resp {
 
 /**
  * wcnss_ctrl_smd_callback() - handler from SMD responses
- * @qsdev:	smd device handle
+ * @channel:	smd channel handle
  * @data:	pointer to the incoming data packet
  * @count:	size of the incoming data packet
  *
  * Handles any incoming packets from the remote WCNSS_CTRL service.
  */
-static int wcnss_ctrl_smd_callback(struct qcom_smd_device *qsdev,
+static int wcnss_ctrl_smd_callback(struct qcom_smd_channel *channel,
 				   const void *data,
 				   size_t count)
 {
-	struct wcnss_ctrl *wcnss = dev_get_drvdata(&qsdev->dev);
+	struct wcnss_ctrl *wcnss = qcom_smd_get_drvdata(channel);
 	const struct wcnss_download_nv_resp *nvresp;
 	const struct wcnss_version_resp *version;
 	const struct wcnss_msg_hdr *hdr = data;
@@ -246,7 +246,7 @@ static int wcnss_ctrl_probe(struct qcom_smd_device *sdev)
 	init_completion(&wcnss->ack);
 	INIT_WORK(&wcnss->download_nv_work, wcnss_download_nv);
 
-	dev_set_drvdata(&sdev->dev, wcnss);
+	qcom_smd_set_drvdata(sdev->channel, wcnss);
 
 	return wcnss_request_version(wcnss);
 }
diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index bd51c8a9d807..cb2f81559bc0 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -26,7 +26,7 @@ struct qcom_smd_device {
 	struct qcom_smd_channel *channel;
 };
 
-typedef int (*qcom_smd_cb_t)(struct qcom_smd_device *, const void *, size_t);
+typedef int (*qcom_smd_cb_t)(struct qcom_smd_channel *, const void *, size_t);
 
 /**
  * struct qcom_smd_driver - smd driver struct
@@ -50,13 +50,16 @@ struct qcom_smd_driver {
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
 void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
+void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel);
+void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data);
+
 #define module_qcom_smd_driver(__smd_driver) \
 	module_driver(__smd_driver, qcom_smd_driver_register, \
 		      qcom_smd_driver_unregister)
 
 int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
-struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_device *sdev,
+struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *channel,
 					       const char *name,
 					       qcom_smd_cb_t cb);
 
-- 
cgit v1.2.3


From 12524e348bcb2189a8cf43829e90256f7e7d4f3d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 13 Apr 2016 11:18:06 +0200
Subject: clk: renesas: mstp: Provide dummy attach/detach_dev callbacks

Provide dummy cpg_mstp_{at,de}tach_dev() PM Domain callbacks if CPG/MSTP
support is not included, so the rcar-sysc driver won't have to care
about this.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
---
 include/linux/clk/renesas.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h
index 095b1681daf4..ab57a298a374 100644
--- a/include/linux/clk/renesas.h
+++ b/include/linux/clk/renesas.h
@@ -25,7 +25,12 @@ void r8a7779_clocks_init(u32 mode);
 void rcar_gen2_clocks_init(u32 mode);
 
 void cpg_mstp_add_clk_domain(struct device_node *np);
+#ifdef CONFIG_CLK_RENESAS_CPG_MSTP
 int cpg_mstp_attach_dev(struct generic_pm_domain *unused, struct device *dev);
 void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev);
+#else
+#define cpg_mstp_attach_dev	NULL
+#define cpg_mstp_detach_dev	NULL
+#endif
 
 #endif
-- 
cgit v1.2.3


From 2066390ad47b374f3d35075a32325b47d15bf735 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 4 Mar 2016 17:03:46 +0100
Subject: clk: renesas: cpg-mssr: Export cpg_mssr_{at,de}tach_dev()

The R-Car SYSC PM Domain driver has to power manage devices in power
areas using clocks. To reuse code and to share knowledge of clocks
suitable for power management, this is ideally done through the existing
cpg_mssr_attach_dev() and cpg_mssr_detach_dev() callbacks.

Hence these callbacks can no longer rely on their "domain" parameter
pointing to the CPG/MSSR Clock Domain. To handle this, keep a pointer to
the clock domain in a static variable. cpg_mssr_attach_dev() has to
support probe deferral, as the R-Car SYSC PM Domain may be initialized,
and devices may be added to it, before the CPG/MSSR Clock Domain is
initialized.

Dummy callbacks are provided for the case where CPG/MSTP support is not
included, so the rcar-sysc driver won't have to care about this.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
---
 drivers/clk/renesas/renesas-cpg-mssr.c | 18 ++++++++++++------
 include/linux/clk/renesas.h            |  7 +++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/renesas/renesas-cpg-mssr.c b/drivers/clk/renesas/renesas-cpg-mssr.c
index 703bdb157528..1f2dc3629f0e 100644
--- a/drivers/clk/renesas/renesas-cpg-mssr.c
+++ b/drivers/clk/renesas/renesas-cpg-mssr.c
@@ -15,6 +15,7 @@
 
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
+#include <linux/clk/renesas.h>
 #include <linux/device.h>
 #include <linux/init.h>
 #include <linux/mod_devicetable.h>
@@ -388,6 +389,8 @@ struct cpg_mssr_clk_domain {
 	unsigned int core_pm_clks[0];
 };
 
+static struct cpg_mssr_clk_domain *cpg_mssr_clk_domain;
+
 static bool cpg_mssr_is_pm_clk(const struct of_phandle_args *clkspec,
 			       struct cpg_mssr_clk_domain *pd)
 {
@@ -411,17 +414,20 @@ static bool cpg_mssr_is_pm_clk(const struct of_phandle_args *clkspec,
 	}
 }
 
-static int cpg_mssr_attach_dev(struct generic_pm_domain *genpd,
-			       struct device *dev)
+int cpg_mssr_attach_dev(struct generic_pm_domain *unused, struct device *dev)
 {
-	struct cpg_mssr_clk_domain *pd =
-		container_of(genpd, struct cpg_mssr_clk_domain, genpd);
+	struct cpg_mssr_clk_domain *pd = cpg_mssr_clk_domain;
 	struct device_node *np = dev->of_node;
 	struct of_phandle_args clkspec;
 	struct clk *clk;
 	int i = 0;
 	int error;
 
+	if (!pd) {
+		dev_dbg(dev, "CPG/MSSR clock domain not yet available\n");
+		return -EPROBE_DEFER;
+	}
+
 	while (!of_parse_phandle_with_args(np, "clocks", "#clock-cells", i,
 					   &clkspec)) {
 		if (cpg_mssr_is_pm_clk(&clkspec, pd))
@@ -461,8 +467,7 @@ fail_put:
 	return error;
 }
 
-static void cpg_mssr_detach_dev(struct generic_pm_domain *genpd,
-				struct device *dev)
+void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev)
 {
 	if (!list_empty(&dev->power.subsys_data->clock_list))
 		pm_clk_destroy(dev);
@@ -491,6 +496,7 @@ static int __init cpg_mssr_add_clk_domain(struct device *dev,
 	pm_genpd_init(genpd, &simple_qos_governor, false);
 	genpd->attach_dev = cpg_mssr_attach_dev;
 	genpd->detach_dev = cpg_mssr_detach_dev;
+	cpg_mssr_clk_domain = pd;
 
 	of_genpd_add_provider_simple(np, genpd);
 	return 0;
diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h
index ab57a298a374..ba6fa4148515 100644
--- a/include/linux/clk/renesas.h
+++ b/include/linux/clk/renesas.h
@@ -33,4 +33,11 @@ void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev);
 #define cpg_mstp_detach_dev	NULL
 #endif
 
+#ifdef CONFIG_CLK_RENESAS_CPG_MSSR
+int cpg_mssr_attach_dev(struct generic_pm_domain *unused, struct device *dev);
+void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev);
+#else
+#define cpg_mssr_attach_dev	NULL
+#define cpg_mssr_detach_dev	NULL
+#endif
 #endif
-- 
cgit v1.2.3


From 2e4682ba2ed79d8082b78d292b3b80f54d970b7a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 10 Mar 2016 16:30:22 +0100
Subject: KVM: add missing memory barrier in kvm_{make,check}_request

kvm_make_request and kvm_check_request imply a producer-consumer
relationship; add implicit memory barriers to them.  There was indeed
already a place that was adding an explicit smp_mb() to order between
kvm_check_request and the processing of the request.  That memory
barrier can be removed (as an added benefit, kvm_check_request can use
smp_mb__after_atomic which is free on x86).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/irq_comm.c  |  3 ---
 include/linux/kvm_host.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 54ead79e444b..dfb4c6476877 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -382,9 +382,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
 	u32 i, nr_ioapic_pins;
 	int idx;
 
-	/* kvm->irq_routing must be read after clearing
-	 * KVM_SCAN_IOAPIC. */
-	smp_mb();
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
 	nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5276fe0916fc..ad40d44784c7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1091,6 +1091,11 @@ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
 
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
+	/*
+	 * Ensure the rest of the request is published to kvm_check_request's
+	 * caller.  Paired with the smp_mb__after_atomic in kvm_check_request.
+	 */
+	smp_wmb();
 	set_bit(req, &vcpu->requests);
 }
 
@@ -1098,6 +1103,12 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
 	if (test_bit(req, &vcpu->requests)) {
 		clear_bit(req, &vcpu->requests);
+
+		/*
+		 * Ensure the rest of the request is visible to kvm_check_request's
+		 * caller.  Paired with the smp_wmb in kvm_make_request.
+		 */
+		smp_mb__after_atomic();
 		return true;
 	} else {
 		return false;
-- 
cgit v1.2.3


From 9257b4a206fc0229dd5f84b78e4d1ebf3f91d270 Mon Sep 17 00:00:00 2001
From: Omer Peleg <omer@cs.technion.ac.il>
Date: Wed, 20 Apr 2016 11:34:11 +0300
Subject: iommu/iova: introduce per-cpu caching to iova allocation

IOVA allocation has two problems that impede high-throughput I/O.
First, it can do a linear search over the allocated IOVA ranges.
Second, the rbtree spinlock that serializes IOVA allocations becomes
contended.

Address these problems by creating an API for caching allocated IOVA
ranges, so that the IOVA allocator isn't accessed frequently.  This
patch adds a per-CPU cache, from which CPUs can alloc/free IOVAs
without taking the rbtree spinlock.  The per-CPU caches are backed by
a global cache, to avoid invoking the (linear-time) IOVA allocator
without needing to make the per-CPU cache size excessive.  This design
is based on magazines, as described in "Magazines and Vmem: Extending
the Slab Allocator to Many CPUs and Arbitrary Resources" (currently
available at https://www.usenix.org/legacy/event/usenix01/bonwick.html)

Adding caching on top of the existing rbtree allocator maintains the
property that IOVAs are densely packed in the IO virtual address space,
which is important for keeping IOMMU page table usage low.

To keep the cache size reasonable, we bound the IOVA space a CPU can
cache by 32 MiB (we cache a bounded number of IOVA ranges, and only
ranges of size <= 128 KiB).  The shared global cache is bounded at
4 MiB of IOVA space.

Signed-off-by: Omer Peleg <omer@cs.technion.ac.il>
[mad@cs.technion.ac.il: rebased, cleaned up and reworded the commit message]
Signed-off-by: Adam Morrison <mad@cs.technion.ac.il>
Reviewed-by: Shaohua Li <shli@fb.com>
Reviewed-by: Ben Serebrin <serebrin@google.com>
[dwmw2: split out VT-d part into a separate patch]
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/iommu/iova.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/iova.h |  23 ++-
 2 files changed, 414 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index fa0adef32bd6..ba764a0835d3 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -20,6 +20,17 @@
 #include <linux/iova.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+			       unsigned long pfn,
+			       unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+				     unsigned long size,
+				     unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -38,6 +49,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->granule = granule;
 	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
+	init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -291,33 +303,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	unsigned long flags;
-	struct rb_node *node;
+	struct rb_node *node = iovad->rbroot.rb_node;
+
+	assert_spin_locked(&iovad->iova_rbtree_lock);
 
-	/* Take the lock so that no other thread is manipulating the rbtree */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	node = iovad->rbroot.rb_node;
 	while (node) {
 		struct iova *iova = container_of(node, struct iova, node);
 
 		/* If pfn falls within iova's range, return iova */
 		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			/* We are not holding the lock while this iova
-			 * is referenced by the caller as the same thread
-			 * which called this function also calls __free_iova()
-			 * and it is by design that only one thread can possibly
-			 * reference a particular iova and hence no conflict.
-			 */
 			return iova;
 		}
 
@@ -327,9 +324,35 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
 			node = node->rb_right;
 	}
 
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 	return NULL;
 }
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	assert_spin_locked(&iovad->iova_rbtree_lock);
+	__cached_rbnode_delete_update(iovad, iova);
+	rb_erase(&iova->node, &iovad->rbroot);
+	free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct iova *iova;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = private_find_iova(iovad, pfn);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -344,10 +367,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	__cached_rbnode_delete_update(iovad, iova);
-	rb_erase(&iova->node, &iovad->rbroot);
+	private_free_iova(iovad, iova);
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	free_iova_mem(iova);
 }
 EXPORT_SYMBOL_GPL(__free_iova);
 
@@ -369,6 +390,63 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure.
+*/
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+		unsigned long limit_pfn)
+{
+	bool flushed_rcache = false;
+	unsigned long iova_pfn;
+	struct iova *new_iova;
+
+	iova_pfn = iova_rcache_get(iovad, size, limit_pfn);
+	if (iova_pfn)
+		return iova_pfn;
+
+retry:
+	new_iova = alloc_iova(iovad, size, limit_pfn, true);
+	if (!new_iova) {
+		unsigned int cpu;
+
+		if (flushed_rcache)
+			return 0;
+
+		/* Try replenishing IOVAs by flushing rcache. */
+		flushed_rcache = true;
+		for_each_online_cpu(cpu)
+			free_cpu_cached_iovas(cpu, iovad);
+		goto retry;
+	}
+
+	return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * @size: - # of pages in range
+ * This functions frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+	if (iova_rcache_insert(iovad, pfn, size))
+		return;
+
+	free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
 /**
  * put_iova_domain - destroys the iova doamin
  * @iovad: - iova domain in question.
@@ -379,6 +457,7 @@ void put_iova_domain(struct iova_domain *iovad)
 	struct rb_node *node;
 	unsigned long flags;
 
+	free_iova_rcaches(iovad);
 	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	node = rb_first(&iovad->rbroot);
 	while (node) {
@@ -550,5 +629,295 @@ error:
 	return NULL;
 }
 
+/*
+ * Magazine caches for IOVA ranges.  For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+	unsigned long size;
+	unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+	spinlock_t lock;
+	struct iova_magazine *loaded;
+	struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+	return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+	kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+	unsigned long flags;
+	int i;
+
+	if (!mag)
+		return;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+	for (i = 0 ; i < mag->size; ++i) {
+		struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+		BUG_ON(!iova);
+		private_free_iova(iovad, iova);
+	}
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+	mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+	return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+	return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+				       unsigned long limit_pfn)
+{
+	BUG_ON(iova_magazine_empty(mag));
+
+	if (mag->pfns[mag->size - 1] >= limit_pfn)
+		return 0;
+
+	return mag->pfns[--mag->size];
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+	BUG_ON(iova_magazine_full(mag));
+
+	mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	struct iova_rcache *rcache;
+	unsigned int cpu;
+	int i;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		spin_lock_init(&rcache->lock);
+		rcache->depot_size = 0;
+		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+		if (WARN_ON(!rcache->cpu_rcaches))
+			continue;
+		for_each_possible_cpu(cpu) {
+			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+			spin_lock_init(&cpu_rcache->lock);
+			cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+			cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+		}
+	}
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success.  Can fail if rcache is full and we can't free
+ * space, and free_iova() (our only caller) will then return the IOVA
+ * range to the rbtree instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+				 struct iova_rcache *rcache,
+				 unsigned long iova_pfn)
+{
+	struct iova_magazine *mag_to_free = NULL;
+	struct iova_cpu_rcache *cpu_rcache;
+	bool can_insert = false;
+	unsigned long flags;
+
+	cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+	spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+	if (!iova_magazine_full(cpu_rcache->loaded)) {
+		can_insert = true;
+	} else if (!iova_magazine_full(cpu_rcache->prev)) {
+		swap(cpu_rcache->prev, cpu_rcache->loaded);
+		can_insert = true;
+	} else {
+		struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+		if (new_mag) {
+			spin_lock(&rcache->lock);
+			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+				rcache->depot[rcache->depot_size++] =
+						cpu_rcache->loaded;
+			} else {
+				mag_to_free = cpu_rcache->loaded;
+			}
+			spin_unlock(&rcache->lock);
+
+			cpu_rcache->loaded = new_mag;
+			can_insert = true;
+		}
+	}
+
+	if (can_insert)
+		iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+	if (mag_to_free) {
+		iova_magazine_free_pfns(mag_to_free, iovad);
+		iova_magazine_free(mag_to_free);
+	}
+
+	return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+			       unsigned long size)
+{
+	unsigned int log_size = order_base_2(size);
+
+	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+		return false;
+
+	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
+ * satisfy the request, return a matching non-NULL range and remove
+ * it from the 'rcache'.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+				       unsigned long limit_pfn)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	unsigned long iova_pfn = 0;
+	bool has_pfn = false;
+	unsigned long flags;
+
+	cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+	spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+	if (!iova_magazine_empty(cpu_rcache->loaded)) {
+		has_pfn = true;
+	} else if (!iova_magazine_empty(cpu_rcache->prev)) {
+		swap(cpu_rcache->prev, cpu_rcache->loaded);
+		has_pfn = true;
+	} else {
+		spin_lock(&rcache->lock);
+		if (rcache->depot_size > 0) {
+			iova_magazine_free(cpu_rcache->loaded);
+			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+			has_pfn = true;
+		}
+		spin_unlock(&rcache->lock);
+	}
+
+	if (has_pfn)
+		iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+	return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache.  Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+				     unsigned long size,
+				     unsigned long limit_pfn)
+{
+	unsigned int log_size = order_base_2(size);
+
+	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+		return 0;
+
+	return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn);
+}
+
+/*
+ * Free a cpu's rcache.
+ */
+static void free_cpu_iova_rcache(unsigned int cpu, struct iova_domain *iovad,
+				 struct iova_rcache *rcache)
+{
+	struct iova_cpu_rcache *cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+	iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+	iova_magazine_free(cpu_rcache->loaded);
+
+	iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+	iova_magazine_free(cpu_rcache->prev);
+
+	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+	struct iova_rcache *rcache;
+	unsigned long flags;
+	unsigned int cpu;
+	int i, j;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		for_each_possible_cpu(cpu)
+			free_cpu_iova_rcache(cpu, iovad, rcache);
+		spin_lock_irqsave(&rcache->lock, flags);
+		free_percpu(rcache->cpu_rcaches);
+		for (j = 0; j < rcache->depot_size; ++j) {
+			iova_magazine_free_pfns(rcache->depot[j], iovad);
+			iova_magazine_free(rcache->depot[j]);
+		}
+		spin_unlock_irqrestore(&rcache->lock, flags);
+	}
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+	struct iova_cpu_rcache *cpu_rcache;
+	struct iova_rcache *rcache;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+		rcache = &iovad->rcaches[i];
+		cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+		spin_lock_irqsave(&cpu_rcache->lock, flags);
+		iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+		iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+		spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+	}
+}
+
 MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 92f7177db2ce..f27bb2c62fca 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -19,8 +19,21 @@
 /* iova structure */
 struct iova {
 	struct rb_node	node;
-	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
-	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	unsigned long	pfn_hi; /* Highest allocated pfn */
+	unsigned long	pfn_lo; /* Lowest allocated pfn */
+};
+
+struct iova_magazine;
+struct iova_cpu_rcache;
+
+#define IOVA_RANGE_CACHE_MAX_SIZE 6	/* log of max cached IOVA range size (in pages) */
+#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
+
+struct iova_rcache {
+	spinlock_t lock;
+	unsigned long depot_size;
+	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
 /* holds all the iova translations for a domain */
@@ -31,6 +44,7 @@ struct iova_domain {
 	unsigned long	granule;	/* pfn granularity for this domain */
 	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
+	struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];	/* IOVA range caches */
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -78,6 +92,10 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
 	bool size_aligned);
+void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
+		    unsigned long size);
+unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+			      unsigned long limit_pfn);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
@@ -87,5 +105,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
 	struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi);
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 
 #endif
-- 
cgit v1.2.3


From 80e0f8d94d3090f0f7bf3faf3e6180e920ee0d22 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 24 Feb 2016 14:12:59 +0530
Subject: pinctrl: Add devm_ apis for pinctrl_{register, unregister}

Add device managed APIs devm_pinctrl_register() and
devm_pinctrl_unregister() for the APIs pinctrl_register()
and pinctrl_unregister().

This helps in reducing code in error path and sometimes
removal of .remove callback for driver unbind.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Acked-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 63 +++++++++++++++++++++++++++++++++++++++++
 include/linux/pinctrl/pinctrl.h |  6 ++++
 2 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index f67a8b7a4e18..21df52e8192a 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1872,6 +1872,69 @@ void pinctrl_unregister(struct pinctrl_dev *pctldev)
 }
 EXPORT_SYMBOL_GPL(pinctrl_unregister);
 
+static void devm_pinctrl_dev_release(struct device *dev, void *res)
+{
+	struct pinctrl_dev *pctldev = *(struct pinctrl_dev **)res;
+
+	pinctrl_unregister(pctldev);
+}
+
+static int devm_pinctrl_dev_match(struct device *dev, void *res, void *data)
+{
+	struct pctldev **r = res;
+
+	if (WARN_ON(!r || !*r)
+		return 0;
+
+	return *r == data;
+}
+
+/**
+ * devm_pinctrl_register() - Resource managed version of pinctrl_register().
+ * @dev: parent device for this pin controller
+ * @pctldesc: descriptor for this pin controller
+ * @driver_data: private pin controller data for this pin controller
+ *
+ * Returns an error pointer if pincontrol register failed. Otherwise
+ * it returns valid pinctrl handle.
+ *
+ * The pinctrl device will be automatically released when the device is unbound.
+ */
+struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
+					  struct pinctrl_desc *pctldesc,
+					  void *driver_data)
+{
+	struct pinctrl_dev **ptr, *pctldev;
+
+	ptr = devres_alloc(devm_pinctrl_dev_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	pctldev = pinctrl_register(pctldesc, dev, driver_data);
+	if (IS_ERR(pctldev)) {
+		devres_free(ptr);
+		return pctldev;
+	}
+
+	*ptr = pctldev;
+	devres_add(dev, ptr);
+
+	return pctldev;
+}
+EXPORT_SYMBOL_GPL(devm_pinctrl_register);
+
+/**
+ * devm_pinctrl_unregister() - Resource managed version of pinctrl_unregister().
+ * @dev: device for which which resource was allocated
+ * @pctldev: the pinctrl device to unregister.
+ */
+void devm_pinctrl_unregister(struct device *dev, struct pinctrl_dev *pctldev)
+{
+	WARN_ON(devres_release(dev, devm_pinctrl_dev_release,
+			       devm_pinctrl_dev_match, pctldev));
+}
+EXPORT_SYMBOL_GPL(devm_pinctrl_unregister);
+
 static int __init pinctrl_init(void)
 {
 	pr_info("initialized pinctrl subsystem\n");
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 9ba59fcba549..a42e57da270d 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -144,6 +144,12 @@ struct pinctrl_desc {
 extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
 				struct device *dev, void *driver_data);
 extern void pinctrl_unregister(struct pinctrl_dev *pctldev);
+extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
+				struct pinctrl_desc *pctldesc,
+				void *driver_data);
+extern void devm_pinctrl_unregister(struct device *dev,
+				struct pinctrl_dev *pctldev);
+
 extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin);
 extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
-- 
cgit v1.2.3


From b53f27e4fa0d0e72d897830cc4f3f83d2a25d952 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 20 Apr 2016 15:46:23 -0700
Subject: string_helpers: add kstrdup_quotable

Handle allocating and escaping a string safe for logging.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/string_helpers.h |  2 ++
 lib/string_helpers.c           | 28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index dabe643eb5fa..9de228af00c1 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -68,4 +68,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst,
 	return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
 }
 
+char *kstrdup_quotable(const char *src, gfp_t gfp);
+
 #endif
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5c88204b6f1f..aa00c9f989ee 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -10,6 +10,7 @@
 #include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/string_helpers.h>
 
@@ -534,3 +535,30 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 	return p - dst;
 }
 EXPORT_SYMBOL(string_escape_mem);
+
+/*
+ * Return an allocated string that has been escaped of special characters
+ * and double quotes, making it safe to log in quotes.
+ */
+char *kstrdup_quotable(const char *src, gfp_t gfp)
+{
+	size_t slen, dlen;
+	char *dst;
+	const int flags = ESCAPE_HEX;
+	const char esc[] = "\f\n\r\t\v\a\e\\\"";
+
+	if (!src)
+		return NULL;
+	slen = strlen(src);
+
+	dlen = string_escape_mem(src, slen, NULL, 0, flags, esc);
+	dst = kmalloc(dlen + 1, gfp);
+	if (!dst)
+		return NULL;
+
+	WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen);
+	dst[dlen] = '\0';
+
+	return dst;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable);
-- 
cgit v1.2.3


From 0d0443288f2244d7054796086e481ddef6abdbba Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 20 Apr 2016 15:46:24 -0700
Subject: string_helpers: add kstrdup_quotable_cmdline

Provide an escaped (but readable: no inter-argument NULLs) commandline
safe for logging.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/string_helpers.h |  1 +
 lib/string_helpers.c           | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9de228af00c1..684d2695fc36 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -69,5 +69,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst,
 }
 
 char *kstrdup_quotable(const char *src, gfp_t gfp);
+char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp);
 
 #endif
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index aa00c9f989ee..b16ee85aaf87 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -10,6 +10,7 @@
 #include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/string_helpers.h>
@@ -562,3 +563,36 @@ char *kstrdup_quotable(const char *src, gfp_t gfp)
 	return dst;
 }
 EXPORT_SYMBOL_GPL(kstrdup_quotable);
+
+/*
+ * Returns allocated NULL-terminated string containing process
+ * command line, with inter-argument NULLs replaced with spaces,
+ * and other special characters escaped.
+ */
+char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
+{
+	char *buffer, *quoted;
+	int i, res;
+
+	buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY);
+	if (!buffer)
+		return NULL;
+
+	res = get_cmdline(task, buffer, PAGE_SIZE - 1);
+	buffer[res] = '\0';
+
+	/* Collapse trailing NULLs, leave res pointing to last non-NULL. */
+	while (--res >= 0 && buffer[res] == '\0')
+		;
+
+	/* Replace inter-argument NULLs. */
+	for (i = 0; i <= res; i++)
+		if (buffer[i] == '\0')
+			buffer[i] = ' ';
+
+	/* Make sure result is printable. */
+	quoted = kstrdup_quotable(buffer, gfp);
+	kfree(buffer);
+	return quoted;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline);
-- 
cgit v1.2.3


From 21985319add60b55fc27230d9421a3e5af7e998a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 20 Apr 2016 15:46:25 -0700
Subject: string_helpers: add kstrdup_quotable_file

Allocate a NULL-terminated file path with special characters escaped,
safe for logging.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/string_helpers.h |  3 +++
 lib/string_helpers.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 684d2695fc36..5ce9538f290e 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -3,6 +3,8 @@
 
 #include <linux/types.h>
 
+struct file;
+
 /* Descriptions of the types of units to
  * print in */
 enum string_size_units {
@@ -70,5 +72,6 @@ static inline int string_escape_str_any_np(const char *src, char *dst,
 
 char *kstrdup_quotable(const char *src, gfp_t gfp);
 char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp);
+char *kstrdup_quotable_file(struct file *file, gfp_t gfp);
 
 #endif
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index b16ee85aaf87..ecaac2c0526f 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -10,6 +10,8 @@
 #include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -596,3 +598,31 @@ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
 	return quoted;
 }
 EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline);
+
+/*
+ * Returns allocated NULL-terminated string containing pathname,
+ * with special characters escaped, able to be safely logged. If
+ * there is an error, the leading character will be "<".
+ */
+char *kstrdup_quotable_file(struct file *file, gfp_t gfp)
+{
+	char *temp, *pathname;
+
+	if (!file)
+		return kstrdup("<unknown>", gfp);
+
+	/* We add 11 spaces for ' (deleted)' to be appended */
+	temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY);
+	if (!temp)
+		return kstrdup("<no_memory>", gfp);
+
+	pathname = file_path(file, temp, PATH_MAX + 11);
+	if (IS_ERR(pathname))
+		pathname = kstrdup("<too_long>", gfp);
+	else
+		pathname = kstrdup_quotable(pathname, gfp);
+
+	kfree(temp);
+	return pathname;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_file);
-- 
cgit v1.2.3


From 1284ab5b2dcb927d38e4f3fbc2e307f3d1af9262 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Wed, 20 Apr 2016 15:46:27 -0700
Subject: fs: define a string representation of the kernel_read_file_id
 enumeration

A string representation of the kernel_read_file_id enumeration is
needed for displaying messages (eg. pr_info, auditing) that can be
used by multiple LSMs and the integrity subsystem.  To simplify
keeping the list of strings up to date with the enumeration, this
patch defines two new preprocessing macros named __fid_enumify and
__fid_stringify to create the enumeration and an array of strings.
kernel_read_file_id_str() returns a string based on the enumeration.

Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
[kees: removed removal of my old version, constified pointer values]
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/fs.h | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..90477550b935 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2580,15 +2580,34 @@ static inline void i_readcount_inc(struct inode *inode)
 #endif
 extern int do_pipe_flags(int *, int);
 
+#define __kernel_read_file_id(id) \
+	id(UNKNOWN, unknown)		\
+	id(FIRMWARE, firmware)		\
+	id(MODULE, kernel-module)		\
+	id(KEXEC_IMAGE, kexec-image)		\
+	id(KEXEC_INITRAMFS, kexec-initramfs)	\
+	id(POLICY, security-policy)		\
+	id(MAX_ID, )
+
+#define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
+#define __fid_stringify(dummy, str) #str,
+
 enum kernel_read_file_id {
-	READING_FIRMWARE = 1,
-	READING_MODULE,
-	READING_KEXEC_IMAGE,
-	READING_KEXEC_INITRAMFS,
-	READING_POLICY,
-	READING_MAX_ID
+	__kernel_read_file_id(__fid_enumify)
+};
+
+static const char * const kernel_read_file_str[] = {
+	__kernel_read_file_id(__fid_stringify)
 };
 
+static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id)
+{
+	if (id < 0 || id >= READING_MAX_ID)
+		return kernel_read_file_str[READING_UNKNOWN];
+
+	return kernel_read_file_str[id];
+}
+
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
 			    enum kernel_read_file_id);
-- 
cgit v1.2.3


From 9b091556a073a9f5f93e2ad23d118f45c4796a84 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 20 Apr 2016 15:46:28 -0700
Subject: LSM: LoadPin for kernel file loading restrictions

This LSM enforces that kernel-loaded files (modules, firmware, etc)
must all come from the same filesystem, with the expectation that
such a filesystem is backed by a read-only device such as dm-verity
or CDROM. This allows systems that have a verified and/or unchangeable
filesystem to enforce module and firmware loading restrictions without
needing to sign the files individually.

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 Documentation/security/LoadPin.txt |  17 ++++
 MAINTAINERS                        |   6 ++
 include/linux/lsm_hooks.h          |   5 +
 security/Kconfig                   |   1 +
 security/Makefile                  |   2 +
 security/loadpin/Kconfig           |  10 ++
 security/loadpin/Makefile          |   1 +
 security/loadpin/loadpin.c         | 190 +++++++++++++++++++++++++++++++++++++
 security/security.c                |   1 +
 9 files changed, 233 insertions(+)
 create mode 100644 Documentation/security/LoadPin.txt
 create mode 100644 security/loadpin/Kconfig
 create mode 100644 security/loadpin/Makefile
 create mode 100644 security/loadpin/loadpin.c

(limited to 'include/linux')

diff --git a/Documentation/security/LoadPin.txt b/Documentation/security/LoadPin.txt
new file mode 100644
index 000000000000..e11877f5d3d4
--- /dev/null
+++ b/Documentation/security/LoadPin.txt
@@ -0,0 +1,17 @@
+LoadPin is a Linux Security Module that ensures all kernel-loaded files
+(modules, firmware, etc) all originate from the same filesystem, with
+the expectation that such a filesystem is backed by a read-only device
+such as dm-verity or CDROM. This allows systems that have a verified
+and/or unchangeable filesystem to enforce module and firmware loading
+restrictions without needing to sign the files individually.
+
+The LSM is selectable at build-time with CONFIG_SECURITY_LOADPIN, and
+can be controlled at boot-time with the kernel command line option
+"loadpin.enabled". By default, it is enabled, but can be disabled at
+boot ("loadpin.enabled=0").
+
+LoadPin starts pinning when it sees the first file loaded. If the
+block device backing the filesystem is not read-only, a sysctl is
+created to toggle pinning: /proc/sys/kernel/loadpin/enabled. (Having
+a mutable filesystem means pinning is mutable too, but having the
+sysctl allows for easy testing on systems with a mutable filesystem.)
diff --git a/MAINTAINERS b/MAINTAINERS
index 1c32f8a3d6c4..b4b1e8179018 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9962,6 +9962,12 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
 S:	Supported
 F:	security/apparmor/
 
+LOADPIN SECURITY MODULE
+M:	Kees Cook <keescook@chromium.org>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/loadpin
+S:	Supported
+F:	security/loadpin/
+
 YAMA SECURITY MODULE
 M:	Kees Cook <keescook@chromium.org>
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index ae2537886177..6e466fc0666c 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1892,5 +1892,10 @@ extern void __init yama_add_hooks(void);
 #else
 static inline void __init yama_add_hooks(void) { }
 #endif
+#ifdef CONFIG_SECURITY_LOADPIN
+void __init loadpin_add_hooks(void);
+#else
+static inline void loadpin_add_hooks(void) { };
+#endif
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/security/Kconfig b/security/Kconfig
index e45237897b43..176758cdfa57 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -122,6 +122,7 @@ source security/selinux/Kconfig
 source security/smack/Kconfig
 source security/tomoyo/Kconfig
 source security/apparmor/Kconfig
+source security/loadpin/Kconfig
 source security/yama/Kconfig
 
 source security/integrity/Kconfig
diff --git a/security/Makefile b/security/Makefile
index c9bfbc84ff50..f2d71cdb8e19 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -8,6 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK)		+= smack
 subdir-$(CONFIG_SECURITY_TOMOYO)        += tomoyo
 subdir-$(CONFIG_SECURITY_APPARMOR)	+= apparmor
 subdir-$(CONFIG_SECURITY_YAMA)		+= yama
+subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -22,6 +23,7 @@ obj-$(CONFIG_AUDIT)			+= lsm_audit.o
 obj-$(CONFIG_SECURITY_TOMOYO)		+= tomoyo/
 obj-$(CONFIG_SECURITY_APPARMOR)		+= apparmor/
 obj-$(CONFIG_SECURITY_YAMA)		+= yama/
+obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
 
 # Object integrity file lists
diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig
new file mode 100644
index 000000000000..c668ac4eda65
--- /dev/null
+++ b/security/loadpin/Kconfig
@@ -0,0 +1,10 @@
+config SECURITY_LOADPIN
+	bool "Pin load of kernel files (modules, fw, etc) to one filesystem"
+	depends on SECURITY && BLOCK
+	help
+	  Any files read through the kernel file reading interface
+	  (kernel modules, firmware, kexec images, security policy) will
+	  be pinned to the first filesystem used for loading. Any files
+	  that come from other filesystems will be rejected. This is best
+	  used on systems without an initrd that have a root filesystem
+	  backed by a read-only device such as dm-verity or a CDROM.
diff --git a/security/loadpin/Makefile b/security/loadpin/Makefile
new file mode 100644
index 000000000000..c2d77f83037b
--- /dev/null
+++ b/security/loadpin/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SECURITY_LOADPIN) += loadpin.o
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
new file mode 100644
index 000000000000..e4debae3c4d6
--- /dev/null
+++ b/security/loadpin/loadpin.c
@@ -0,0 +1,190 @@
+/*
+ * Module and Firmware Pinning Security Module
+ *
+ * Copyright 2011-2016 Google Inc.
+ *
+ * Author: Kees Cook <keescook@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "LoadPin: " fmt
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/lsm_hooks.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sched.h>	/* current */
+#include <linux/string_helpers.h>
+
+static void report_load(const char *origin, struct file *file, char *operation)
+{
+	char *cmdline, *pathname;
+
+	pathname = kstrdup_quotable_file(file, GFP_KERNEL);
+	cmdline = kstrdup_quotable_cmdline(current, GFP_KERNEL);
+
+	pr_notice("%s %s obj=%s%s%s pid=%d cmdline=%s%s%s\n",
+		  origin, operation,
+		  (pathname && pathname[0] != '<') ? "\"" : "",
+		  pathname,
+		  (pathname && pathname[0] != '<') ? "\"" : "",
+		  task_pid_nr(current),
+		  cmdline ? "\"" : "", cmdline, cmdline ? "\"" : "");
+
+	kfree(cmdline);
+	kfree(pathname);
+}
+
+static int enabled = 1;
+static struct super_block *pinned_root;
+static DEFINE_SPINLOCK(pinned_root_spinlock);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int one = 1;
+
+static struct ctl_path loadpin_sysctl_path[] = {
+	{ .procname = "kernel", },
+	{ .procname = "loadpin", },
+	{ }
+};
+
+static struct ctl_table loadpin_sysctl_table[] = {
+	{
+		.procname       = "enabled",
+		.data           = &enabled,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = &zero,
+		.extra2         = &one,
+	},
+	{ }
+};
+
+/*
+ * This must be called after early kernel init, since then the rootdev
+ * is available.
+ */
+static void check_pinning_enforcement(struct super_block *mnt_sb)
+{
+	bool ro = false;
+
+	/*
+	 * If load pinning is not enforced via a read-only block
+	 * device, allow sysctl to change modes for testing.
+	 */
+	if (mnt_sb->s_bdev) {
+		ro = bdev_read_only(mnt_sb->s_bdev);
+		pr_info("dev(%u,%u): %s\n",
+			MAJOR(mnt_sb->s_bdev->bd_dev),
+			MINOR(mnt_sb->s_bdev->bd_dev),
+			ro ? "read-only" : "writable");
+	} else
+		pr_info("mnt_sb lacks block device, treating as: writable\n");
+
+	if (!ro) {
+		if (!register_sysctl_paths(loadpin_sysctl_path,
+					   loadpin_sysctl_table))
+			pr_notice("sysctl registration failed!\n");
+		else
+			pr_info("load pinning can be disabled.\n");
+	} else
+		pr_info("load pinning engaged.\n");
+}
+#else
+static void check_pinning_enforcement(struct super_block *mnt_sb)
+{
+	pr_info("load pinning engaged.\n");
+}
+#endif
+
+static void loadpin_sb_free_security(struct super_block *mnt_sb)
+{
+	/*
+	 * When unmounting the filesystem we were using for load
+	 * pinning, we acknowledge the superblock release, but make sure
+	 * no other modules or firmware can be loaded.
+	 */
+	if (!IS_ERR_OR_NULL(pinned_root) && mnt_sb == pinned_root) {
+		pinned_root = ERR_PTR(-EIO);
+		pr_info("umount pinned fs: refusing further loads\n");
+	}
+}
+
+static int loadpin_read_file(struct file *file, enum kernel_read_file_id id)
+{
+	struct super_block *load_root;
+	const char *origin = kernel_read_file_id_str(id);
+
+	/* This handles the older init_module API that has a NULL file. */
+	if (!file) {
+		if (!enabled) {
+			report_load(origin, NULL, "old-api-pinning-ignored");
+			return 0;
+		}
+
+		report_load(origin, NULL, "old-api-denied");
+		return -EPERM;
+	}
+
+	load_root = file->f_path.mnt->mnt_sb;
+
+	/* First loaded module/firmware defines the root for all others. */
+	spin_lock(&pinned_root_spinlock);
+	/*
+	 * pinned_root is only NULL at startup. Otherwise, it is either
+	 * a valid reference, or an ERR_PTR.
+	 */
+	if (!pinned_root) {
+		pinned_root = load_root;
+		/*
+		 * Unlock now since it's only pinned_root we care about.
+		 * In the worst case, we will (correctly) report pinning
+		 * failures before we have announced that pinning is
+		 * enabled. This would be purely cosmetic.
+		 */
+		spin_unlock(&pinned_root_spinlock);
+		check_pinning_enforcement(pinned_root);
+		report_load(origin, file, "pinned");
+	} else {
+		spin_unlock(&pinned_root_spinlock);
+	}
+
+	if (IS_ERR_OR_NULL(pinned_root) || load_root != pinned_root) {
+		if (unlikely(!enabled)) {
+			report_load(origin, file, "pinning-ignored");
+			return 0;
+		}
+
+		report_load(origin, file, "denied");
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static struct security_hook_list loadpin_hooks[] = {
+	LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security),
+	LSM_HOOK_INIT(kernel_read_file, loadpin_read_file),
+};
+
+void __init loadpin_add_hooks(void)
+{
+	pr_info("ready to pin (currently %sabled)", enabled ? "en" : "dis");
+	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks));
+}
+
+/* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
+module_param(enabled, int, 0);
+MODULE_PARM_DESC(enabled, "Pin module/firmware loading (default: true)");
diff --git a/security/security.c b/security/security.c
index 554c3fb7d4a5..e42860899f23 100644
--- a/security/security.c
+++ b/security/security.c
@@ -60,6 +60,7 @@ int __init security_init(void)
 	 */
 	capability_add_hooks();
 	yama_add_hooks();
+	loadpin_add_hooks();
 
 	/*
 	 * Load all the remaining security modules.
-- 
cgit v1.2.3


From 85b67bcb7e4a23ced05e7020bf5843b9857f6881 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 18 Apr 2016 20:11:50 -0700
Subject: perf, bpf: minimize the size of perf_trace_() tracepoint handler

move trace_call_bpf() into helper function to minimize the size
of perf_trace_*() tracepoint handlers.
    text	   data	    bss	    dec	 	   hex	filename
10541679	5526646	2945024	19013349	1221ee5	vmlinux_before
10509422	5526646	2945024	18981092	121a0e4	vmlinux_after

It may seem that perf_fetch_caller_regs() can also be moved,
but that is incorrect, since ip/sp will be wrong.

bpf+tracepoint performance is not affected, since
perf_swevent_put_recursion_context() is now inlined.
export_symbol_gpl can also be dropped.

No measurable change in normal perf tracepoints.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/trace_events.h |  5 +++++
 include/trace/perf.h         | 13 +++----------
 kernel/events/core.c         | 20 +++++++++++++++++++-
 3 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fe6441203b59..222f6aa0418f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -609,6 +609,11 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task);
+
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
diff --git a/include/trace/perf.h b/include/trace/perf.h
index a182306eefd7..88de5c205e86 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -64,16 +64,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	if (prog) {							\
-		*(struct pt_regs **)entry = __regs;			\
-		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
-			perf_swevent_put_recursion_context(rctx);	\
-			return;						\
-		}							\
-	}								\
-	perf_trace_buf_submit(entry, __entry_size, rctx,		\
-			      event_call->event.type, __count, __regs,	\
-			      head, __task);				\
+	perf_trace_run_bpf_submit(entry, __entry_size, rctx,		\
+				  event_call, __count, __regs,		\
+				  head, __task);			\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5056abffef27..9eb23dc27462 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6741,7 +6741,6 @@ void perf_swevent_put_recursion_context(int rctx)
 
 	put_recursion_context(swhash->recursion, rctx);
 }
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -6998,6 +6997,25 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task)
+{
+	struct bpf_prog *prog = call->prog;
+
+	if (prog) {
+		*(struct pt_regs **)raw_data = regs;
+		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+			perf_swevent_put_recursion_context(rctx);
+			return;
+		}
+	}
+	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
+		      rctx, task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
-- 
cgit v1.2.3


From b1c20f0b97b4e565fa50cde1e6b44c2fd327a1e0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Tue, 19 Apr 2016 14:02:19 -0400
Subject: netdev_features: Fold NETIF_F_ALL_TSO into NETIF_F_GSO_SOFTWARE

This patch folds NETIF_F_ALL_TSO into the bitmask for NETIF_F_GSO_SOFTWARE.
The idea is to avoid duplication of defines since the only difference
between the two was the GSO_UDP bit.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 15eb0b12fff9..bc8736266749 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -152,11 +152,6 @@ enum {
 #define NETIF_F_GSO_MASK	(__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \
 		__NETIF_F_BIT(NETIF_F_GSO_SHIFT))
 
-/* List of features with software fallbacks. */
-#define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
-				 NETIF_F_TSO_MANGLEID | \
-				 NETIF_F_TSO6 | NETIF_F_UFO)
-
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
  * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set--
  * this would be contradictory
@@ -170,6 +165,9 @@ enum {
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
 
+/* List of features with software fallbacks. */
+#define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_UFO)
+
 /*
  * If one device supports one of these features, then enable them
  * for all in netdev_increment_features.
-- 
cgit v1.2.3


From 237cd218099ce96edf2890a49aa191b38b84c2fc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 20 Apr 2016 22:02:09 +0300
Subject: net/mlx5: Introduce device queue counters

A queue counter can collect several statistics for one or more
hardware queues (QPs, RQs, etc ..) that the counter is attached to.

For Ethernet it will provide an "out of buffer" counter which
collects the number of all packets that are dropped due to lack
of software buffers.

Here we add device commands to alloc/query/dealloc queue counters.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 68 ++++++++++++++++++++++++++++
 include/linux/mlx5/qp.h                      |  6 +++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index def289375ecb..b720a274220d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -538,3 +538,71 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 	mlx5_core_destroy_sq(dev, sq->qpn);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
+
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*counter_id = MLX5_GET(alloc_q_counter_out, out,
+				       counter_set_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter);
+
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter);
+
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, clear, reset);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_size);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter);
+
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+	void *out;
+	int err;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_q_counter(dev, counter_id, 0, out, outlen);
+	if (!err)
+		*out_of_buffer = MLX5_GET(query_q_counter_out, out,
+					  out_of_buffer);
+
+	kfree(out);
+	return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index cf031a3f16c5..64221027bf1f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -668,6 +668,12 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *sq);
 void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 				  struct mlx5_core_qp *sq);
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id);
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id);
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size);
+int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
+				  u32 *out_of_buffer);
 
 static inline const char *mlx5_qp_type_str(int type)
 {
-- 
cgit v1.2.3


From 461017cb006aa1b39b0f647ae0ee2d9d84eef05b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 20 Apr 2016 22:02:13 +0300
Subject: net/mlx5e: Support RX multi-packet WQE (Striding RQ)

Introduce the feature of multi-packet WQE (RX Work Queue Element)
referred to as (MPWQE or Striding RQ), in which WQEs are larger
and serve multiple packets each.

Every WQE consists of many strides of the same size, every received
packet is aligned to a beginning of a stride and is written to
consecutive strides within a WQE.

In the regular approach, each regular WQE is big enough to be capable
of serving one received packet of any size up to MTU or 64K in case of
device LRO is enabled, making it very wasteful when dealing with
small packets or device LRO is enabled.

For its flexibility, MPWQE allows a better memory utilization
(implying improvements in CPU utilization and packet rate) as packets
consume strides according to their size, preserving the rest of
the WQE to be available for other packets.

MPWQE default configuration:
	Num of WQEs	= 16
	Strides Per WQE = 2048
	Stride Size	= 64 byte

The default WQEs memory footprint went from 1024*mtu (~1.5MB) to
16 * 2048 * 64 = 2MB per ring.
However, HW LRO can now be supported at no additional cost in memory
footprint, and hence we turn it on by default and get an even better
performance.

Performance tested on ConnectX4-Lx 50G.
To isolate the feature under test, the numbers below were measured with
HW LRO turned off. We verified that the performance just improves when
LRO is turned back on.

* Netperf single TCP stream:
- BW raised by 10-15% for representative packet sizes:
  default, 64B, 1024B, 1478B, 65536B.

* Netperf multi TCP stream:
- No degradation, line rate reached.

* Pktgen: packet rate raised by 2-10% for traffic of different message
sizes: 64B, 128B, 256B, 1024B, and 1500B.

* Pktgen: packet loss in bursts of small messages (64byte),
single stream:
- | num packets | packets loss before | packets loss after
  |     2K      |       ~ 1K          |       0
  |     8K      |       ~ 6K          |       0
  |     16K     |       ~13K          |       0
  |     32K     |       ~28K          |       0
  |     64K     |       ~57K          |     ~24K

As expected as the driver can receive as many small packets (<=64B) as
the number of total strides in the ring (default = 2048 * 16) vs. 1024
(default ring size regardless of packets size) before this feature.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  77 ++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  15 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 109 +++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 153 +++++++++++++++++++--
 include/linux/mlx5/device.h                        |  39 +++++-
 5 files changed, 349 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 61e249d8d7f8..f519148d7dcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -57,12 +57,30 @@
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xd
 
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x1
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW            0x4
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW            0x6
+
+#define MLX5_MPWRQ_LOG_NUM_STRIDES		11 /* >= 9, HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SIZE		6  /* >= 6, HW restriction */
+#define MLX5_MPWRQ_NUM_STRIDES			BIT(MLX5_MPWRQ_LOG_NUM_STRIDES)
+#define MLX5_MPWRQ_STRIDE_SIZE			BIT(MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_LOG_WQE_SZ			(MLX5_MPWRQ_LOG_NUM_STRIDES +\
+						 MLX5_MPWRQ_LOG_STRIDE_SIZE)
+#define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
+				    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
+#define MLX5_MPWRQ_PAGES_PER_WQE		BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
+#define MLX5_MPWRQ_STRIDES_PER_PAGE		(MLX5_MPWRQ_NUM_STRIDES >> \
+						 MLX5_MPWRQ_WQE_PAGE_ORDER)
+#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(128)
+
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW            0x2
 
 #define MLX5E_LOG_INDIR_RQT_SIZE       0x7
 #define MLX5E_INDIR_RQT_SIZE           BIT(MLX5E_LOG_INDIR_RQT_SIZE)
@@ -74,6 +92,38 @@
 #define MLX5E_NUM_MAIN_GROUPS 9
 #define MLX5E_NET_IP_ALIGN 2
 
+static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW,
+			     wq_size / 2);
+	default:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES,
+			     wq_size / 2);
+	}
+}
+
+static inline int mlx5_min_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
+	}
+}
+
+static inline int mlx5_max_log_rq_size(int wq_type)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW;
+	default:
+		return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	}
+}
+
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg  eth;
@@ -128,6 +178,7 @@ static const char vport_strings[][ETH_GSTRING_LEN] = {
 	"tx_queue_wake",
 	"tx_queue_dropped",
 	"rx_wqe_err",
+	"rx_mpwqe_filler",
 };
 
 struct mlx5e_vport_stats {
@@ -169,8 +220,9 @@ struct mlx5e_vport_stats {
 	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler;
 
-#define NUM_VPORT_COUNTERS     35
+#define NUM_VPORT_COUNTERS     36
 };
 
 static const char pport_strings[][ETH_GSTRING_LEN] = {
@@ -263,7 +315,8 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"csum_sw",
 	"lro_packets",
 	"lro_bytes",
-	"wqe_err"
+	"wqe_err",
+	"mpwqe_filler",
 };
 
 struct mlx5e_rq_stats {
@@ -274,7 +327,8 @@ struct mlx5e_rq_stats {
 	u64 lro_packets;
 	u64 lro_bytes;
 	u64 wqe_err;
-#define NUM_RQ_STATS 7
+	u64 mpwqe_filler;
+#define NUM_RQ_STATS 8
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
@@ -318,6 +372,7 @@ struct mlx5e_stats {
 
 struct mlx5e_params {
 	u8  log_sq_size;
+	u8  rq_wq_type;
 	u8  log_rq_size;
 	u16 num_channels;
 	u8  num_tc;
@@ -374,11 +429,23 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq *rq,
 typedef int (*mlx5e_fp_alloc_wqe)(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,
 				  u16 ix);
 
+struct mlx5e_dma_info {
+	struct page	*page;
+	dma_addr_t	addr;
+};
+
+struct mlx5e_mpw_info {
+	struct mlx5e_dma_info dma_info;
+	u16 consumed_strides;
+	u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
+};
+
 struct mlx5e_rq {
 	/* data path */
 	struct mlx5_wq_ll      wq;
 	u32                    wqe_sz;
 	struct sk_buff       **skb;
+	struct mlx5e_mpw_info *wqe_info;
 
 	struct device         *pdev;
 	struct net_device     *netdev;
@@ -393,6 +460,7 @@ struct mlx5e_rq {
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
+	u8                     wq_type;
 	u32                    rqn;
 	struct mlx5e_channel  *channel;
 	struct mlx5e_priv     *priv;
@@ -649,9 +717,12 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 6f40ba448f07..4077856aab76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -273,8 +273,9 @@ static void mlx5e_get_ringparam(struct net_device *dev,
 				struct ethtool_ringparam *param)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	int rq_wq_type = priv->params.rq_wq_type;
 
-	param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	param->rx_max_pending = 1 << mlx5_max_log_rq_size(rq_wq_type);
 	param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
 	param->rx_pending     = 1 << priv->params.log_rq_size;
 	param->tx_pending     = 1 << priv->params.log_sq_size;
@@ -285,6 +286,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	bool was_opened;
+	int rq_wq_type = priv->params.rq_wq_type;
 	u16 min_rx_wqes;
 	u8 log_rq_size;
 	u8 log_sq_size;
@@ -300,16 +302,16 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 			    __func__);
 		return -EINVAL;
 	}
-	if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending < (1 << mlx5_min_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) < min (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_min_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
-	if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) {
+	if (param->rx_pending > (1 << mlx5_max_log_rq_size(rq_wq_type))) {
 		netdev_info(dev, "%s: rx_pending (%d) > max (%d)\n",
 			    __func__, param->rx_pending,
-			    1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE);
+			    1 << mlx5_max_log_rq_size(rq_wq_type));
 		return -EINVAL;
 	}
 	if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
@@ -327,8 +329,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
 
 	log_rq_size = order_base_2(param->rx_pending);
 	log_sq_size = order_base_2(param->tx_pending);
-	min_rx_wqes = min_t(u16, param->rx_pending - 1,
-			    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES);
+	min_rx_wqes = mlx5_min_rx_wqes(rq_wq_type, param->rx_pending);
 
 	if (log_rq_size == priv->params.log_rq_size &&
 	    log_sq_size == priv->params.log_sq_size &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 23ba12c3a738..871f3af204dd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -175,6 +175,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->rx_csum_none		= 0;
 	s->rx_csum_sw		= 0;
 	s->rx_wqe_err		= 0;
+	s->rx_mpwqe_filler	= 0;
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -185,6 +186,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		s->rx_csum_none	+= rq_stats->csum_none;
 		s->rx_csum_sw	+= rq_stats->csum_sw;
 		s->rx_wqe_err   += rq_stats->wqe_err;
+		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -323,6 +325,7 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	u32 byte_count;
 	int wq_sz;
 	int err;
 	int i;
@@ -337,28 +340,47 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-	rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
-			       cpu_to_node(c->cpu));
-	if (!rq->skb) {
-		err = -ENOMEM;
-		goto err_rq_wq_destroy;
-	}
 
-	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
-					     MLX5E_SW2HW_MTU(priv->netdev->mtu);
-	rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		rq->wqe_info = kzalloc_node(wq_sz * sizeof(*rq->wqe_info),
+					    GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe_info) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe_mpwrq;
+		rq->alloc_wqe = mlx5e_alloc_rx_mpwqe;
+
+		rq->wqe_sz = MLX5_MPWRQ_NUM_STRIDES * MLX5_MPWRQ_STRIDE_SIZE;
+		byte_count = rq->wqe_sz;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
+				       cpu_to_node(c->cpu));
+		if (!rq->skb) {
+			err = -ENOMEM;
+			goto err_rq_wq_destroy;
+		}
+		rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
+		rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+
+		rq->wqe_sz = (priv->params.lro_en) ?
+				priv->params.lro_wqe_sz :
+				MLX5E_SW2HW_MTU(priv->netdev->mtu);
+		rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
+		byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+		byte_count |= MLX5_HW_START_PADDING;
+	}
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-		u32 byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
 
 		wqe->data.lkey       = c->mkey_be;
-		wqe->data.byte_count =
-			cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+		wqe->data.byte_count = cpu_to_be32(byte_count);
 	}
 
-	rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
-	rq->alloc_wqe = mlx5e_alloc_rx_wqe;
+	rq->wq_type = priv->params.rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
 	rq->tstamp  = &priv->tstamp;
@@ -376,7 +398,14 @@ err_rq_wq_destroy:
 
 static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
-	kfree(rq->skb);
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->wqe_info);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		kfree(rq->skb);
+	}
+
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
@@ -1065,7 +1094,18 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type,          MLX5_WQ_TYPE_LINKED_LIST);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		MLX5_SET(wq, wq, log_wqe_num_of_strides,
+			 MLX5_MPWRQ_LOG_NUM_STRIDES - 9);
+		MLX5_SET(wq, wq, log_wqe_stride_size,
+			 MLX5_MPWRQ_LOG_STRIDE_SIZE - 6);
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	}
+
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
@@ -1111,8 +1151,18 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 				    struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
+	u8 log_cq_size;
 
-	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_rq_size);
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		log_cq_size = priv->params.log_rq_size +
+			MLX5_MPWRQ_LOG_NUM_STRIDES;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		log_cq_size = priv->params.log_rq_size;
+	}
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
@@ -1983,7 +2033,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 	if (changes & NETIF_F_LRO) {
 		bool was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			mlx5e_close_locked(priv->netdev);
 
 		priv->params.lro_en = !!(features & NETIF_F_LRO);
@@ -1992,7 +2043,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 			mlx5_core_warn(priv->mdev, "lro modify failed, %d\n",
 				       err);
 
-		if (was_opened)
+		if (was_opened && (priv->params.rq_wq_type ==
+				   MLX5_WQ_TYPE_LINKED_LIST))
 			err = mlx5e_open_locked(priv->netdev);
 	}
 
@@ -2327,8 +2379,21 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 
 	priv->params.log_sq_size           =
 		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
-	priv->params.log_rq_size           =
-		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	priv->params.rq_wq_type = MLX5_CAP_GEN(mdev, striding_rq) ?
+		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
+		MLX5_WQ_TYPE_LINKED_LIST;
+
+	switch (priv->params.rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
+		priv->params.lro_en = true;
+		break;
+	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	}
+
+	priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type,
+					    BIT(priv->params.log_rq_size));
 	priv->params.rx_cq_moderation_usec =
 		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_pkts =
@@ -2338,8 +2403,6 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 	priv->params.tx_cq_moderation_pkts =
 		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.tx_max_inline         = mlx5e_get_max_inline_cap(mdev);
-	priv->params.min_rx_wqes           =
-		MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	priv->params.num_tc                = 1;
 	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d7cccedddf34..71f3a5d244ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -76,6 +76,41 @@ err_free_skb:
 	return -ENOMEM;
 }
 
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[ix];
+	gfp_t gfp_mask;
+	int i;
+
+	gfp_mask = GFP_ATOMIC | __GFP_COLD | __GFP_MEMALLOC;
+	wi->dma_info.page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+					     MLX5_MPWRQ_WQE_PAGE_ORDER);
+	if (unlikely(!wi->dma_info.page))
+		return -ENOMEM;
+
+	wi->dma_info.addr = dma_map_page(rq->pdev, wi->dma_info.page, 0,
+					 rq->wqe_sz, PCI_DMA_FROMDEVICE);
+	if (unlikely(dma_mapping_error(rq->pdev, wi->dma_info.addr))) {
+		put_page(wi->dma_info.page);
+		return -ENOMEM;
+	}
+
+	/* We split the high-order page into order-0 ones and manage their
+	 * reference counter to minimize the memory held by small skb fragments
+	 */
+	split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+		atomic_add(MLX5_MPWRQ_STRIDES_PER_PAGE,
+			   &wi->dma_info.page[i]._count);
+		wi->skbs_frags[i] = 0;
+	}
+
+	wi->consumed_strides = 0;
+	wqe->data.addr       = cpu_to_be64(wi->dma_info.addr);
+
+	return 0;
+}
+
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->wq;
@@ -100,7 +135,8 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	return !mlx5_wq_ll_is_full(wq);
 }
 
-static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
+static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
+				 u32 cqe_bcnt)
 {
 	struct ethhdr	*eth	= (struct ethhdr *)(skb->data);
 	struct iphdr	*ipv4	= (struct iphdr *)(skb->data + ETH_HLEN);
@@ -111,7 +147,7 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe)
 	int tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA  == l4_hdr_type) ||
 		       (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type));
 
-	u16 tot_len = be32_to_cpu(cqe->byte_cnt) - ETH_HLEN;
+	u16 tot_len = cqe_bcnt - ETH_HLEN;
 
 	if (eth->h_proto == htons(ETH_P_IP)) {
 		tcp = (struct tcphdr *)(skb->data + ETH_HLEN +
@@ -191,19 +227,17 @@ csum_none:
 }
 
 static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
+				      u32 cqe_bcnt,
 				      struct mlx5e_rq *rq,
 				      struct sk_buff *skb)
 {
 	struct net_device *netdev = rq->netdev;
-	u32 cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	struct mlx5e_tstamp *tstamp = rq->tstamp;
 	int lro_num_seg;
 
-	skb_put(skb, cqe_bcnt);
-
 	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
-		mlx5e_lro_update_hdr(skb, cqe);
+		mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
 		skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
 		rq->stats.lro_packets++;
 		rq->stats.lro_bytes += cqe_bcnt;
@@ -228,12 +262,24 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
 }
 
+static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
+					 struct mlx5_cqe64 *cqe,
+					 u32 cqe_bcnt,
+					 struct sk_buff *skb)
+{
+	rq->stats.packets++;
+	rq->stats.bytes += cqe_bcnt;
+	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
 	__be16 wqe_counter_be;
 	u16 wqe_counter;
+	u32 cqe_bcnt;
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
@@ -253,16 +299,103 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto wq_ll_pop;
 	}
 
-	mlx5e_build_rx_skb(cqe, rq, skb);
-	rq->stats.packets++;
-	rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
-	napi_gro_receive(rq->cq.napi, skb);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+	skb_put(skb, cqe_bcnt);
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
+	struct mlx5e_mpw_info *wi = &rq->wqe_info[wqe_id];
+	struct mlx5e_rx_wqe  *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	struct sk_buff *skb;
+	u32 consumed_bytes;
+	u32 head_offset;
+	u32 frag_offset;
+	u32 wqe_offset;
+	u32 page_idx;
+	u16 byte_cnt;
+	u16 cqe_bcnt;
+	u16 headlen;
+	int i;
+
+	wi->consumed_strides += cstrides;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats.wqe_err++;
+		goto mpwrq_cqe_out;
+	}
+
+	if (unlikely(mpwrq_is_filler_cqe(cqe))) {
+		rq->stats.mpwqe_filler++;
+		goto mpwrq_cqe_out;
+	}
+
+	skb = netdev_alloc_skb(rq->netdev,
+			       ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
+				     sizeof(long)));
+	if (unlikely(!skb))
+		goto mpwrq_cqe_out;
+
+	prefetch(skb->data);
+	wqe_offset = stride_ix * MLX5_MPWRQ_STRIDE_SIZE;
+	consumed_bytes = cstrides * MLX5_MPWRQ_STRIDE_SIZE;
+	dma_sync_single_for_cpu(rq->pdev, wi->dma_info.addr + wqe_offset,
+				consumed_bytes, DMA_FROM_DEVICE);
+
+	head_offset    = wqe_offset & (PAGE_SIZE - 1);
+	page_idx       = wqe_offset >> PAGE_SHIFT;
+	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
+	headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+	frag_offset = head_offset + headlen;
+
+	byte_cnt = cqe_bcnt - headlen;
+	while (byte_cnt) {
+		u32 pg_consumed_bytes =
+			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize =
+			ALIGN(pg_consumed_bytes, MLX5_MPWRQ_STRIDE_SIZE);
+
+		wi->skbs_frags[page_idx]++;
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+				&wi->dma_info.page[page_idx], frag_offset,
+				pg_consumed_bytes, truesize);
+		byte_cnt -= pg_consumed_bytes;
+		frag_offset = 0;
+		page_idx++;
+	}
+
+	skb_copy_to_linear_data(skb,
+				page_address(wi->dma_info.page) + wqe_offset,
+				ALIGN(headlen, sizeof(long)));
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+
+mpwrq_cqe_out:
+	if (likely(wi->consumed_strides < MLX5_MPWRQ_NUM_STRIDES))
+		return;
+
+	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
+		       PCI_DMA_FROMDEVICE);
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+		atomic_sub(MLX5_MPWRQ_STRIDES_PER_PAGE - wi->skbs_frags[i],
+			   &wi->dma_info.page[i]._count);
+		put_page(&wi->dma_info.page[i]);
+	}
+	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+}
+
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8156e3c9239c..03f8d719b680 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -644,7 +644,8 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8		rsvd0[4];
+	u8              rsvd0[2];
+	__be16          wqe_id;
 	u8		lro_tcppsh_abort_dupack;
 	u8		lro_min_ttl;
 	__be16		lro_tcp_win;
@@ -696,6 +697,42 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
 	return (u64)lo | ((u64)hi << 32);
 }
 
+struct mpwrq_cqe_bc {
+	__be16	filler_consumed_strides;
+	__be16	byte_cnt;
+};
+
+static inline u16 mpwrq_get_cqe_byte_cnt(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return be16_to_cpu(bc->byte_cnt);
+}
+
+static inline u16 mpwrq_get_cqe_bc_consumed_strides(struct mpwrq_cqe_bc *bc)
+{
+	return 0x7fff & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_consumed_strides(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return mpwrq_get_cqe_bc_consumed_strides(bc);
+}
+
+static inline bool mpwrq_is_filler_cqe(struct mlx5_cqe64 *cqe)
+{
+	struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
+
+	return 0x8000 & be16_to_cpu(bc->filler_consumed_strides);
+}
+
+static inline u16 mpwrq_get_cqe_stride_index(struct mlx5_cqe64 *cqe)
+{
+	return be16_to_cpu(cqe->wqe_counter);
+}
+
 enum {
 	CQE_L4_HDR_TYPE_NONE			= 0x0,
 	CQE_L4_HDR_TYPE_TCP_NO_ACK		= 0x1,
-- 
cgit v1.2.3


From b7aade15485a660cbf5161962c284131324a9534 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Apr 2016 21:19:47 +0200
Subject: vxlan: break dependency with netdev drivers

Currently all drivers depend and autoload the vxlan module because how
vxlan_get_rx_port is linked into them. Remove this dependency:

By using a new event type in the netdevice notifier call chain we proxy
the request from the drivers to flush and resetup the vxlan ports not
directly via function call but by the already existing netdevice
notifier call chain.

I added a separate new event type, NETDEV_OFFLOAD_PUSH_VXLAN, to do so.
We don't need to save those ids, as the event type field is an unsigned
long and using specialized event types for this purpose seemed to be a
more elegant way. This also comes in beneficial if in future we want to
add offloading knobs for vxlan.

Cc: Jesse Gross <jesse@kernel.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 14 +++++++++-----
 include/linux/netdevice.h |  1 +
 include/net/vxlan.h       |  6 ++----
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c2e22c2532a1..6fb93b57a724 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2527,7 +2527,7 @@ static struct device_type vxlan_type = {
  * supply the listening VXLAN udp ports. Callers are expected
  * to implement the ndo_add_vxlan_port.
  */
-void vxlan_get_rx_port(struct net_device *dev)
+static void vxlan_push_rx_ports(struct net_device *dev)
 {
 	struct vxlan_sock *vs;
 	struct net *net = dev_net(dev);
@@ -2536,6 +2536,9 @@ void vxlan_get_rx_port(struct net_device *dev)
 	__be16 port;
 	unsigned int i;
 
+	if (!dev->netdev_ops->ndo_add_vxlan_port)
+		return;
+
 	spin_lock(&vn->sock_lock);
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
 		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
@@ -2547,7 +2550,6 @@ void vxlan_get_rx_port(struct net_device *dev)
 	}
 	spin_unlock(&vn->sock_lock);
 }
-EXPORT_SYMBOL_GPL(vxlan_get_rx_port);
 
 /* Initialize the device structure. */
 static void vxlan_setup(struct net_device *dev)
@@ -3283,20 +3285,22 @@ static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
 	unregister_netdevice_many(&list_kill);
 }
 
-static int vxlan_lowerdev_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
+static int vxlan_netdevice_event(struct notifier_block *unused,
+				 unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 
 	if (event == NETDEV_UNREGISTER)
 		vxlan_handle_lowerdev_unregister(vn, dev);
+	else if (event == NETDEV_OFFLOAD_PUSH_VXLAN)
+		vxlan_push_rx_ports(dev);
 
 	return NOTIFY_DONE;
 }
 
 static struct notifier_block vxlan_notifier_block __read_mostly = {
-	.notifier_call = vxlan_lowerdev_event,
+	.notifier_call = vxlan_netdevice_event,
 };
 
 static __net_init int vxlan_init_net(struct net *net)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3bb534576a3..d4c8cd424f8d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2244,6 +2244,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_BONDING_INFO	0x0019
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
+#define NETDEV_OFFLOAD_PUSH_VXLAN	0x001C
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index d442eb3129cd..673e9f9e6da7 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -390,13 +390,11 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset)
 	return vni_field;
 }
 
-#if IS_ENABLED(CONFIG_VXLAN)
-void vxlan_get_rx_port(struct net_device *netdev);
-#else
 static inline void vxlan_get_rx_port(struct net_device *netdev)
 {
+	ASSERT_RTNL();
+	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev);
 }
-#endif
 
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 {
-- 
cgit v1.2.3


From 681e683ff30ada19f73c17c38a528528dd8824f1 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Apr 2016 21:19:48 +0200
Subject: geneve: break dependency with netdev drivers

Equivalent to "vxlan: break dependency with netdev drivers", don't
autoload geneve module in case the driver is loaded. Instead make the
coupling weaker by using netdevice notifiers as proxy.

Cc: Jesse Gross <jesse@kernel.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      | 31 ++++++++++++++++++++++++++++---
 include/linux/netdevice.h |  1 +
 include/net/geneve.h      |  6 ++----
 3 files changed, 31 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 512dbe013713..9c40b88fabd5 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1172,7 +1172,7 @@ static struct device_type geneve_type = {
  * supply the listening GENEVE udp ports. Callers are expected
  * to implement the ndo_add_geneve_port.
  */
-void geneve_get_rx_port(struct net_device *dev)
+static void geneve_push_rx_ports(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
@@ -1181,6 +1181,9 @@ void geneve_get_rx_port(struct net_device *dev)
 	struct sock *sk;
 	__be16 port;
 
+	if (!dev->netdev_ops->ndo_add_geneve_port)
+		return;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(gs, &gn->sock_list, list) {
 		sk = gs->sock->sk;
@@ -1190,7 +1193,6 @@ void geneve_get_rx_port(struct net_device *dev)
 	}
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL_GPL(geneve_get_rx_port);
 
 /* Initialize the device structure. */
 static void geneve_setup(struct net_device *dev)
@@ -1538,6 +1540,21 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 }
 EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
 
+static int geneve_netdevice_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_OFFLOAD_PUSH_GENEVE)
+		geneve_push_rx_ports(dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block geneve_notifier_block __read_mostly = {
+	.notifier_call = geneve_netdevice_event,
+};
+
 static __net_init int geneve_init_net(struct net *net)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
@@ -1590,11 +1607,18 @@ static int __init geneve_init_module(void)
 	if (rc)
 		goto out1;
 
-	rc = rtnl_link_register(&geneve_link_ops);
+	rc = register_netdevice_notifier(&geneve_notifier_block);
 	if (rc)
 		goto out2;
 
+	rc = rtnl_link_register(&geneve_link_ops);
+	if (rc)
+		goto out3;
+
 	return 0;
+
+out3:
+	unregister_netdevice_notifier(&geneve_notifier_block);
 out2:
 	unregister_pernet_subsys(&geneve_net_ops);
 out1:
@@ -1605,6 +1629,7 @@ late_initcall(geneve_init_module);
 static void __exit geneve_cleanup_module(void)
 {
 	rtnl_link_unregister(&geneve_link_ops);
+	unregister_netdevice_notifier(&geneve_notifier_block);
 	unregister_pernet_subsys(&geneve_net_ops);
 }
 module_exit(geneve_cleanup_module);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4c8cd424f8d..1f6d5db471a2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2245,6 +2245,7 @@ struct netdev_lag_lower_state_info {
 #define NETDEV_PRECHANGEUPPER	0x001A
 #define NETDEV_CHANGELOWERSTATE	0x001B
 #define NETDEV_OFFLOAD_PUSH_VXLAN	0x001C
+#define NETDEV_OFFLOAD_PUSH_GENEVE	0x001D
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/include/net/geneve.h b/include/net/geneve.h
index e6c23dc765f7..cb544a530146 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -62,13 +62,11 @@ struct genevehdr {
 	struct geneve_opt options[];
 };
 
-#if IS_ENABLED(CONFIG_GENEVE)
-void geneve_get_rx_port(struct net_device *netdev);
-#else
 static inline void geneve_get_rx_port(struct net_device *netdev)
 {
+	ASSERT_RTNL();
+	call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_GENEVE, netdev);
 }
-#endif
 
 #ifdef CONFIG_INET
 struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
-- 
cgit v1.2.3


From 92a39d9043ba5ff98adb1c31491f00c7bea5466e Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Wed, 23 Mar 2016 17:38:24 +0100
Subject: clk: composite: Add unregister function

The composite clock didn't have any unregistration function, which forced
us to use clk_unregister directly on it.

While it was already not great from an API point of view, it also meant
that we were leaking the clk_composite structure allocated in
clk_register_composite.

Add a clk_unregister_composite function to fix this.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-composite.c  | 15 +++++++++++++++
 include/linux/clk-provider.h |  1 +
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
index 1f903e1f86a2..b0f3b84ebd13 100644
--- a/drivers/clk/clk-composite.c
+++ b/drivers/clk/clk-composite.c
@@ -286,3 +286,18 @@ err:
 	kfree(composite);
 	return clk;
 }
+
+void clk_unregister_composite(struct clk *clk)
+{
+	struct clk_composite *composite;
+	struct clk_hw *hw;
+
+	hw = __clk_get_hw(clk);
+	if (!hw)
+		return;
+
+	composite = to_clk_composite(hw);
+
+	clk_unregister(clk);
+	kfree(composite);
+}
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index da95258127aa..26a8c9b7be71 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -603,6 +603,7 @@ struct clk *clk_register_composite(struct device *dev, const char *name,
 		struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
 		struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
 		unsigned long flags);
+void clk_unregister_composite(struct clk *clk);
 
 /***
  * struct clk_gpio_gate - gpio gated clock
-- 
cgit v1.2.3


From be32bcbbd18213803ff24e480340d4d2cca1df4f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 20 Apr 2016 14:02:36 +0200
Subject: soc: renesas: Move pm-rcar to drivers/soc/renesas/rcar-sysc

Move the pm-rcar driver from arch/arm/mach-shmobile/ to
drivers/soc/renesas/, and its header file to include/linux/soc/renesas/,
so it can be shared between arm32 (R-Car H1 and Gen2) and arm64 (R-Car
Gen3). Rename it to rcar-sysc as it's really a driver for the R-Car
System Controller (SYSC).

Kill the intermediate PM_RCAR config symbol, as it's not user
configurable anymore, and to prepare for SoC-specific make rules.

Add the missing #include <linux/types.h> to rcar-sysc.h, which was
exposed by different include order.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 MAINTAINERS                           |   4 +
 arch/arm/mach-shmobile/Kconfig        |  11 +--
 arch/arm/mach-shmobile/Makefile       |   1 -
 arch/arm/mach-shmobile/pm-r8a7779.c   |   3 +-
 arch/arm/mach-shmobile/pm-rcar-gen2.c |   2 +-
 arch/arm/mach-shmobile/pm-rcar.c      | 164 ----------------------------------
 arch/arm/mach-shmobile/pm-rcar.h      |  15 ----
 arch/arm/mach-shmobile/smp-r8a7779.c  |   2 +-
 arch/arm/mach-shmobile/smp-r8a7790.c  |   2 +-
 drivers/soc/Makefile                  |   3 +-
 drivers/soc/renesas/Makefile          |   5 ++
 drivers/soc/renesas/rcar-sysc.c       | 164 ++++++++++++++++++++++++++++++++++
 include/linux/soc/renesas/rcar-sysc.h |  17 ++++
 13 files changed, 201 insertions(+), 192 deletions(-)
 delete mode 100644 arch/arm/mach-shmobile/pm-rcar.c
 delete mode 100644 arch/arm/mach-shmobile/pm-rcar.h
 create mode 100644 drivers/soc/renesas/Makefile
 create mode 100644 drivers/soc/renesas/rcar-sysc.c
 create mode 100644 include/linux/soc/renesas/rcar-sysc.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 03e00c7c88eb..a0e23922a90b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1491,6 +1491,8 @@ Q:	http://patchwork.kernel.org/project/linux-renesas-soc/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
 S:	Supported
 F:	arch/arm64/boot/dts/renesas/
+F:	drivers/soc/renesas/
+F:	include/linux/soc/renesas/
 
 ARM/RISCPC ARCHITECTURE
 M:	Russell King <linux@arm.linux.org.uk>
@@ -1604,6 +1606,8 @@ F:	arch/arm/configs/shmobile_defconfig
 F:	arch/arm/include/debug/renesas-scif.S
 F:	arch/arm/mach-shmobile/
 F:	drivers/sh/
+F:	drivers/soc/renesas/
+F:	include/linux/soc/renesas/
 
 ARM/SOCFPGA ARCHITECTURE
 M:	Dinh Nguyen <dinguyen@opensource.altera.com>
diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig
index f2bc5c353119..fe4ccb52f921 100644
--- a/arch/arm/mach-shmobile/Kconfig
+++ b/arch/arm/mach-shmobile/Kconfig
@@ -4,11 +4,6 @@ config ARCH_SHMOBILE
 config ARCH_SHMOBILE_MULTI
 	bool
 
-config PM_RCAR
-	bool
-	select PM
-	select PM_GENERIC_DOMAINS
-
 config PM_RMOBILE
 	bool
 	select PM
@@ -16,13 +11,15 @@ config PM_RMOBILE
 
 config ARCH_RCAR_GEN1
 	bool
-	select PM_RCAR
+	select PM
+	select PM_GENERIC_DOMAINS
 	select RENESAS_INTC_IRQPIN
 	select SYS_SUPPORTS_SH_TMU
 
 config ARCH_RCAR_GEN2
 	bool
-	select PM_RCAR
+	select PM
+	select PM_GENERIC_DOMAINS
 	select RENESAS_IRQC
 	select SYS_SUPPORTS_SH_CMT
 	select PCI_DOMAINS if PCI
diff --git a/arch/arm/mach-shmobile/Makefile b/arch/arm/mach-shmobile/Makefile
index a65c80ac9009..ebb909c55b85 100644
--- a/arch/arm/mach-shmobile/Makefile
+++ b/arch/arm/mach-shmobile/Makefile
@@ -39,7 +39,6 @@ smp-$(CONFIG_ARCH_EMEV2)	+= smp-emev2.o headsmp-scu.o platsmp-scu.o
 # PM objects
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq.o
-obj-$(CONFIG_PM_RCAR)		+= pm-rcar.o
 obj-$(CONFIG_PM_RMOBILE)	+= pm-rmobile.o
 obj-$(CONFIG_ARCH_RCAR_GEN2)	+= pm-rcar-gen2.o
 
diff --git a/arch/arm/mach-shmobile/pm-r8a7779.c b/arch/arm/mach-shmobile/pm-r8a7779.c
index 14c42a1bdf1e..4174cbcbc467 100644
--- a/arch/arm/mach-shmobile/pm-r8a7779.c
+++ b/arch/arm/mach-shmobile/pm-r8a7779.c
@@ -9,9 +9,10 @@
  * for more details.
  */
 
+#include <linux/soc/renesas/rcar-sysc.h>
+
 #include <asm/io.h>
 
-#include "pm-rcar.h"
 #include "r8a7779.h"
 
 /* SYSC */
diff --git a/arch/arm/mach-shmobile/pm-rcar-gen2.c b/arch/arm/mach-shmobile/pm-rcar-gen2.c
index 6815781ad116..691ac166a277 100644
--- a/arch/arm/mach-shmobile/pm-rcar-gen2.c
+++ b/arch/arm/mach-shmobile/pm-rcar-gen2.c
@@ -13,9 +13,9 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/smp.h>
+#include <linux/soc/renesas/rcar-sysc.h>
 #include <asm/io.h>
 #include "common.h"
-#include "pm-rcar.h"
 #include "rcar-gen2.h"
 
 /* RST */
diff --git a/arch/arm/mach-shmobile/pm-rcar.c b/arch/arm/mach-shmobile/pm-rcar.c
deleted file mode 100644
index 0af05d288b09..000000000000
--- a/arch/arm/mach-shmobile/pm-rcar.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * R-Car SYSC Power management support
- *
- * Copyright (C) 2014  Magnus Damm
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/io.h>
-#include "pm-rcar.h"
-
-/* SYSC Common */
-#define SYSCSR			0x00	/* SYSC Status Register */
-#define SYSCISR			0x04	/* Interrupt Status Register */
-#define SYSCISCR		0x08	/* Interrupt Status Clear Register */
-#define SYSCIER			0x0c	/* Interrupt Enable Register */
-#define SYSCIMR			0x10	/* Interrupt Mask Register */
-
-/* SYSC Status Register */
-#define SYSCSR_PONENB		1	/* Ready for power resume requests */
-#define SYSCSR_POFFENB		0	/* Ready for power shutoff requests */
-
-/*
- * Power Control Register Offsets inside the register block for each domain
- * Note: The "CR" registers for ARM cores exist on H1 only
- *       Use WFI to power off, CPG/APMU to resume ARM cores on R-Car Gen2
- */
-#define PWRSR_OFFS		0x00	/* Power Status Register */
-#define PWROFFCR_OFFS		0x04	/* Power Shutoff Control Register */
-#define PWROFFSR_OFFS		0x08	/* Power Shutoff Status Register */
-#define PWRONCR_OFFS		0x0c	/* Power Resume Control Register */
-#define PWRONSR_OFFS		0x10	/* Power Resume Status Register */
-#define PWRER_OFFS		0x14	/* Power Shutoff/Resume Error */
-
-
-#define SYSCSR_RETRIES		100
-#define SYSCSR_DELAY_US		1
-
-#define PWRER_RETRIES		100
-#define PWRER_DELAY_US		1
-
-#define SYSCISR_RETRIES		1000
-#define SYSCISR_DELAY_US	1
-
-static void __iomem *rcar_sysc_base;
-static DEFINE_SPINLOCK(rcar_sysc_lock); /* SMP CPUs + I/O devices */
-
-static int rcar_sysc_pwr_on_off(const struct rcar_sysc_ch *sysc_ch, bool on)
-{
-	unsigned int sr_bit, reg_offs;
-	int k;
-
-	if (on) {
-		sr_bit = SYSCSR_PONENB;
-		reg_offs = PWRONCR_OFFS;
-	} else {
-		sr_bit = SYSCSR_POFFENB;
-		reg_offs = PWROFFCR_OFFS;
-	}
-
-	/* Wait until SYSC is ready to accept a power request */
-	for (k = 0; k < SYSCSR_RETRIES; k++) {
-		if (ioread32(rcar_sysc_base + SYSCSR) & BIT(sr_bit))
-			break;
-		udelay(SYSCSR_DELAY_US);
-	}
-
-	if (k == SYSCSR_RETRIES)
-		return -EAGAIN;
-
-	/* Submit power shutoff or power resume request */
-	iowrite32(BIT(sysc_ch->chan_bit),
-		  rcar_sysc_base + sysc_ch->chan_offs + reg_offs);
-
-	return 0;
-}
-
-static int rcar_sysc_power(const struct rcar_sysc_ch *sysc_ch, bool on)
-{
-	unsigned int isr_mask = BIT(sysc_ch->isr_bit);
-	unsigned int chan_mask = BIT(sysc_ch->chan_bit);
-	unsigned int status;
-	unsigned long flags;
-	int ret = 0;
-	int k;
-
-	spin_lock_irqsave(&rcar_sysc_lock, flags);
-
-	iowrite32(isr_mask, rcar_sysc_base + SYSCISCR);
-
-	/* Submit power shutoff or resume request until it was accepted */
-	for (k = 0; k < PWRER_RETRIES; k++) {
-		ret = rcar_sysc_pwr_on_off(sysc_ch, on);
-		if (ret)
-			goto out;
-
-		status = ioread32(rcar_sysc_base +
-				  sysc_ch->chan_offs + PWRER_OFFS);
-		if (!(status & chan_mask))
-			break;
-
-		udelay(PWRER_DELAY_US);
-	}
-
-	if (k == PWRER_RETRIES) {
-		ret = -EIO;
-		goto out;
-	}
-
-	/* Wait until the power shutoff or resume request has completed * */
-	for (k = 0; k < SYSCISR_RETRIES; k++) {
-		if (ioread32(rcar_sysc_base + SYSCISR) & isr_mask)
-			break;
-		udelay(SYSCISR_DELAY_US);
-	}
-
-	if (k == SYSCISR_RETRIES)
-		ret = -EIO;
-
-	iowrite32(isr_mask, rcar_sysc_base + SYSCISCR);
-
- out:
-	spin_unlock_irqrestore(&rcar_sysc_lock, flags);
-
-	pr_debug("sysc power domain %d: %08x -> %d\n",
-		 sysc_ch->isr_bit, ioread32(rcar_sysc_base + SYSCISR), ret);
-	return ret;
-}
-
-int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch)
-{
-	return rcar_sysc_power(sysc_ch, false);
-}
-
-int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch)
-{
-	return rcar_sysc_power(sysc_ch, true);
-}
-
-bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch)
-{
-	unsigned int st;
-
-	st = ioread32(rcar_sysc_base + sysc_ch->chan_offs + PWRSR_OFFS);
-	if (st & BIT(sysc_ch->chan_bit))
-		return true;
-
-	return false;
-}
-
-void __iomem *rcar_sysc_init(phys_addr_t base)
-{
-	rcar_sysc_base = ioremap_nocache(base, PAGE_SIZE);
-	if (!rcar_sysc_base)
-		panic("unable to ioremap R-Car SYSC hardware block\n");
-
-	return rcar_sysc_base;
-}
diff --git a/arch/arm/mach-shmobile/pm-rcar.h b/arch/arm/mach-shmobile/pm-rcar.h
deleted file mode 100644
index 1b901db4a24c..000000000000
--- a/arch/arm/mach-shmobile/pm-rcar.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef PM_RCAR_H
-#define PM_RCAR_H
-
-struct rcar_sysc_ch {
-	u16 chan_offs;
-	u8 chan_bit;
-	u8 isr_bit;
-};
-
-int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch);
-int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch);
-bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch);
-void __iomem *rcar_sysc_init(phys_addr_t base);
-
-#endif /* PM_RCAR_H */
diff --git a/arch/arm/mach-shmobile/smp-r8a7779.c b/arch/arm/mach-shmobile/smp-r8a7779.c
index f5c31fbc10b2..c6951ee24588 100644
--- a/arch/arm/mach-shmobile/smp-r8a7779.c
+++ b/arch/arm/mach-shmobile/smp-r8a7779.c
@@ -19,13 +19,13 @@
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/delay.h>
+#include <linux/soc/renesas/rcar-sysc.h>
 
 #include <asm/cacheflush.h>
 #include <asm/smp_plat.h>
 #include <asm/smp_scu.h>
 
 #include "common.h"
-#include "pm-rcar.h"
 #include "r8a7779.h"
 
 #define AVECR IOMEM(0xfe700040)
diff --git a/arch/arm/mach-shmobile/smp-r8a7790.c b/arch/arm/mach-shmobile/smp-r8a7790.c
index f6426c6fdefc..28f26d5362d8 100644
--- a/arch/arm/mach-shmobile/smp-r8a7790.c
+++ b/arch/arm/mach-shmobile/smp-r8a7790.c
@@ -17,12 +17,12 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/io.h>
+#include <linux/soc/renesas/rcar-sysc.h>
 
 #include <asm/smp_plat.h>
 
 #include "common.h"
 #include "platsmp-apmu.h"
-#include "pm-rcar.h"
 #include "rcar-gen2.h"
 #include "r8a7790.h"
 
diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
index 5ade71306ee1..380230f03874 100644
--- a/drivers/soc/Makefile
+++ b/drivers/soc/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_MACH_DOVE)		+= dove/
 obj-y				+= fsl/
 obj-$(CONFIG_ARCH_MEDIATEK)	+= mediatek/
 obj-$(CONFIG_ARCH_QCOM)		+= qcom/
-obj-$(CONFIG_ARCH_ROCKCHIP)		+= rockchip/
+obj-$(CONFIG_ARCH_RENESAS)	+= renesas/
+obj-$(CONFIG_ARCH_ROCKCHIP)	+= rockchip/
 obj-$(CONFIG_SOC_SAMSUNG)	+= samsung/
 obj-$(CONFIG_ARCH_SUNXI)	+= sunxi/
 obj-$(CONFIG_ARCH_TEGRA)	+= tegra/
diff --git a/drivers/soc/renesas/Makefile b/drivers/soc/renesas/Makefile
new file mode 100644
index 000000000000..2b64f6c94681
--- /dev/null
+++ b/drivers/soc/renesas/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_ARCH_R8A7779)	+= rcar-sysc.o
+obj-$(CONFIG_ARCH_R8A7790)	+= rcar-sysc.o
+obj-$(CONFIG_ARCH_R8A7791)	+= rcar-sysc.o
+obj-$(CONFIG_ARCH_R8A7793)	+= rcar-sysc.o
+obj-$(CONFIG_ARCH_R8A7794)	+= rcar-sysc.o
diff --git a/drivers/soc/renesas/rcar-sysc.c b/drivers/soc/renesas/rcar-sysc.c
new file mode 100644
index 000000000000..d59bcdf78f0b
--- /dev/null
+++ b/drivers/soc/renesas/rcar-sysc.c
@@ -0,0 +1,164 @@
+/*
+ * R-Car SYSC Power management support
+ *
+ * Copyright (C) 2014  Magnus Damm
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/soc/renesas/rcar-sysc.h>
+
+/* SYSC Common */
+#define SYSCSR			0x00	/* SYSC Status Register */
+#define SYSCISR			0x04	/* Interrupt Status Register */
+#define SYSCISCR		0x08	/* Interrupt Status Clear Register */
+#define SYSCIER			0x0c	/* Interrupt Enable Register */
+#define SYSCIMR			0x10	/* Interrupt Mask Register */
+
+/* SYSC Status Register */
+#define SYSCSR_PONENB		1	/* Ready for power resume requests */
+#define SYSCSR_POFFENB		0	/* Ready for power shutoff requests */
+
+/*
+ * Power Control Register Offsets inside the register block for each domain
+ * Note: The "CR" registers for ARM cores exist on H1 only
+ *       Use WFI to power off, CPG/APMU to resume ARM cores on R-Car Gen2
+ */
+#define PWRSR_OFFS		0x00	/* Power Status Register */
+#define PWROFFCR_OFFS		0x04	/* Power Shutoff Control Register */
+#define PWROFFSR_OFFS		0x08	/* Power Shutoff Status Register */
+#define PWRONCR_OFFS		0x0c	/* Power Resume Control Register */
+#define PWRONSR_OFFS		0x10	/* Power Resume Status Register */
+#define PWRER_OFFS		0x14	/* Power Shutoff/Resume Error */
+
+
+#define SYSCSR_RETRIES		100
+#define SYSCSR_DELAY_US		1
+
+#define PWRER_RETRIES		100
+#define PWRER_DELAY_US		1
+
+#define SYSCISR_RETRIES		1000
+#define SYSCISR_DELAY_US	1
+
+static void __iomem *rcar_sysc_base;
+static DEFINE_SPINLOCK(rcar_sysc_lock); /* SMP CPUs + I/O devices */
+
+static int rcar_sysc_pwr_on_off(const struct rcar_sysc_ch *sysc_ch, bool on)
+{
+	unsigned int sr_bit, reg_offs;
+	int k;
+
+	if (on) {
+		sr_bit = SYSCSR_PONENB;
+		reg_offs = PWRONCR_OFFS;
+	} else {
+		sr_bit = SYSCSR_POFFENB;
+		reg_offs = PWROFFCR_OFFS;
+	}
+
+	/* Wait until SYSC is ready to accept a power request */
+	for (k = 0; k < SYSCSR_RETRIES; k++) {
+		if (ioread32(rcar_sysc_base + SYSCSR) & BIT(sr_bit))
+			break;
+		udelay(SYSCSR_DELAY_US);
+	}
+
+	if (k == SYSCSR_RETRIES)
+		return -EAGAIN;
+
+	/* Submit power shutoff or power resume request */
+	iowrite32(BIT(sysc_ch->chan_bit),
+		  rcar_sysc_base + sysc_ch->chan_offs + reg_offs);
+
+	return 0;
+}
+
+static int rcar_sysc_power(const struct rcar_sysc_ch *sysc_ch, bool on)
+{
+	unsigned int isr_mask = BIT(sysc_ch->isr_bit);
+	unsigned int chan_mask = BIT(sysc_ch->chan_bit);
+	unsigned int status;
+	unsigned long flags;
+	int ret = 0;
+	int k;
+
+	spin_lock_irqsave(&rcar_sysc_lock, flags);
+
+	iowrite32(isr_mask, rcar_sysc_base + SYSCISCR);
+
+	/* Submit power shutoff or resume request until it was accepted */
+	for (k = 0; k < PWRER_RETRIES; k++) {
+		ret = rcar_sysc_pwr_on_off(sysc_ch, on);
+		if (ret)
+			goto out;
+
+		status = ioread32(rcar_sysc_base +
+				  sysc_ch->chan_offs + PWRER_OFFS);
+		if (!(status & chan_mask))
+			break;
+
+		udelay(PWRER_DELAY_US);
+	}
+
+	if (k == PWRER_RETRIES) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Wait until the power shutoff or resume request has completed * */
+	for (k = 0; k < SYSCISR_RETRIES; k++) {
+		if (ioread32(rcar_sysc_base + SYSCISR) & isr_mask)
+			break;
+		udelay(SYSCISR_DELAY_US);
+	}
+
+	if (k == SYSCISR_RETRIES)
+		ret = -EIO;
+
+	iowrite32(isr_mask, rcar_sysc_base + SYSCISCR);
+
+ out:
+	spin_unlock_irqrestore(&rcar_sysc_lock, flags);
+
+	pr_debug("sysc power domain %d: %08x -> %d\n",
+		 sysc_ch->isr_bit, ioread32(rcar_sysc_base + SYSCISR), ret);
+	return ret;
+}
+
+int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch)
+{
+	return rcar_sysc_power(sysc_ch, false);
+}
+
+int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch)
+{
+	return rcar_sysc_power(sysc_ch, true);
+}
+
+bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch)
+{
+	unsigned int st;
+
+	st = ioread32(rcar_sysc_base + sysc_ch->chan_offs + PWRSR_OFFS);
+	if (st & BIT(sysc_ch->chan_bit))
+		return true;
+
+	return false;
+}
+
+void __iomem *rcar_sysc_init(phys_addr_t base)
+{
+	rcar_sysc_base = ioremap_nocache(base, PAGE_SIZE);
+	if (!rcar_sysc_base)
+		panic("unable to ioremap R-Car SYSC hardware block\n");
+
+	return rcar_sysc_base;
+}
diff --git a/include/linux/soc/renesas/rcar-sysc.h b/include/linux/soc/renesas/rcar-sysc.h
new file mode 100644
index 000000000000..96f30c288388
--- /dev/null
+++ b/include/linux/soc/renesas/rcar-sysc.h
@@ -0,0 +1,17 @@
+#ifndef __LINUX_SOC_RENESAS_RCAR_SYSC_H__
+#define __LINUX_SOC_RENESAS_RCAR_SYSC_H__
+
+#include <linux/types.h>
+
+struct rcar_sysc_ch {
+	u16 chan_offs;
+	u8 chan_bit;
+	u8 isr_bit;
+};
+
+int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch);
+int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch);
+bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch);
+void __iomem *rcar_sysc_init(phys_addr_t base);
+
+#endif /* __LINUX_SOC_RENESAS_RCAR_SYSC_H__ */
-- 
cgit v1.2.3


From 2f024cef5b064d5498fe466dfe9492d46b5b12a4 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 20 Apr 2016 14:02:39 +0200
Subject: soc: renesas: rcar-sysc: Make rcar_sysc_power_is_off() static

As of commit b12ff41658171f53 ("ARM: shmobile: r8a7779: Remove legacy PM
Domain remainings"), rcar_sysc_power_is_off() is no longer used from
SoC-specific code.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 drivers/soc/renesas/rcar-sysc.c       | 2 +-
 include/linux/soc/renesas/rcar-sysc.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/renesas/rcar-sysc.c b/drivers/soc/renesas/rcar-sysc.c
index 95f2b59cbd76..4e760d63f2ab 100644
--- a/drivers/soc/renesas/rcar-sysc.c
+++ b/drivers/soc/renesas/rcar-sysc.c
@@ -152,7 +152,7 @@ int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch)
 	return rcar_sysc_power(sysc_ch, true);
 }
 
-bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch)
+static bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch)
 {
 	unsigned int st;
 
diff --git a/include/linux/soc/renesas/rcar-sysc.h b/include/linux/soc/renesas/rcar-sysc.h
index 96f30c288388..92fc613ab23d 100644
--- a/include/linux/soc/renesas/rcar-sysc.h
+++ b/include/linux/soc/renesas/rcar-sysc.h
@@ -11,7 +11,6 @@ struct rcar_sysc_ch {
 
 int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch);
 int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch);
-bool rcar_sysc_power_is_off(const struct rcar_sysc_ch *sysc_ch);
 void __iomem *rcar_sysc_init(phys_addr_t base);
 
 #endif /* __LINUX_SOC_RENESAS_RCAR_SYSC_H__ */
-- 
cgit v1.2.3


From a7ab72390b77062420fb50e4451f71c9321aae05 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 20 Apr 2016 08:38:50 +0200
Subject: i2c: mux: add common data for every i2c-mux instance

All i2c-muxes have a parent adapter and one or many child
adapters. A mux also has some means of selection. Previously,
this was stored per child adapter, but it is only needed
to keep track of this per mux.

Add an i2c mux core, that keeps track of this consistently.

Also add some glue for users of the old interface, which will
create one implicit mux core per child adapter.

Signed-off-by: Peter Rosin <peda@axentia.se>
Tested-by: Antti Palosaari <crope@iki.fi>
Tested-by: Crestez Dan Leonard <leonard.crestez@intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-mux.c   | 175 ++++++++++++++++++++++++++++++++++++------------
 include/linux/i2c-mux.h |  34 ++++++++++
 2 files changed, 168 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
index d4022878b2f0..5ce1b0704cb5 100644
--- a/drivers/i2c/i2c-mux.c
+++ b/drivers/i2c/i2c-mux.c
@@ -28,33 +28,34 @@
 #include <linux/slab.h>
 
 /* multiplexer per channel data */
+struct i2c_mux_priv_old {
+	void *mux_priv;
+	int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
+	int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
+};
+
 struct i2c_mux_priv {
 	struct i2c_adapter adap;
 	struct i2c_algorithm algo;
-
-	struct i2c_adapter *parent;
-	struct device *mux_dev;
-	void *mux_priv;
+	struct i2c_mux_core *muxc;
 	u32 chan_id;
-
-	int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
-	int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
 };
 
 static int i2c_mux_master_xfer(struct i2c_adapter *adap,
 			       struct i2c_msg msgs[], int num)
 {
 	struct i2c_mux_priv *priv = adap->algo_data;
-	struct i2c_adapter *parent = priv->parent;
+	struct i2c_mux_core *muxc = priv->muxc;
+	struct i2c_adapter *parent = muxc->parent;
 	int ret;
 
 	/* Switch to the right mux port and perform the transfer. */
 
-	ret = priv->select(parent, priv->mux_priv, priv->chan_id);
+	ret = muxc->select(muxc, priv->chan_id);
 	if (ret >= 0)
 		ret = __i2c_transfer(parent, msgs, num);
-	if (priv->deselect)
-		priv->deselect(parent, priv->mux_priv, priv->chan_id);
+	if (muxc->deselect)
+		muxc->deselect(muxc, priv->chan_id);
 
 	return ret;
 }
@@ -65,17 +66,18 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap,
 			      int size, union i2c_smbus_data *data)
 {
 	struct i2c_mux_priv *priv = adap->algo_data;
-	struct i2c_adapter *parent = priv->parent;
+	struct i2c_mux_core *muxc = priv->muxc;
+	struct i2c_adapter *parent = muxc->parent;
 	int ret;
 
 	/* Select the right mux port and perform the transfer. */
 
-	ret = priv->select(parent, priv->mux_priv, priv->chan_id);
+	ret = muxc->select(muxc, priv->chan_id);
 	if (ret >= 0)
 		ret = parent->algo->smbus_xfer(parent, addr, flags,
 					read_write, command, size, data);
-	if (priv->deselect)
-		priv->deselect(parent, priv->mux_priv, priv->chan_id);
+	if (muxc->deselect)
+		muxc->deselect(muxc, priv->chan_id);
 
 	return ret;
 }
@@ -84,7 +86,7 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap,
 static u32 i2c_mux_functionality(struct i2c_adapter *adap)
 {
 	struct i2c_mux_priv *priv = adap->algo_data;
-	struct i2c_adapter *parent = priv->parent;
+	struct i2c_adapter *parent = priv->muxc->parent;
 
 	return parent->algo->functionality(parent);
 }
@@ -102,6 +104,20 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent)
 	return class;
 }
 
+static int i2c_mux_select(struct i2c_mux_core *muxc, u32 chan)
+{
+	struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc);
+
+	return priv->select(muxc->parent, priv->mux_priv, chan);
+}
+
+static int i2c_mux_deselect(struct i2c_mux_core *muxc, u32 chan)
+{
+	struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc);
+
+	return priv->deselect(muxc->parent, priv->mux_priv, chan);
+}
+
 struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 				struct device *mux_dev,
 				void *mux_priv, u32 force_nr, u32 chan_id,
@@ -111,21 +127,77 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 				int (*deselect) (struct i2c_adapter *,
 						 void *, u32))
 {
+	struct i2c_mux_core *muxc;
+	struct i2c_mux_priv_old *priv;
+	int ret;
+
+	muxc = i2c_mux_alloc(parent, mux_dev, 1, sizeof(*priv), 0,
+			     i2c_mux_select, i2c_mux_deselect);
+	if (!muxc)
+		return NULL;
+
+	priv = i2c_mux_priv(muxc);
+	priv->select = select;
+	priv->deselect = deselect;
+	priv->mux_priv = mux_priv;
+
+	ret = i2c_mux_add_adapter(muxc, force_nr, chan_id, class);
+	if (ret) {
+		devm_kfree(mux_dev, muxc);
+		return NULL;
+	}
+
+	return muxc->adapter[0];
+}
+EXPORT_SYMBOL_GPL(i2c_add_mux_adapter);
+
+struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
+				   struct device *dev, int max_adapters,
+				   int sizeof_priv, u32 flags,
+				   int (*select)(struct i2c_mux_core *, u32),
+				   int (*deselect)(struct i2c_mux_core *, u32))
+{
+	struct i2c_mux_core *muxc;
+
+	muxc = devm_kzalloc(dev, sizeof(*muxc)
+			    + max_adapters * sizeof(muxc->adapter[0])
+			    + sizeof_priv, GFP_KERNEL);
+	if (!muxc)
+		return NULL;
+	if (sizeof_priv)
+		muxc->priv = &muxc->adapter[max_adapters];
+
+	muxc->parent = parent;
+	muxc->dev = dev;
+	muxc->select = select;
+	muxc->deselect = deselect;
+	muxc->max_adapters = max_adapters;
+
+	return muxc;
+}
+EXPORT_SYMBOL_GPL(i2c_mux_alloc);
+
+int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
+			u32 force_nr, u32 chan_id,
+			unsigned int class)
+{
+	struct i2c_adapter *parent = muxc->parent;
 	struct i2c_mux_priv *priv;
 	char symlink_name[20];
 	int ret;
 
-	priv = kzalloc(sizeof(struct i2c_mux_priv), GFP_KERNEL);
+	if (muxc->num_adapters >= muxc->max_adapters) {
+		dev_err(muxc->dev, "No room for more i2c-mux adapters\n");
+		return -EINVAL;
+	}
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
-		return NULL;
+		return -ENOMEM;
 
 	/* Set up private adapter data */
-	priv->parent = parent;
-	priv->mux_dev = mux_dev;
-	priv->mux_priv = mux_priv;
+	priv->muxc = muxc;
 	priv->chan_id = chan_id;
-	priv->select = select;
-	priv->deselect = deselect;
 
 	/* Need to do algo dynamically because we don't know ahead
 	 * of time what sort of physical adapter we'll be dealing with.
@@ -159,11 +231,11 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 	 * Try to populate the mux adapter's of_node, expands to
 	 * nothing if !CONFIG_OF.
 	 */
-	if (mux_dev->of_node) {
+	if (muxc->dev->of_node) {
 		struct device_node *child;
 		u32 reg;
 
-		for_each_child_of_node(mux_dev->of_node, child) {
+		for_each_child_of_node(muxc->dev->of_node, child) {
 			ret = of_property_read_u32(child, "reg", &reg);
 			if (ret)
 				continue;
@@ -177,8 +249,9 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 	/*
 	 * Associate the mux channel with an ACPI node.
 	 */
-	if (has_acpi_companion(mux_dev))
-		acpi_preset_companion(&priv->adap.dev, ACPI_COMPANION(mux_dev),
+	if (has_acpi_companion(muxc->dev))
+		acpi_preset_companion(&priv->adap.dev,
+				      ACPI_COMPANION(muxc->dev),
 				      chan_id);
 
 	if (force_nr) {
@@ -192,33 +265,53 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 			"failed to add mux-adapter (error=%d)\n",
 			ret);
 		kfree(priv);
-		return NULL;
+		return ret;
 	}
 
-	WARN(sysfs_create_link(&priv->adap.dev.kobj, &mux_dev->kobj, "mux_device"),
-			       "can't create symlink to mux device\n");
+	WARN(sysfs_create_link(&priv->adap.dev.kobj, &muxc->dev->kobj,
+			       "mux_device"),
+	     "can't create symlink to mux device\n");
 
 	snprintf(symlink_name, sizeof(symlink_name), "channel-%u", chan_id);
-	WARN(sysfs_create_link(&mux_dev->kobj, &priv->adap.dev.kobj, symlink_name),
-			       "can't create symlink for channel %u\n", chan_id);
+	WARN(sysfs_create_link(&muxc->dev->kobj, &priv->adap.dev.kobj,
+			       symlink_name),
+	     "can't create symlink for channel %u\n", chan_id);
 	dev_info(&parent->dev, "Added multiplexed i2c bus %d\n",
 		 i2c_adapter_id(&priv->adap));
 
-	return &priv->adap;
+	muxc->adapter[muxc->num_adapters++] = &priv->adap;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(i2c_add_mux_adapter);
+EXPORT_SYMBOL_GPL(i2c_mux_add_adapter);
 
-void i2c_del_mux_adapter(struct i2c_adapter *adap)
+void i2c_mux_del_adapters(struct i2c_mux_core *muxc)
 {
-	struct i2c_mux_priv *priv = adap->algo_data;
 	char symlink_name[20];
 
-	snprintf(symlink_name, sizeof(symlink_name), "channel-%u", priv->chan_id);
-	sysfs_remove_link(&priv->mux_dev->kobj, symlink_name);
+	while (muxc->num_adapters) {
+		struct i2c_adapter *adap = muxc->adapter[--muxc->num_adapters];
+		struct i2c_mux_priv *priv = adap->algo_data;
+
+		muxc->adapter[muxc->num_adapters] = NULL;
+
+		snprintf(symlink_name, sizeof(symlink_name),
+			 "channel-%u", priv->chan_id);
+		sysfs_remove_link(&muxc->dev->kobj, symlink_name);
+
+		sysfs_remove_link(&priv->adap.dev.kobj, "mux_device");
+		i2c_del_adapter(adap);
+		kfree(priv);
+	}
+}
+EXPORT_SYMBOL_GPL(i2c_mux_del_adapters);
+
+void i2c_del_mux_adapter(struct i2c_adapter *adap)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_mux_core *muxc = priv->muxc;
 
-	sysfs_remove_link(&priv->adap.dev.kobj, "mux_device");
-	i2c_del_adapter(adap);
-	kfree(priv);
+	i2c_mux_del_adapters(muxc);
+	devm_kfree(muxc->dev, muxc);
 }
 EXPORT_SYMBOL_GPL(i2c_del_mux_adapter);
 
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index b5f9a007a3ab..71ac1b3f4f68 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -27,6 +27,31 @@
 
 #ifdef __KERNEL__
 
+struct i2c_mux_core {
+	struct i2c_adapter *parent;
+	struct device *dev;
+
+	void *priv;
+
+	int (*select)(struct i2c_mux_core *, u32 chan_id);
+	int (*deselect)(struct i2c_mux_core *, u32 chan_id);
+
+	int num_adapters;
+	int max_adapters;
+	struct i2c_adapter *adapter[0];
+};
+
+struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
+				   struct device *dev, int max_adapters,
+				   int sizeof_priv, u32 flags,
+				   int (*select)(struct i2c_mux_core *, u32),
+				   int (*deselect)(struct i2c_mux_core *, u32));
+
+static inline void *i2c_mux_priv(struct i2c_mux_core *muxc)
+{
+	return muxc->priv;
+}
+
 /*
  * Called to create a i2c bus on a multiplexed bus segment.
  * The mux_dev and chan_id parameters are passed to the select
@@ -41,8 +66,17 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
 					       void *mux_dev, u32 chan_id),
 				int (*deselect) (struct i2c_adapter *,
 						 void *mux_dev, u32 chan_id));
+/*
+ * Called to create an i2c bus on a multiplexed bus segment.
+ * The chan_id parameter is passed to the select and deselect
+ * callback functions to perform hardware-specific mux control.
+ */
+int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
+			u32 force_nr, u32 chan_id,
+			unsigned int class);
 
 void i2c_del_mux_adapter(struct i2c_adapter *adap);
+void i2c_mux_del_adapters(struct i2c_mux_core *muxc);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From 23fe440c59b9f08afe108e7ec7b6714cb2a3b955 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 2 Mar 2016 15:14:22 +0100
Subject: i2c: mux: drop old unused i2c-mux api

All i2c mux users are using an explicit i2c mux core, drop support
for implicit i2c mux cores.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-mux.c   | 63 -------------------------------------------------
 include/linux/i2c-mux.h | 15 ------------
 2 files changed, 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
index 5ce1b0704cb5..25e9336b0e6e 100644
--- a/drivers/i2c/i2c-mux.c
+++ b/drivers/i2c/i2c-mux.c
@@ -28,12 +28,6 @@
 #include <linux/slab.h>
 
 /* multiplexer per channel data */
-struct i2c_mux_priv_old {
-	void *mux_priv;
-	int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
-	int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id);
-};
-
 struct i2c_mux_priv {
 	struct i2c_adapter adap;
 	struct i2c_algorithm algo;
@@ -104,53 +98,6 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent)
 	return class;
 }
 
-static int i2c_mux_select(struct i2c_mux_core *muxc, u32 chan)
-{
-	struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc);
-
-	return priv->select(muxc->parent, priv->mux_priv, chan);
-}
-
-static int i2c_mux_deselect(struct i2c_mux_core *muxc, u32 chan)
-{
-	struct i2c_mux_priv_old *priv = i2c_mux_priv(muxc);
-
-	return priv->deselect(muxc->parent, priv->mux_priv, chan);
-}
-
-struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
-				struct device *mux_dev,
-				void *mux_priv, u32 force_nr, u32 chan_id,
-				unsigned int class,
-				int (*select) (struct i2c_adapter *,
-					       void *, u32),
-				int (*deselect) (struct i2c_adapter *,
-						 void *, u32))
-{
-	struct i2c_mux_core *muxc;
-	struct i2c_mux_priv_old *priv;
-	int ret;
-
-	muxc = i2c_mux_alloc(parent, mux_dev, 1, sizeof(*priv), 0,
-			     i2c_mux_select, i2c_mux_deselect);
-	if (!muxc)
-		return NULL;
-
-	priv = i2c_mux_priv(muxc);
-	priv->select = select;
-	priv->deselect = deselect;
-	priv->mux_priv = mux_priv;
-
-	ret = i2c_mux_add_adapter(muxc, force_nr, chan_id, class);
-	if (ret) {
-		devm_kfree(mux_dev, muxc);
-		return NULL;
-	}
-
-	return muxc->adapter[0];
-}
-EXPORT_SYMBOL_GPL(i2c_add_mux_adapter);
-
 struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
 				   struct device *dev, int max_adapters,
 				   int sizeof_priv, u32 flags,
@@ -305,16 +252,6 @@ void i2c_mux_del_adapters(struct i2c_mux_core *muxc)
 }
 EXPORT_SYMBOL_GPL(i2c_mux_del_adapters);
 
-void i2c_del_mux_adapter(struct i2c_adapter *adap)
-{
-	struct i2c_mux_priv *priv = adap->algo_data;
-	struct i2c_mux_core *muxc = priv->muxc;
-
-	i2c_mux_del_adapters(muxc);
-	devm_kfree(muxc->dev, muxc);
-}
-EXPORT_SYMBOL_GPL(i2c_del_mux_adapter);
-
 MODULE_AUTHOR("Rodolfo Giometti <giometti@linux.it>");
 MODULE_DESCRIPTION("I2C driver for multiplexed I2C busses");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index 71ac1b3f4f68..2fa93fe1345e 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -52,20 +52,6 @@ static inline void *i2c_mux_priv(struct i2c_mux_core *muxc)
 	return muxc->priv;
 }
 
-/*
- * Called to create a i2c bus on a multiplexed bus segment.
- * The mux_dev and chan_id parameters are passed to the select
- * and deselect callback functions to perform hardware-specific
- * mux control.
- */
-struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
-				struct device *mux_dev,
-				void *mux_priv, u32 force_nr, u32 chan_id,
-				unsigned int class,
-				int (*select) (struct i2c_adapter *,
-					       void *mux_dev, u32 chan_id),
-				int (*deselect) (struct i2c_adapter *,
-						 void *mux_dev, u32 chan_id));
 /*
  * Called to create an i2c bus on a multiplexed bus segment.
  * The chan_id parameter is passed to the select and deselect
@@ -75,7 +61,6 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
 			u32 force_nr, u32 chan_id,
 			unsigned int class);
 
-void i2c_del_mux_adapter(struct i2c_adapter *adap);
 void i2c_mux_del_adapters(struct i2c_mux_core *muxc);
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 9d90725ddca347450c4ab177ad680ed76063afd4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 18 Mar 2016 11:27:36 -0700
Subject: libnvdimm, blk: move i/o infrastructure to nd_namespace_blk

Consolidate the information for issuing i/o to a blk-namespace, and
eliminate some pointer chasing.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/blk.c | 137 ++++++++++++++++++++++++++-------------------------
 include/linux/nd.h   |   2 +
 2 files changed, 71 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 26d039879ba2..4c14ecdc792b 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -21,17 +21,19 @@
 #include <linux/sizes.h>
 #include "nd.h"
 
-struct nd_blk_device {
-	struct nd_namespace_blk *nsblk;
-	struct nd_blk_region *ndbr;
-	size_t disk_size;
-	u32 sector_size;
-	u32 internal_lbasize;
-};
+static u32 nsblk_meta_size(struct nd_namespace_blk *nsblk)
+{
+	return nsblk->lbasize - ((nsblk->lbasize >= 4096) ? 4096 : 512);
+}
 
-static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev)
+static u32 nsblk_internal_lbasize(struct nd_namespace_blk *nsblk)
 {
-	return blk_dev->nsblk->lbasize - blk_dev->sector_size;
+	return roundup(nsblk->lbasize, INT_LBASIZE_ALIGNMENT);
+}
+
+static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk)
+{
+	return nsblk->lbasize - nsblk_meta_size(nsblk);
 }
 
 static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
@@ -55,20 +57,29 @@ static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
 	return SIZE_MAX;
 }
 
+static struct nd_blk_region *to_ndbr(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region;
+	struct device *parent;
+
+	parent = nsblk->common.dev.parent;
+	nd_region = container_of(parent, struct nd_region, dev);
+	return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
-static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
-				struct bio_integrity_payload *bip, u64 lba,
-				int rw)
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, u64 lba, int rw)
 {
-	unsigned int len = nd_blk_meta_size(blk_dev);
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
+	unsigned int len = nsblk_meta_size(nsblk);
 	resource_size_t	dev_offset, ns_offset;
-	struct nd_namespace_blk *nsblk;
-	struct nd_blk_region *ndbr;
+	u32 internal_lbasize, sector_size;
 	int err = 0;
 
-	nsblk = blk_dev->nsblk;
-	ndbr = blk_dev->ndbr;
-	ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size;
+	internal_lbasize = nsblk_internal_lbasize(nsblk);
+	sector_size = nsblk_sector_size(nsblk);
+	ns_offset = lba * internal_lbasize + sector_size;
 	dev_offset = to_dev_offset(nsblk, ns_offset, len);
 	if (dev_offset == SIZE_MAX)
 		return -EIO;
@@ -102,25 +113,26 @@ static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
 }
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
-static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
-				struct bio_integrity_payload *bip, u64 lba,
-				int rw)
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, u64 lba, int rw)
 {
 	return 0;
 }
 #endif
 
-static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
-			struct bio_integrity_payload *bip, struct page *page,
-			unsigned int len, unsigned int off, int rw,
-			sector_t sector)
+static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, struct page *page,
+		unsigned int len, unsigned int off, int rw, sector_t sector)
 {
-	struct nd_blk_region *ndbr = blk_dev->ndbr;
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
 	resource_size_t	dev_offset, ns_offset;
+	u32 internal_lbasize, sector_size;
 	int err = 0;
 	void *iobuf;
 	u64 lba;
 
+	internal_lbasize = nsblk_internal_lbasize(nsblk);
+	sector_size = nsblk_sector_size(nsblk);
 	while (len) {
 		unsigned int cur_len;
 
@@ -130,11 +142,11 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 		 * Block Window setup/move steps. the do_io routine is capable
 		 * of handling len <= PAGE_SIZE.
 		 */
-		cur_len = bip ? min(len, blk_dev->sector_size) : len;
+		cur_len = bip ? min(len, sector_size) : len;
 
-		lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size);
-		ns_offset = lba * blk_dev->internal_lbasize;
-		dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len);
+		lba = div_u64(sector << SECTOR_SHIFT, sector_size);
+		ns_offset = lba * internal_lbasize;
+		dev_offset = to_dev_offset(nsblk, ns_offset, cur_len);
 		if (dev_offset == SIZE_MAX)
 			return -EIO;
 
@@ -145,13 +157,13 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 			return err;
 
 		if (bip) {
-			err = nd_blk_rw_integrity(blk_dev, bip, lba, rw);
+			err = nd_blk_rw_integrity(nsblk, bip, lba, rw);
 			if (err)
 				return err;
 		}
 		len -= cur_len;
 		off += cur_len;
-		sector += blk_dev->sector_size >> SECTOR_SHIFT;
+		sector += sector_size >> SECTOR_SHIFT;
 	}
 
 	return err;
@@ -160,7 +172,7 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct bio_integrity_payload *bip;
-	struct nd_blk_device *blk_dev;
+	struct nd_namespace_blk *nsblk;
 	struct bvec_iter iter;
 	unsigned long start;
 	struct bio_vec bvec;
@@ -179,17 +191,17 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	bip = bio_integrity(bio);
-	blk_dev = q->queuedata;
+	nsblk = q->queuedata;
 	rw = bio_data_dir(bio);
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
 
 		BUG_ON(len > PAGE_SIZE);
-		err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len,
-					bvec.bv_offset, rw, iter.bi_sector);
+		err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len,
+				bvec.bv_offset, rw, iter.bi_sector);
 		if (err) {
-			dev_dbg(&blk_dev->nsblk->common.dev,
+			dev_dbg(&nsblk->common.dev,
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
@@ -205,17 +217,16 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
-static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
+static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
 		resource_size_t offset, void *iobuf, size_t n, int rw)
 {
-	struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim);
-	struct nd_namespace_blk *nsblk = blk_dev->nsblk;
-	struct nd_blk_region *ndbr = blk_dev->ndbr;
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
 	resource_size_t	dev_offset;
 
 	dev_offset = to_dev_offset(nsblk, offset, n);
 
-	if (unlikely(offset + n > blk_dev->disk_size)) {
+	if (unlikely(offset + n > nsblk->size)) {
 		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
 		return -EFAULT;
 	}
@@ -242,16 +253,16 @@ static void nd_blk_release_disk(void *disk)
 	put_disk(disk);
 }
 
-static int nd_blk_attach_disk(struct device *dev,
-		struct nd_namespace_common *ndns, struct nd_blk_device *blk_dev)
+static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
 {
+	struct device *dev = &nsblk->common.dev;
 	resource_size_t available_disk_size;
 	struct request_queue *q;
 	struct gendisk *disk;
 	u64 internal_nlba;
 
-	internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize);
-	available_disk_size = internal_nlba * blk_dev->sector_size;
+	internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
+	available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
 
 	q = blk_alloc_queue(GFP_KERNEL);
 	if (!q)
@@ -264,9 +275,9 @@ static int nd_blk_attach_disk(struct device *dev,
 	blk_queue_make_request(q, nd_blk_make_request);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
-	blk_queue_logical_block_size(q, blk_dev->sector_size);
+	blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-	q->queuedata = blk_dev;
+	q->queuedata = nsblk;
 
 	disk = alloc_disk(0);
 	if (!disk)
@@ -276,17 +287,17 @@ static int nd_blk_attach_disk(struct device *dev,
 		return -ENOMEM;
 	}
 
-	disk->driverfs_dev	= &ndns->dev;
+	disk->driverfs_dev	= dev;
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
 	disk->queue		= q;
 	disk->flags		= GENHD_FL_EXT_DEVT;
-	nvdimm_namespace_disk_name(ndns, disk->disk_name);
+	nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
 	set_capacity(disk, 0);
 	add_disk(disk);
 
-	if (nd_blk_meta_size(blk_dev)) {
-		int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev));
+	if (nsblk_meta_size(nsblk)) {
+		int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
 
 		if (rc)
 			return rc;
@@ -301,33 +312,23 @@ static int nd_blk_probe(struct device *dev)
 {
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_blk *nsblk;
-	struct nd_blk_device *blk_dev;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
 		return PTR_ERR(ndns);
 
-	blk_dev = devm_kzalloc(dev, sizeof(*blk_dev), GFP_KERNEL);
-	if (!blk_dev)
-		return -ENOMEM;
-
 	nsblk = to_nd_namespace_blk(&ndns->dev);
-	blk_dev->disk_size = nvdimm_namespace_capacity(ndns);
-	blk_dev->ndbr = to_nd_blk_region(dev->parent);
-	blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev);
-	blk_dev->internal_lbasize = roundup(nsblk->lbasize,
-						INT_LBASIZE_ALIGNMENT);
-	blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512);
-	dev_set_drvdata(dev, blk_dev);
-
-	ndns->rw_bytes = nd_blk_rw_bytes;
+	nsblk->size = nvdimm_namespace_capacity(ndns);
+	dev_set_drvdata(dev, nsblk);
+
+	ndns->rw_bytes = nsblk_rw_bytes;
 	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(dev, ndns, blk_dev) == 0) {
+	else if (nd_btt_probe(dev, ndns, nsblk) == 0) {
 		/* we'll come back as btt-blk */
 		return -ENXIO;
 	} else
-		return nd_blk_attach_disk(dev, ndns, blk_dev);
+		return nsblk_attach_disk(nsblk);
 }
 
 static int nd_blk_remove(struct device *dev)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 5489ab756d1a..5ea4aec7fd63 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -82,6 +82,7 @@ struct nd_namespace_pmem {
  * @uuid: namespace name supplied in the dimm label
  * @id: ida allocated id
  * @lbasize: blk namespaces have a native sector size when btt not present
+ * @size: sum of all the resource ranges allocated to this namespace
  * @num_resources: number of dpa extents to claim
  * @res: discontiguous dpa extents for given dimm
  */
@@ -91,6 +92,7 @@ struct nd_namespace_blk {
 	u8 *uuid;
 	int id;
 	unsigned long lbasize;
+	resource_size_t size;
 	int num_resources;
 	struct resource **res;
 };
-- 
cgit v1.2.3


From 200c79da824c978fcf6eec1dc9c0a1e521133267 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 22 Mar 2016 00:22:16 -0700
Subject: libnvdimm, pmem, pfn: make pmem_rw_bytes generic and refactor pfn
 setup

In preparation for providing an alternative (to block device) access
mechanism to persistent memory, convert pmem_rw_bytes() to
nsio_rw_bytes().  This allows ->rw_bytes() functionality without
requiring a 'struct pmem_device' to be instantiated.

In other words, when ->rw_bytes() is in use i/o is driven through
'struct nd_namespace_io', otherwise it is driven through 'struct
pmem_device' and the block layer.  This consolidates the disjoint calls
to devm_exit_badblocks() and devm_memunmap() into a common
devm_nsio_disable() and cleans up the init path to use a unified
pmem_attach_disk() implementation.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/blk.c              |   2 +-
 drivers/nvdimm/btt_devs.c         |   4 +-
 drivers/nvdimm/claim.c            |  61 ++++++++++
 drivers/nvdimm/nd.h               |  40 +++++--
 drivers/nvdimm/pfn_devs.c         |   4 +-
 drivers/nvdimm/pmem.c             | 236 ++++++++++++++------------------------
 include/linux/nd.h                |   9 +-
 tools/testing/nvdimm/Kbuild       |   1 +
 tools/testing/nvdimm/test/iomap.c |  27 +++--
 9 files changed, 211 insertions(+), 173 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 4c14ecdc792b..495e06d9f7e7 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -324,7 +324,7 @@ static int nd_blk_probe(struct device *dev)
 	ndns->rw_bytes = nsblk_rw_bytes;
 	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(dev, ndns, nsblk) == 0) {
+	else if (nd_btt_probe(dev, ndns) == 0) {
 		/* we'll come back as btt-blk */
 		return -ENXIO;
 	} else
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 1886171af80e..816d0dae6398 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -273,8 +273,7 @@ static int __nd_btt_probe(struct nd_btt *nd_btt,
 	return 0;
 }
 
-int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
 {
 	int rc;
 	struct device *btt_dev;
@@ -289,7 +288,6 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
 	nvdimm_bus_unlock(&ndns->dev);
 	if (!btt_dev)
 		return -ENOMEM;
-	dev_set_drvdata(btt_dev, drvdata);
 	btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL);
 	rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb);
 	dev_dbg(dev, "%s: btt: %s\n", __func__,
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index e8f03b0e95e4..6bbd0a36994a 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -12,6 +12,7 @@
  */
 #include <linux/device.h>
 #include <linux/sizes.h>
+#include <linux/pmem.h>
 #include "nd-core.h"
 #include "pfn.h"
 #include "btt.h"
@@ -199,3 +200,63 @@ u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
 	return sum;
 }
 EXPORT_SYMBOL(nd_sb_checksum);
+
+static int nsio_rw_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *buf, size_t size, int rw)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+	if (unlikely(offset + size > nsio->size)) {
+		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+		return -EFAULT;
+	}
+
+	if (rw == READ) {
+		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
+
+		if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
+			return -EIO;
+		return memcpy_from_pmem(buf, nsio->addr + offset, size);
+	} else {
+		memcpy_to_pmem(nsio->addr + offset, buf, size);
+		wmb_pmem();
+	}
+
+	return 0;
+}
+
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio)
+{
+	struct resource *res = &nsio->res;
+	struct nd_namespace_common *ndns = &nsio->common;
+
+	nsio->size = resource_size(res);
+	if (!devm_request_mem_region(dev, res->start, resource_size(res),
+				dev_name(dev))) {
+		dev_warn(dev, "could not reserve region %pR\n", res);
+		return -EBUSY;
+	}
+
+	ndns->rw_bytes = nsio_rw_bytes;
+	if (devm_init_badblocks(dev, &nsio->bb))
+		return -ENOMEM;
+	nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb,
+			&nsio->res);
+
+	nsio->addr = devm_memremap(dev, res->start, resource_size(res),
+			ARCH_MEMREMAP_PMEM);
+	if (IS_ERR(nsio->addr))
+		return PTR_ERR(nsio->addr);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_nsio_enable);
+
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio)
+{
+	struct resource *res = &nsio->res;
+
+	devm_memunmap(dev, nsio->addr);
+	devm_exit_badblocks(dev, &nsio->bb);
+	devm_release_mem_region(dev, res->start, resource_size(res));
+}
+EXPORT_SYMBOL_GPL(devm_nsio_disable);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 0fb14890ba26..10e23fe49012 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -13,6 +13,7 @@
 #ifndef __ND_H__
 #define __ND_H__
 #include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
 #include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
@@ -197,13 +198,12 @@ struct nd_gen_sb {
 
 u64 nd_sb_checksum(struct nd_gen_sb *sb);
 #if IS_ENABLED(CONFIG_BTT)
-int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata);
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_btt(struct device *dev);
 struct device *nd_btt_create(struct nd_region *nd_region);
 #else
 static inline int nd_btt_probe(struct device *dev,
-		struct nd_namespace_common *ndns, void *drvdata)
+		struct nd_namespace_common *ndns)
 {
 	return -ENODEV;
 }
@@ -221,14 +221,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
-int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata);
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
 int nd_pfn_validate(struct nd_pfn *nd_pfn);
 #else
-static inline int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+static inline int nd_pfn_probe(struct device *dev,
+		struct nd_namespace_common *ndns)
 {
 	return -ENODEV;
 }
@@ -272,6 +271,20 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
 		struct badblocks *bb, const struct resource *res);
+#if IS_ENABLED(CONFIG_ND_CLAIM)
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
+#else
+static inline int devm_nsio_enable(struct device *dev,
+		struct nd_namespace_io *nsio)
+{
+	return -ENXIO;
+}
+static inline void devm_nsio_disable(struct device *dev,
+		struct nd_namespace_io *nsio)
+{
+}
+#endif
 int nd_blk_region_init(struct nd_region *nd_region);
 void __nd_iostat_start(struct bio *bio, unsigned long *start);
 static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
@@ -285,6 +298,19 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 	return true;
 }
 void nd_iostat_end(struct bio *bio, unsigned long start);
+static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
+		unsigned int len)
+{
+	if (bb->count) {
+		sector_t first_bad;
+		int num_bad;
+
+		return !!badblocks_check(bb, sector, len / 512, &first_bad,
+				&num_bad);
+	}
+
+	return false;
+}
 resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
 const u8 *nd_dev_to_uuid(struct device *dev);
 bool pmem_should_map_pages(struct device *dev);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 96aa5490c279..9df081ae96e3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -410,8 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 }
 EXPORT_SYMBOL(nd_pfn_validate);
 
-int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 {
 	int rc;
 	struct nd_pfn *nd_pfn;
@@ -427,7 +426,6 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
 	nvdimm_bus_unlock(&ndns->dev);
 	if (!pfn_dev)
 		return -ENOMEM;
-	dev_set_drvdata(pfn_dev, drvdata);
 	pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
 	nd_pfn = to_nd_pfn(pfn_dev);
 	nd_pfn->pfn_sb = pfn_sb;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 67d48e2e8ca2..b5f81b02205c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -49,19 +49,6 @@ struct pmem_device {
 	struct badblocks	bb;
 };
 
-static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
-{
-	if (bb->count) {
-		sector_t first_bad;
-		int num_bad;
-
-		return !!badblocks_check(bb, sector, len / 512, &first_bad,
-				&num_bad);
-	}
-
-	return false;
-}
-
 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 		unsigned int len)
 {
@@ -209,16 +196,40 @@ void pmem_release_disk(void *disk)
 	put_disk(disk);
 }
 
-static struct pmem_device *pmem_alloc(struct device *dev,
-		struct resource *res, int id)
+static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap);
+
+static int pmem_attach_disk(struct device *dev,
+		struct nd_namespace_common *ndns)
 {
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	struct vmem_altmap __altmap, *altmap = NULL;
+	struct resource *res = &nsio->res;
+	struct nd_pfn *nd_pfn = NULL;
+	int nid = dev_to_node(dev);
+	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
+	struct resource pfn_res;
 	struct request_queue *q;
+	struct gendisk *disk;
+	void *addr;
+
+	/* while nsio_rw_bytes is active, parse a pfn info block if present */
+	if (is_nd_pfn(dev)) {
+		nd_pfn = to_nd_pfn(dev);
+		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
+		if (IS_ERR(altmap))
+			return PTR_ERR(altmap);
+	}
+
+	/* we're attaching a block device, disable raw namespace access */
+	devm_nsio_disable(dev, nsio);
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
+	dev_set_drvdata(dev, pmem);
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
 	if (!arch_has_wmb_pmem())
@@ -227,22 +238,31 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
 				dev_name(dev))) {
 		dev_warn(dev, "could not reserve region %pR\n", res);
-		return ERR_PTR(-EBUSY);
+		return -EBUSY;
 	}
 
 	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
 	if (!q)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
+	pmem->pmem_queue = q;
 
 	pmem->pfn_flags = PFN_DEV;
-	if (pmem_should_map_pages(dev)) {
-		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
+	if (is_nd_pfn(dev)) {
+		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
+				altmap);
+		pfn_sb = nd_pfn->pfn_sb;
+		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
+		pmem->pfn_flags |= PFN_MAP;
+		res = &pfn_res; /* for badblocks populate */
+		res->start += pmem->data_offset;
+	} else if (pmem_should_map_pages(dev)) {
+		addr = devm_memremap_pages(dev, &nsio->res,
 				&q->q_usage_counter, NULL);
 		pmem->pfn_flags |= PFN_MAP;
 	} else
-		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
-				pmem->phys_addr, pmem->size,
-				ARCH_MEMREMAP_PMEM);
+		addr = devm_memremap(dev, pmem->phys_addr,
+				pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
 	 * At release time the queue must be dead before
@@ -250,23 +270,12 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	 */
 	if (devm_add_action(dev, pmem_release_queue, q)) {
 		blk_cleanup_queue(q);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	if (IS_ERR(pmem->virt_addr))
-		return (void __force *) pmem->virt_addr;
-
-	pmem->pmem_queue = q;
-	return pmem;
-}
-
-static int pmem_attach_disk(struct device *dev,
-		struct nd_namespace_common *ndns, struct pmem_device *pmem)
-{
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	int nid = dev_to_node(dev);
-	struct resource bb_res;
-	struct gendisk *disk;
+	if (IS_ERR(addr))
+		return PTR_ERR(addr);
+	pmem->virt_addr = (void __pmem *) addr;
 
 	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
 	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
@@ -291,20 +300,9 @@ static int pmem_attach_disk(struct device *dev,
 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
 			/ 512);
 	pmem->pmem_disk = disk;
-	devm_exit_badblocks(dev, &pmem->bb);
 	if (devm_init_badblocks(dev, &pmem->bb))
 		return -ENOMEM;
-	bb_res.start = nsio->res.start + pmem->data_offset;
-	bb_res.end = nsio->res.end;
-	if (is_nd_pfn(dev)) {
-		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
-		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-
-		bb_res.start += __le32_to_cpu(pfn_sb->start_pad);
-		bb_res.end -= __le32_to_cpu(pfn_sb->end_trunc);
-	}
-	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb,
-			&bb_res);
+	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
 	disk->bb = &pmem->bb;
 	add_disk(disk);
 	revalidate_disk(disk);
@@ -312,33 +310,8 @@ static int pmem_attach_disk(struct device *dev,
 	return 0;
 }
 
-static int pmem_rw_bytes(struct nd_namespace_common *ndns,
-		resource_size_t offset, void *buf, size_t size, int rw)
-{
-	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
-
-	if (unlikely(offset + size > pmem->size)) {
-		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
-		return -EFAULT;
-	}
-
-	if (rw == READ) {
-		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
-
-		if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
-			return -EIO;
-		return memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
-	} else {
-		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
-		wmb_pmem();
-	}
-
-	return 0;
-}
-
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
-	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	u32 start_pad = 0, end_trunc = 0;
 	resource_size_t start, size;
@@ -404,7 +377,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	 * ->direct_access() to those that are included in the memmap.
 	 */
 	start += start_pad;
-	npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K;
+	size = resource_size(&nsio->res);
+	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
 	if (nd_pfn->mode == PFN_MODE_PMEM)
 		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
 			- start;
@@ -413,13 +387,13 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	else
 		return -ENXIO;
 
-	if (offset + start_pad + end_trunc >= pmem->size) {
+	if (offset + start_pad + end_trunc >= size) {
 		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
 				dev_name(&ndns->dev));
 		return -ENXIO;
 	}
 
-	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
+	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -456,17 +430,14 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	return reserve;
 }
 
-static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
+static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
 {
-	struct resource res;
-	struct request_queue *q;
-	struct pmem_device *pmem;
-	struct vmem_altmap *altmap;
-	struct device *dev = &nd_pfn->dev;
 	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u64 offset = le64_to_cpu(pfn_sb->dataoff);
 	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
 	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	resource_size_t base = nsio->res.start + start_pad;
 	struct vmem_altmap __altmap = {
@@ -474,112 +445,75 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 		.reserve = init_altmap_reserve(base),
 	};
 
-	pmem = dev_get_drvdata(dev);
-	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
-	pmem->pfn_pad = start_pad + end_trunc;
+	memcpy(res, &nsio->res, sizeof(*res));
+	res->start += start_pad;
+	res->end -= end_trunc;
+
 	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
 	if (nd_pfn->mode == PFN_MODE_RAM) {
-		if (pmem->data_offset < SZ_8K)
-			return -EINVAL;
+		if (offset < SZ_8K)
+			return ERR_PTR(-EINVAL);
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 		altmap = NULL;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset)
-			/ PAGE_SIZE;
+		nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
 					"number of pfns truncated from %lld to %ld\n",
 					le64_to_cpu(nd_pfn->pfn_sb->npfns),
 					nd_pfn->npfns);
-		altmap = & __altmap;
-		altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K);
+		memcpy(altmap, &__altmap, sizeof(*altmap));
+		altmap->free = PHYS_PFN(offset - SZ_8K);
 		altmap->alloc = 0;
 	} else
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 
-	/* establish pfn range for lookup, and switch to direct map */
-	q = pmem->pmem_queue;
-	memcpy(&res, &nsio->res, sizeof(res));
-	res.start += start_pad;
-	res.end -= end_trunc;
-	devm_remove_action(dev, pmem_release_queue, q);
-	devm_memunmap(dev, (void __force *) pmem->virt_addr);
-	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
-			&q->q_usage_counter, altmap);
-	pmem->pfn_flags |= PFN_MAP;
-
-	/*
-	 * At release time the queue must be dead before
-	 * devm_memremap_pages is unwound
-	 */
-	if (devm_add_action(dev, pmem_release_queue, q)) {
-		blk_cleanup_queue(q);
-		return -ENOMEM;
-	}
-	if (IS_ERR(pmem->virt_addr))
-		return PTR_ERR(pmem->virt_addr);
-
-	/* attach pmem disk in "pfn-mode" */
-	return pmem_attach_disk(dev, ndns, pmem);
+	return altmap;
 }
 
-static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+/*
+ * Determine the effective resource range and vmem_altmap from an nd_pfn
+ * instance.
+ */
+static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
 	int rc;
 
 	if (!nd_pfn->uuid || !nd_pfn->ndns)
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 
 	rc = nd_pfn_init(nd_pfn);
 	if (rc)
-		return rc;
+		return ERR_PTR(rc);
+
 	/* we need a valid pfn_sb before we can init a vmem_altmap */
-	return __nvdimm_namespace_attach_pfn(nd_pfn);
+	return __nvdimm_setup_pfn(nd_pfn, res, altmap);
 }
 
 static int nd_pmem_probe(struct device *dev)
 {
-	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct nd_namespace_common *ndns;
-	struct nd_namespace_io *nsio;
-	struct pmem_device *pmem;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
 		return PTR_ERR(ndns);
 
-	nsio = to_nd_namespace_io(&ndns->dev);
-	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
-	if (IS_ERR(pmem))
-		return PTR_ERR(pmem);
-
-	dev_set_drvdata(dev, pmem);
-	ndns->rw_bytes = pmem_rw_bytes;
-	if (devm_init_badblocks(dev, &pmem->bb))
-		return -ENOMEM;
-	nvdimm_badblocks_populate(nd_region, &pmem->bb, &nsio->res);
+	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
+		return -ENXIO;
 
-	if (is_nd_btt(dev)) {
-		/* btt allocates its own request_queue */
-		devm_remove_action(dev, pmem_release_queue, pmem->pmem_queue);
-		blk_cleanup_queue(pmem->pmem_queue);
+	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	}
 
 	if (is_nd_pfn(dev))
-		return nvdimm_namespace_attach_pfn(ndns);
+		return pmem_attach_disk(dev, ndns);
 
-	if (nd_btt_probe(dev, ndns, pmem) == 0
-			|| nd_pfn_probe(dev, ndns, pmem) == 0) {
-		/*
-		 * We'll come back as either btt-pmem, or pfn-pmem, so
-		 * drop the queue allocation for now.
-		 */
+	/* if we find a valid info-block we'll come back as that personality */
+	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0)
 		return -ENXIO;
-	}
 
-	return pmem_attach_disk(dev, ndns, pmem);
+	/* ...otherwise we're just a raw pmem device */
+	return pmem_attach_disk(dev, ndns);
 }
 
 static int nd_pmem_remove(struct device *dev)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 5ea4aec7fd63..aee2761d294c 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/ndctl.h>
 #include <linux/device.h>
+#include <linux/badblocks.h>
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
@@ -55,13 +56,19 @@ static inline struct nd_namespace_common *to_ndns(struct device *dev)
 }
 
 /**
- * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
+ * struct nd_namespace_io - device representation of a persistent memory range
  * @dev: namespace device created by the nd region driver
  * @res: struct resource conversion of a NFIT SPA table
+ * @size: cached resource_size(@res) for fast path size checks
+ * @addr: virtual address to access the namespace range
+ * @bb: badblocks list for the namespace range
  */
 struct nd_namespace_io {
 	struct nd_namespace_common common;
 	struct resource res;
+	resource_size_t size;
+	void __pmem *addr;
+	struct badblocks bb;
 };
 
 /**
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index a34bfd0c8928..d5bc8c080b44 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -7,6 +7,7 @@ ldflags-y += --wrap=ioremap_nocache
 ldflags-y += --wrap=iounmap
 ldflags-y += --wrap=memunmap
 ldflags-y += --wrap=__devm_request_region
+ldflags-y += --wrap=__devm_release_region
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 ldflags-y += --wrap=devm_memremap_pages
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 0c1a7e65bb81..c842095f2801 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -239,13 +239,11 @@ struct resource *__wrap___devm_request_region(struct device *dev,
 }
 EXPORT_SYMBOL(__wrap___devm_request_region);
 
-void __wrap___release_region(struct resource *parent, resource_size_t start,
-				resource_size_t n)
+static bool nfit_test_release_region(struct resource *parent,
+		resource_size_t start, resource_size_t n)
 {
-	struct nfit_test_resource *nfit_res;
-
 	if (parent == &iomem_resource) {
-		nfit_res = get_nfit_res(start);
+		struct nfit_test_resource *nfit_res = get_nfit_res(start);
 		if (nfit_res) {
 			struct resource *res = nfit_res->res + 1;
 
@@ -254,11 +252,26 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
 						__func__, start, n, res);
 			else
 				memset(res, 0, sizeof(*res));
-			return;
+			return true;
 		}
 	}
-	__release_region(parent, start, n);
+	return false;
+}
+
+void __wrap___release_region(struct resource *parent, resource_size_t start,
+		resource_size_t n)
+{
+	if (!nfit_test_release_region(parent, start, n))
+		__release_region(parent, start, n);
 }
 EXPORT_SYMBOL(__wrap___release_region);
 
+void __wrap___devm_release_region(struct device *dev, struct resource *parent,
+		resource_size_t start, resource_size_t n)
+{
+	if (!nfit_test_release_region(parent, start, n))
+		__devm_release_region(dev, parent, start, n);
+}
+EXPORT_SYMBOL(__wrap___devm_release_region);
+
 MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From dfc57732ad38f93ae6232a3b4e64fd077383a0f1 Mon Sep 17 00:00:00 2001
From: Gregor Boirie <gregor.boirie@parrot.com>
Date: Wed, 20 Apr 2016 19:23:43 +0200
Subject: iio:core: mounting matrix support

Expose a rotation matrix to indicate userspace the chip placement with
respect to the overall hardware system. This is needed to adjust
coordinates sampled from a sensor chip when its position deviates from the
main hardware system.

Final coordinates computation is delegated to userspace since:
* computation may involve floating point arithmetics ;
* it allows an application to combine adjustments with arbitrary
  transformations.

This 3 dimentional space rotation matrix is expressed as 3x3 array of
strings to support floating point numbers. It may be retrieved from a
"[<dir>_][<type>_]mount_matrix" sysfs attribute file. It is declared into a
device / driver specific DTS property or platform data.

Signed-off-by: Gregor Boirie <gregor.boirie@parrot.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 51 ++++++++++++++++++++
 drivers/iio/industrialio-core.c         | 82 +++++++++++++++++++++++++++++++++
 include/linux/iio/iio.h                 | 31 +++++++++++++
 3 files changed, 164 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index f155eff910f9..ba8df69d40b0 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1512,3 +1512,54 @@ Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw (unscaled no offset etc.) pH reading of a substance as a negative
 		base-10 logarithm of hydrodium ions in a litre of water.
+
+What:           /sys/bus/iio/devices/iio:deviceX/mount_matrix
+What:           /sys/bus/iio/devices/iio:deviceX/in_mount_matrix
+What:           /sys/bus/iio/devices/iio:deviceX/out_mount_matrix
+KernelVersion:  4.6
+Contact:        linux-iio@vger.kernel.org
+Description:
+		Mounting matrix for IIO sensors. This is a rotation matrix which
+		informs userspace about sensor chip's placement relative to the
+		main hardware it is mounted on.
+		Main hardware placement is defined according to the local
+		reference frame related to the physical quantity the sensor
+		measures.
+		Given that the rotation matrix is defined in a board specific
+		way (platform data and / or device-tree), the main hardware
+		reference frame definition is left to the implementor's choice
+		(see below for a magnetometer example).
+		Applications should apply this rotation matrix to samples so
+		that when main hardware reference frame is aligned onto local
+		reference frame, then sensor chip reference frame is also
+		perfectly aligned with it.
+		Matrix is a 3x3 unitary matrix and typically looks like
+		[0, 1, 0; 1, 0, 0; 0, 0, -1]. Identity matrix
+		[1, 0, 0; 0, 1, 0; 0, 0, 1] means sensor chip and main hardware
+		are perfectly aligned with each other.
+
+		For example, a mounting matrix for a magnetometer sensor informs
+		userspace about sensor chip's ORIENTATION relative to the main
+		hardware.
+		More specifically, main hardware orientation is defined with
+		respect to the LOCAL EARTH GEOMAGNETIC REFERENCE FRAME where :
+		* Y is in the ground plane and positive towards magnetic North ;
+		* X is in the ground plane, perpendicular to the North axis and
+		  positive towards the East ;
+		* Z is perpendicular to the ground plane and positive upwards.
+
+		An implementor might consider that for a hand-held device, a
+		'natural' orientation would be 'front facing camera at the top'.
+		The main hardware reference frame could then be described as :
+		* Y is in the plane of the screen and is positive towards the
+		  top of the screen ;
+		* X is in the plane of the screen, perpendicular to Y axis, and
+		  positive towards the right hand side of the screen ;
+		* Z is perpendicular to the screen plane and positive out of the
+		  screen.
+		Another example for a quadrotor UAV might be :
+		* Y is in the plane of the propellers and positive towards the
+		  front-view camera;
+		* X is in the plane of the propellers, perpendicular to Y axis,
+		  and positive towards the starboard side of the UAV ;
+		* Z is perpendicular to propellers plane and positive upwards.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 190a5939fd8c..e6319a9346b2 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -412,6 +412,88 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(iio_enum_write);
 
+static const struct iio_mount_matrix iio_mount_idmatrix = {
+	.rotation = {
+		"1", "0", "0",
+		"0", "1", "0",
+		"0", "0", "1"
+	}
+};
+
+static int iio_setup_mount_idmatrix(const struct device *dev,
+				    struct iio_mount_matrix *matrix)
+{
+	*matrix = iio_mount_idmatrix;
+	dev_info(dev, "mounting matrix not found: using identity...\n");
+	return 0;
+}
+
+ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv,
+			      const struct iio_chan_spec *chan, char *buf)
+{
+	const struct iio_mount_matrix *mtx = ((iio_get_mount_matrix_t *)
+					      priv)(indio_dev, chan);
+
+	if (IS_ERR(mtx))
+		return PTR_ERR(mtx);
+
+	if (!mtx)
+		mtx = &iio_mount_idmatrix;
+
+	return snprintf(buf, PAGE_SIZE, "%s, %s, %s; %s, %s, %s; %s, %s, %s\n",
+			mtx->rotation[0], mtx->rotation[1], mtx->rotation[2],
+			mtx->rotation[3], mtx->rotation[4], mtx->rotation[5],
+			mtx->rotation[6], mtx->rotation[7], mtx->rotation[8]);
+}
+EXPORT_SYMBOL_GPL(iio_show_mount_matrix);
+
+/**
+ * of_iio_read_mount_matrix() - retrieve iio device mounting matrix from
+ *                              device-tree "mount-matrix" property
+ * @dev:	device the mounting matrix property is assigned to
+ * @propname:	device specific mounting matrix property name
+ * @matrix:	where to store retrieved matrix
+ *
+ * If device is assigned no mounting matrix property, a default 3x3 identity
+ * matrix will be filled in.
+ *
+ * Return: 0 if success, or a negative error code on failure.
+ */
+#ifdef CONFIG_OF
+int of_iio_read_mount_matrix(const struct device *dev,
+			     const char *propname,
+			     struct iio_mount_matrix *matrix)
+{
+	if (dev->of_node) {
+		int err = of_property_read_string_array(dev->of_node,
+				propname, matrix->rotation,
+				ARRAY_SIZE(iio_mount_idmatrix.rotation));
+
+		if (err == ARRAY_SIZE(iio_mount_idmatrix.rotation))
+			return 0;
+
+		if (err >= 0)
+			/* Invalid number of matrix entries. */
+			return -EINVAL;
+
+		if (err != -EINVAL)
+			/* Invalid matrix declaration format. */
+			return err;
+	}
+
+	/* Matrix was not declared at all: fallback to identity. */
+	return iio_setup_mount_idmatrix(dev, matrix);
+}
+#else
+int of_iio_read_mount_matrix(const struct device *dev,
+			     const char *propname,
+			     struct iio_mount_matrix *matrix)
+{
+	return iio_setup_mount_idmatrix(dev, matrix);
+}
+#endif
+EXPORT_SYMBOL(of_iio_read_mount_matrix);
+
 /**
  * iio_format_value() - Formats a IIO value into its string representation
  * @buf:	The buffer to which the formatted value gets written
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 0b2773ada0ba..7c29cb0124ae 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -147,6 +147,37 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev,
 	.private = (uintptr_t)(_e), \
 }
 
+/**
+ * struct iio_mount_matrix - iio mounting matrix
+ * @rotation: 3 dimensional space rotation matrix defining sensor alignment with
+ *            main hardware
+ */
+struct iio_mount_matrix {
+	const char *rotation[9];
+};
+
+ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv,
+			      const struct iio_chan_spec *chan, char *buf);
+int of_iio_read_mount_matrix(const struct device *dev, const char *propname,
+			     struct iio_mount_matrix *matrix);
+
+typedef const struct iio_mount_matrix *
+	(iio_get_mount_matrix_t)(const struct iio_dev *indio_dev,
+				 const struct iio_chan_spec *chan);
+
+/**
+ * IIO_MOUNT_MATRIX() - Initialize mount matrix extended channel attribute
+ * @_shared:	Whether the attribute is shared between all channels
+ * @_get:	Pointer to an iio_get_mount_matrix_t accessor
+ */
+#define IIO_MOUNT_MATRIX(_shared, _get) \
+{ \
+	.name = "mount_matrix", \
+	.shared = (_shared), \
+	.read = iio_show_mount_matrix, \
+	.private = (uintptr_t)(_get), \
+}
+
 /**
  * struct iio_event_spec - specification for a channel event
  * @type:		    Type of the event
-- 
cgit v1.2.3


From 97eacb9166f4810368e180073dcbceeff0de34df Mon Sep 17 00:00:00 2001
From: Gregor Boirie <gregor.boirie@parrot.com>
Date: Wed, 20 Apr 2016 19:23:44 +0200
Subject: iio:ak8975: add mounting matrix support

Expose a rotation matrix to indicate userspace the chip orientation with
respect to the overall hardware system.
Matrix is retrieved from "in_mount_matrix". It is declared into ak8975 DTS
entry as a "mount-matrix" property.

Signed-off-by: Gregor Boirie <gregor.boirie@parrot.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 .../bindings/iio/magnetometer/ak8975.txt           | 10 +++++++
 drivers/iio/magnetometer/ak8975.c                  | 34 +++++++++++++++++++---
 include/linux/iio/magnetometer/ak8975.h            | 16 ++++++++++
 3 files changed, 56 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/iio/magnetometer/ak8975.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt
index 34a3206eefdf..e1e7dd3259f6 100644
--- a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt
+++ b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt
@@ -9,6 +9,7 @@ Optional properties:
 
   - gpios : should be device tree identifier of the magnetometer DRDY pin
   - vdd-supply: an optional regulator that needs to be on to provide VDD
+  - mount-matrix: an optional 3x3 mounting rotation matrix
 
 Example:
 
@@ -17,4 +18,13 @@ ak8975@0c {
         reg = <0x0c>;
         gpios = <&gpj0 7 0>;
         vdd-supply = <&ldo_3v3_gnss>;
+        mount-matrix = "-0.984807753012208",  /* x0 */
+                       "0",                   /* y0 */
+                       "-0.173648177666930",  /* z0 */
+                       "0",                   /* x1 */
+                       "-1",                  /* y1 */
+                       "0",                   /* z1 */
+                       "-0.173648177666930",  /* x2 */
+                       "0",                   /* y2 */
+                       "0.984807753012208";   /* z2 */
 };
diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c
index a2aac50a0149..dbf066129a04 100644
--- a/drivers/iio/magnetometer/ak8975.c
+++ b/drivers/iio/magnetometer/ak8975.c
@@ -40,7 +40,8 @@
 #include <linux/iio/trigger.h>
 #include <linux/iio/trigger_consumer.h>
 #include <linux/iio/triggered_buffer.h>
-#include <linux/regulator/consumer.h>
+
+#include <linux/iio/magnetometer/ak8975.h>
 
 /*
  * Register definitions, as well as various shifts and masks to get at the
@@ -376,6 +377,7 @@ struct ak8975_data {
 	wait_queue_head_t	data_ready_queue;
 	unsigned long		flags;
 	u8			cntl_cache;
+	struct iio_mount_matrix orientation;
 	struct regulator	*vdd;
 };
 
@@ -726,6 +728,18 @@ static int ak8975_read_raw(struct iio_dev *indio_dev,
 	return -EINVAL;
 }
 
+static const struct iio_mount_matrix *
+ak8975_get_mount_matrix(const struct iio_dev *indio_dev,
+			const struct iio_chan_spec *chan)
+{
+	return &((struct ak8975_data *)iio_priv(indio_dev))->orientation;
+}
+
+static const struct iio_chan_spec_ext_info ak8975_ext_info[] = {
+	IIO_MOUNT_MATRIX(IIO_SHARED_BY_DIR, ak8975_get_mount_matrix),
+	{ },
+};
+
 #define AK8975_CHANNEL(axis, index)					\
 	{								\
 		.type = IIO_MAGN,					\
@@ -740,7 +754,8 @@ static int ak8975_read_raw(struct iio_dev *indio_dev,
 			.realbits = 16,					\
 			.storagebits = 16,				\
 			.endianness = IIO_CPU				\
-		}							\
+		},							\
+		.ext_info = ak8975_ext_info,				\
 	}
 
 static const struct iio_chan_spec ak8975_channels[] = {
@@ -837,10 +852,12 @@ static int ak8975_probe(struct i2c_client *client,
 	int err;
 	const char *name = NULL;
 	enum asahi_compass_chipset chipset;
+	const struct ak8975_platform_data *pdata =
+		dev_get_platdata(&client->dev);
 
 	/* Grab and set up the supplied GPIO. */
-	if (client->dev.platform_data)
-		eoc_gpio = *(int *)(client->dev.platform_data);
+	if (pdata)
+		eoc_gpio = pdata->eoc_gpio;
 	else if (client->dev.of_node)
 		eoc_gpio = of_get_gpio(client->dev.of_node, 0);
 	else
@@ -874,6 +891,15 @@ static int ak8975_probe(struct i2c_client *client,
 	data->eoc_gpio = eoc_gpio;
 	data->eoc_irq = 0;
 
+	if (!pdata) {
+		err = of_iio_read_mount_matrix(&client->dev,
+					       "mount-matrix",
+					       &data->orientation);
+		if (err)
+			return err;
+	} else
+		data->orientation = pdata->orientation;
+
 	/* id will be NULL when enumerated via ACPI */
 	if (id) {
 		chipset = (enum asahi_compass_chipset)(id->driver_data);
diff --git a/include/linux/iio/magnetometer/ak8975.h b/include/linux/iio/magnetometer/ak8975.h
new file mode 100644
index 000000000000..c8400959d197
--- /dev/null
+++ b/include/linux/iio/magnetometer/ak8975.h
@@ -0,0 +1,16 @@
+#ifndef __IIO_MAGNETOMETER_AK8975_H__
+#define __IIO_MAGNETOMETER_AK8975_H__
+
+#include <linux/iio/iio.h>
+
+/**
+ * struct ak8975_platform_data - AK8975 magnetometer driver platform data
+ * @eoc_gpio:    data ready event gpio
+ * @orientation: mounting matrix relative to main hardware
+ */
+struct ak8975_platform_data {
+	int                     eoc_gpio;
+	struct iio_mount_matrix orientation;
+};
+
+#endif
-- 
cgit v1.2.3


From eb3798463f71afc77abd25b2f62708be06f7173b Mon Sep 17 00:00:00 2001
From: Gregor Boirie <gregor.boirie@parrot.com>
Date: Wed, 20 Apr 2016 19:23:45 +0200
Subject: iio:imu:mpu6050: enhance mounting matrix support

Add a new rotation matrix sysfs attribute compliant with IIO core
mounting matrix API.
Matrix is retrieved from "in_anglvel_mount_matrix" and
"in_accel_mount_matrix" sysfs attributes. It is declared into mpu6050 DTS
entry as a "mount-matrix" property.

Old interface is kept for backward userspace compatibility and may be
retrieved from legacy platform_data mechanism only.

Signed-off-by: Gregor Boirie <gregor.boirie@parrot.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio            |  2 ++
 .../devicetree/bindings/iio/imu/inv_mpu6050.txt    | 13 ++++++++
 drivers/iio/imu/inv_mpu6050/inv_mpu_core.c         | 36 ++++++++++++++++++++--
 drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h          |  4 ++-
 include/linux/platform_data/invensense_mpu6050.h   |  5 ++-
 5 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index ba8df69d40b0..df44998e7506 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1516,6 +1516,8 @@ Description:
 What:           /sys/bus/iio/devices/iio:deviceX/mount_matrix
 What:           /sys/bus/iio/devices/iio:deviceX/in_mount_matrix
 What:           /sys/bus/iio/devices/iio:deviceX/out_mount_matrix
+What:           /sys/bus/iio/devices/iio:deviceX/in_anglvel_mount_matrix
+What:           /sys/bus/iio/devices/iio:deviceX/in_accel_mount_matrix
 KernelVersion:  4.6
 Contact:        linux-iio@vger.kernel.org
 Description:
diff --git a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt
index e4d8f1c52f4a..a9fc11e43b45 100644
--- a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt
+++ b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt
@@ -8,10 +8,23 @@ Required properties:
  - interrupt-parent : should be the phandle for the interrupt controller
  - interrupts : interrupt mapping for GPIO IRQ
 
+Optional properties:
+ - mount-matrix: an optional 3x3 mounting rotation matrix
+
+
 Example:
 	mpu6050@68 {
 		compatible = "invensense,mpu6050";
 		reg = <0x68>;
 		interrupt-parent = <&gpio1>;
 		interrupts = <18 1>;
+		mount-matrix = "-0.984807753012208",  /* x0 */
+		               "0",                   /* y0 */
+		               "-0.173648177666930",  /* z0 */
+		               "0",                   /* x1 */
+		               "-1",                  /* y1 */
+		               "0",                   /* z1 */
+		               "-0.173648177666930",  /* x2 */
+		               "0",                   /* y2 */
+		               "0.984807753012208";   /* z2 */
 	};
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
index d192953e9a38..482a2490c53a 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
@@ -600,6 +600,10 @@ inv_fifo_rate_show(struct device *dev, struct device_attribute *attr,
 /**
  * inv_attr_show() - calling this function will show current
  *                    parameters.
+ *
+ * Deprecated in favor of IIO mounting matrix API.
+ *
+ * See inv_get_mount_matrix()
  */
 static ssize_t inv_attr_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
@@ -644,6 +648,18 @@ static int inv_mpu6050_validate_trigger(struct iio_dev *indio_dev,
 	return 0;
 }
 
+static const struct iio_mount_matrix *
+inv_get_mount_matrix(const struct iio_dev *indio_dev,
+		     const struct iio_chan_spec *chan)
+{
+	return &((struct inv_mpu6050_state *)iio_priv(indio_dev))->orientation;
+}
+
+static const struct iio_chan_spec_ext_info inv_ext_info[] = {
+	IIO_MOUNT_MATRIX(IIO_SHARED_BY_TYPE, inv_get_mount_matrix),
+	{ },
+};
+
 #define INV_MPU6050_CHAN(_type, _channel2, _index)                    \
 	{                                                             \
 		.type = _type,                                        \
@@ -660,6 +676,7 @@ static int inv_mpu6050_validate_trigger(struct iio_dev *indio_dev,
 				.shift = 0,                           \
 				.endianness = IIO_BE,                 \
 			     },                                       \
+		.ext_info = inv_ext_info,                             \
 	}
 
 static const struct iio_chan_spec inv_mpu_channels[] = {
@@ -692,14 +709,16 @@ static IIO_CONST_ATTR(in_accel_scale_available,
 					  "0.000598 0.001196 0.002392 0.004785");
 static IIO_DEV_ATTR_SAMP_FREQ(S_IRUGO | S_IWUSR, inv_fifo_rate_show,
 	inv_mpu6050_fifo_rate_store);
+
+/* Deprecated: kept for userspace backward compatibility. */
 static IIO_DEVICE_ATTR(in_gyro_matrix, S_IRUGO, inv_attr_show, NULL,
 	ATTR_GYRO_MATRIX);
 static IIO_DEVICE_ATTR(in_accel_matrix, S_IRUGO, inv_attr_show, NULL,
 	ATTR_ACCL_MATRIX);
 
 static struct attribute *inv_attributes[] = {
-	&iio_dev_attr_in_gyro_matrix.dev_attr.attr,
-	&iio_dev_attr_in_accel_matrix.dev_attr.attr,
+	&iio_dev_attr_in_gyro_matrix.dev_attr.attr,  /* deprecated */
+	&iio_dev_attr_in_accel_matrix.dev_attr.attr, /* deprecated */
 	&iio_dev_attr_sampling_frequency.dev_attr.attr,
 	&iio_const_attr_sampling_frequency_available.dev_attr.attr,
 	&iio_const_attr_in_accel_scale_available.dev_attr.attr,
@@ -779,9 +798,20 @@ int inv_mpu_core_probe(struct regmap *regmap, int irq, const char *name,
 	st->powerup_count = 0;
 	st->irq = irq;
 	st->map = regmap;
+
 	pdata = dev_get_platdata(dev);
-	if (pdata)
+	if (!pdata) {
+		result = of_iio_read_mount_matrix(dev, "mount-matrix",
+						  &st->orientation);
+		if (result) {
+			dev_err(dev, "Failed to retrieve mounting matrix %d\n",
+				result);
+			return result;
+		}
+	} else {
 		st->plat_data = *pdata;
+	}
+
 	/* power is turned on inside check chip type*/
 	result = inv_check_and_setup_chip(st);
 	if (result)
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
index e302a49703bf..52d60cdc9f16 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
@@ -114,7 +114,8 @@ struct inv_mpu6050_hw {
  *  @hw:		Other hardware-specific information.
  *  @chip_type:		chip type.
  *  @time_stamp_lock:	spin lock to time stamp.
- *  @plat_data:		platform data.
+ *  @plat_data:		platform data (deprecated in favor of @orientation).
+ *  @orientation:	sensor chip orientation relative to main hardware.
  *  @timestamps:        kfifo queue to store time stamp.
  *  @map		regmap pointer.
  *  @irq		interrupt number.
@@ -131,6 +132,7 @@ struct inv_mpu6050_state {
 	struct i2c_client *mux_client;
 	unsigned int powerup_count;
 	struct inv_mpu6050_platform_data plat_data;
+	struct iio_mount_matrix orientation;
 	DECLARE_KFIFO(timestamps, long long, TIMESTAMP_FIFO_SIZE);
 	struct regmap *map;
 	int irq;
diff --git a/include/linux/platform_data/invensense_mpu6050.h b/include/linux/platform_data/invensense_mpu6050.h
index ad3aa7b95f35..554b59801aa8 100644
--- a/include/linux/platform_data/invensense_mpu6050.h
+++ b/include/linux/platform_data/invensense_mpu6050.h
@@ -16,13 +16,16 @@
 
 /**
  * struct inv_mpu6050_platform_data - Platform data for the mpu driver
- * @orientation:	Orientation matrix of the chip
+ * @orientation:	Orientation matrix of the chip (deprecated in favor of
+ *			mounting matrix retrieved from device-tree)
  *
  * Contains platform specific information on how to configure the MPU6050 to
  * work on this platform.  The orientation matricies are 3x3 rotation matricies
  * that are applied to the data to rotate from the mounting orientation to the
  * platform orientation.  The values must be one of 0, 1, or -1 and each row and
  * column should have exactly 1 non-zero value.
+ *
+ * Deprecated in favor of mounting matrix retrieved from device-tree.
  */
 struct inv_mpu6050_platform_data {
 	__s8 orientation[9];
-- 
cgit v1.2.3


From e9bbe898cbe89b17ad3993c136aa13d0431cd537 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 22 Apr 2016 17:31:19 +0200
Subject: libnl: nla_put_net64(): align on a 64-bit area

nla_data() is now aligned on a 64-bit area.

The temporary function nla_put_be64_32bit() is removed in this patch.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/ipset/ip_set.h      |  9 ++++++---
 include/net/netlink.h                       | 14 ++++++--------
 include/uapi/linux/netfilter/ipset/ip_set.h |  1 +
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f48b8a664b0f..83b9a2e0d8d4 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -351,7 +351,8 @@ ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
 		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
 			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask))) ||
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
 	       (skbinfo->skbprio &&
 		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
 			      cpu_to_be32(skbinfo->skbprio))) ||
@@ -374,9 +375,11 @@ static inline bool
 ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter)
 {
 	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter))) ||
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
 	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)));
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
 }
 
 static inline void
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 47d7d1356fa3..066a921e7cbe 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -868,20 +868,18 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
 	return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr);
 }
 
-static inline int nla_put_be64_32bit(struct sk_buff *skb, int attrtype,
-				     __be64 value)
-{
-	return nla_put(skb, attrtype, sizeof(__be64), &value);
-}
 /**
- * nla_put_net64 - Add 64-bit network byte order netlink attribute to a socket buffer
+ * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
  * @skb: socket buffer to add attribute to
  * @attrtype: attribute type
  * @value: numeric value
+ * @padattr: attribute type for the padding
  */
-static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value)
+static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
+				int padattr)
 {
-	return nla_put_be64_32bit(skb, attrtype | NLA_F_NET_BYTEORDER, value);
+	return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value,
+			    padattr);
 }
 
 /**
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 63b2e34f1b60..ebb5154976de 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -118,6 +118,7 @@ enum {
 	IPSET_ATTR_SKBMARK,
 	IPSET_ATTR_SKBPRIO,
 	IPSET_ATTR_SKBQUEUE,
+	IPSET_ATTR_PAD,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
-- 
cgit v1.2.3


From 41617e1a8dec9fe082ba5dec26bacb154eb55482 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 24 Apr 2016 00:56:07 -0400
Subject: jbd2: add support for avoiding data writes during transaction commits

Currently when filesystem needs to make sure data is on permanent
storage before committing a transaction it adds inode to transaction's
inode list. During transaction commit, jbd2 writes back all dirty
buffers that have allocated underlying blocks and waits for the IO to
finish. However when doing writeback for delayed allocated data, we
allocate blocks and immediately submit the data. Thus asking jbd2 to
write dirty pages just unnecessarily adds more work to jbd2 possibly
writing back other redirtied blocks.

Add support to jbd2 to allow filesystem to ask jbd2 to only wait for
outstanding data writes before committing a transaction and thus avoid
unnecessary writes.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.h   |  3 ++-
 fs/jbd2/commit.c      |  4 ++++
 fs/jbd2/journal.c     |  3 ++-
 fs/jbd2/transaction.c | 22 ++++++++++++++++++----
 fs/ocfs2/journal.h    |  2 +-
 include/linux/jbd2.h  | 13 +++++++++++--
 6 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5f5846211095..f1c940b38b30 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -362,7 +362,8 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+		return jbd2_journal_inode_add_write(handle,
+						    EXT4_I(inode)->jinode);
 	return 0;
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2ad98d6e19f4..70078096117d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		if (!(jinode->i_flags & JI_WRITE_DATA))
+			continue;
 		mapping = jinode->i_vfs_inode->i_mapping;
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		if (!(jinode->i_flags & JI_WAIT_DATA))
+			continue;
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 435f0b26ac20..b31852f76f46 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 67c103867bf8..be56c8ca34c2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 /*
  * File inode in the inode list of the handle's transaction
  */
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+				   unsigned long flags)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
@@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 	 * and if jinode->i_next_transaction == transaction, commit code
 	 * will only file the inode where we want it.
 	 */
-	if (jinode->i_transaction == transaction ||
-	    jinode->i_next_transaction == transaction)
+	if ((jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction) &&
+	    (jinode->i_flags & flags) == flags)
 		return 0;
 
 	spin_lock(&journal->j_list_lock);
-
+	jinode->i_flags |= flags;
+	/* Is inode already attached where we need it? */
 	if (jinode->i_transaction == transaction ||
 	    jinode->i_next_transaction == transaction)
 		goto done;
@@ -2523,6 +2526,17 @@ done:
 	return 0;
 }
 
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+	return jbd2_journal_file_inode(handle, jinode,
+				       JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
 /*
  * File truncate and transaction commit interact with each other in a
  * non-trivial way.  If a transaction writing data block A is
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index f4cd3c3e9fb7..497a4171ef61 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 
 static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+	return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fd1083c46c61..39511484ad10 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 
 /* Flags in jbd_inode->i_flags */
 #define __JI_COMMIT_RUNNING 0
-/* Commit of the inode data in progress. We use this flag to protect us from
+#define __JI_WRITE_DATA 1
+#define __JI_WAIT_DATA 2
+
+/*
+ * Commit of the inode data in progress. We use this flag to protect us from
  * concurrent deletion of inode. We cannot use reference to inode for this
  * since we cannot afford doing last iput() on behalf of kjournald
  */
 #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+/* Write allocated dirty buffers in this inode before commit */
+#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
+/* Wait for outstanding data writes for this inode before commit */
+#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
 
 /**
  * struct jbd_inode is the structure linking inodes in ordered mode
@@ -1270,7 +1278,8 @@ extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
 extern int	   jbd2_journal_force_commit_nested(journal_t *);
-extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
 				struct jbd2_inode *inode, loff_t new_size);
 extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
-- 
cgit v1.2.3


From 7ba2f2757d84eae533679306f03c93c118437a87 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 23 Apr 2016 22:47:08 +0200
Subject: spi: core: add hook flash_read_supported to spi_master

If hook spi_flash_read is implemented the fast flash read feature
is enabled for all devices attached to the respective master.

In most cases there is just one flash chip, however there are also
devices with more than one flash chip, namely some WiFi routers.
Then the fast flash read feature can be used for the first chip only.
OpenWRT implemented an own handling of this case, using controller_data
element of spi_device to hold the information whether fast flash read
can be used for a device.

This patch adds hook flash_read_supported to spi_master which is
used to extend spi_flash_read_supported() by checking whether the
fast flash read feature can be used for a specific spi_device.

If the hook is not implemented the default behavior is to allow
fast flash read for all devices (if spi_flash_read is implemented).

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 857a9a1d82b5..1f03483f61e5 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -372,6 +372,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  * @unprepare_message: undo any work done by prepare_message().
  * @spi_flash_read: to support spi-controller hardwares that provide
  *                  accelerated interface to read from flash devices.
+ * @flash_read_supported: spi device supports flash read
  * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
  *	number. Any individual value may be -ENOENT for CS lines that
  *	are not GPIOs (driven by the SPI controller itself).
@@ -529,6 +530,7 @@ struct spi_master {
 				 struct spi_message *message);
 	int (*spi_flash_read)(struct  spi_device *spi,
 			      struct spi_flash_read_message *msg);
+	bool (*flash_read_supported)(struct spi_device *spi);
 
 	/*
 	 * These hooks are for drivers that use a generic implementation
@@ -1158,7 +1160,9 @@ struct spi_flash_read_message {
 /* SPI core interface for flash read support */
 static inline bool spi_flash_read_supported(struct spi_device *spi)
 {
-	return spi->master->spi_flash_read ? true : false;
+	return spi->master->spi_flash_read &&
+	       (!spi->master->flash_read_supported ||
+	       spi->master->flash_read_supported(spi));
 }
 
 int spi_flash_read(struct spi_device *spi,
-- 
cgit v1.2.3


From a558da0916b90c330940a106105d0a6a67cb77f7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 25 Apr 2016 10:25:20 +0200
Subject: ieee802154: use nla_put_u64_64bit()

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/nl802154.h  |  2 ++
 net/ieee802154/nl-mac.c   | 17 +++++++++++------
 net/ieee802154/nl802154.c |  3 ++-
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index 167342c2ce6b..0f6f6607f592 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -92,6 +92,8 @@ enum {
 	IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
 	IEEE802154_ATTR_LLSEC_DEV_KEY_MODE,
 
+	IEEE802154_ATTR_PAD,
+
 	__IEEE802154_ATTR_MAX,
 };
 
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 3503c38954f9..d3cbb3258718 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -34,9 +34,11 @@
 
 #include "ieee802154.h"
 
-static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr)
+static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr,
+			  int padattr)
 {
-	return nla_put_u64(msg, type, swab64((__force u64)hwaddr));
+	return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr),
+				 padattr);
 }
 
 static __le64 nla_get_hwaddr(const struct nlattr *nla)
@@ -623,7 +625,8 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
 
 		if (desc->device_addr.mode == IEEE802154_ADDR_LONG &&
 		    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR,
-				   desc->device_addr.extended_addr))
+				   desc->device_addr.extended_addr,
+				   IEEE802154_ATTR_PAD))
 			return -EMSGSIZE;
 	}
 
@@ -638,7 +641,7 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg,
 
 	if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
 	    nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED,
-			   desc->extended_source))
+			   desc->extended_source, IEEE802154_ATTR_PAD))
 		return -EMSGSIZE;
 
 	return 0;
@@ -1063,7 +1066,8 @@ ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq,
 	    nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) ||
 	    nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
 			      desc->short_addr) ||
-	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) ||
+	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr,
+			   IEEE802154_ATTR_PAD) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
 			desc->frame_counter) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
@@ -1167,7 +1171,8 @@ ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq,
 
 	if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
-	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) ||
+	    nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr,
+			   IEEE802154_ATTR_PAD) ||
 	    nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
 			devkey->frame_counter) ||
 	    ieee802154_llsec_fill_key_id(msg, &devkey->key_id))
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 614072064d03..8035c93dd527 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -813,7 +813,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 
 	if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
 	    nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) ||
-	    nla_put_u64(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev)) ||
+	    nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+			      wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) ||
 	    nla_put_u32(msg, NL802154_ATTR_GENERATION,
 			rdev->devlist_generation ^
 			(cfg802154_rdev_list_generation << 2)))
-- 
cgit v1.2.3


From d4967cf38fbd62467b8fb5cab63d7da1f5907ed7 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Fri, 22 Apr 2016 08:41:01 +0300
Subject: qed*: Align statistics names

There's a difference in statsitics' names starting at qed and
propagating to qede, where egress counters indicate ranges while ingress
counters indiciate high-end.
Align all statistcs to follow the same conventions - name indicates range.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede.h         | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 20 ++++++++---------
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 29 ++++++++++++++++---------
 include/linux/qed/qed_if.h                      | 20 ++++++++---------
 5 files changed, 59 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index fb5f3b815340..31e1d510a991 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1415,16 +1415,16 @@ static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn,
 			sizeof(port_stats));
 
 	p_stats->rx_64_byte_packets		+= port_stats.pmm.r64;
-	p_stats->rx_127_byte_packets		+= port_stats.pmm.r127;
-	p_stats->rx_255_byte_packets		+= port_stats.pmm.r255;
-	p_stats->rx_511_byte_packets		+= port_stats.pmm.r511;
-	p_stats->rx_1023_byte_packets		+= port_stats.pmm.r1023;
-	p_stats->rx_1518_byte_packets		+= port_stats.pmm.r1518;
-	p_stats->rx_1522_byte_packets		+= port_stats.pmm.r1522;
-	p_stats->rx_2047_byte_packets		+= port_stats.pmm.r2047;
-	p_stats->rx_4095_byte_packets		+= port_stats.pmm.r4095;
-	p_stats->rx_9216_byte_packets		+= port_stats.pmm.r9216;
-	p_stats->rx_16383_byte_packets		+= port_stats.pmm.r16383;
+	p_stats->rx_65_to_127_byte_packets	+= port_stats.pmm.r127;
+	p_stats->rx_128_to_255_byte_packets	+= port_stats.pmm.r255;
+	p_stats->rx_256_to_511_byte_packets	+= port_stats.pmm.r511;
+	p_stats->rx_512_to_1023_byte_packets	+= port_stats.pmm.r1023;
+	p_stats->rx_1024_to_1518_byte_packets	+= port_stats.pmm.r1518;
+	p_stats->rx_1519_to_1522_byte_packets	+= port_stats.pmm.r1522;
+	p_stats->rx_1519_to_2047_byte_packets	+= port_stats.pmm.r2047;
+	p_stats->rx_2048_to_4095_byte_packets	+= port_stats.pmm.r4095;
+	p_stats->rx_4096_to_9216_byte_packets	+= port_stats.pmm.r9216;
+	p_stats->rx_9217_to_16383_byte_packets	+= port_stats.pmm.r16383;
 	p_stats->rx_crc_errors			+= port_stats.pmm.rfcs;
 	p_stats->rx_mac_crtl_frames		+= port_stats.pmm.rxcf;
 	p_stats->rx_pause_frames		+= port_stats.pmm.rxpf;
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 16df1591388f..a687e7a1dc8d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -59,16 +59,16 @@ struct qede_stats {
 
 	/* port */
 	u64 rx_64_byte_packets;
-	u64 rx_127_byte_packets;
-	u64 rx_255_byte_packets;
-	u64 rx_511_byte_packets;
-	u64 rx_1023_byte_packets;
-	u64 rx_1518_byte_packets;
-	u64 rx_1522_byte_packets;
-	u64 rx_2047_byte_packets;
-	u64 rx_4095_byte_packets;
-	u64 rx_9216_byte_packets;
-	u64 rx_16383_byte_packets;
+	u64 rx_65_to_127_byte_packets;
+	u64 rx_128_to_255_byte_packets;
+	u64 rx_256_to_511_byte_packets;
+	u64 rx_512_to_1023_byte_packets;
+	u64 rx_1024_to_1518_byte_packets;
+	u64 rx_1519_to_1522_byte_packets;
+	u64 rx_1519_to_2047_byte_packets;
+	u64 rx_2048_to_4095_byte_packets;
+	u64 rx_4096_to_9216_byte_packets;
+	u64 rx_9217_to_16383_byte_packets;
 	u64 rx_crc_errors;
 	u64 rx_mac_crtl_frames;
 	u64 rx_pause_frames;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index f0982f163670..f87e83b41d5d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -59,16 +59,16 @@ static const struct {
 	QEDE_STAT(tx_bcast_pkts),
 
 	QEDE_PF_STAT(rx_64_byte_packets),
-	QEDE_PF_STAT(rx_127_byte_packets),
-	QEDE_PF_STAT(rx_255_byte_packets),
-	QEDE_PF_STAT(rx_511_byte_packets),
-	QEDE_PF_STAT(rx_1023_byte_packets),
-	QEDE_PF_STAT(rx_1518_byte_packets),
-	QEDE_PF_STAT(rx_1522_byte_packets),
-	QEDE_PF_STAT(rx_2047_byte_packets),
-	QEDE_PF_STAT(rx_4095_byte_packets),
-	QEDE_PF_STAT(rx_9216_byte_packets),
-	QEDE_PF_STAT(rx_16383_byte_packets),
+	QEDE_PF_STAT(rx_65_to_127_byte_packets),
+	QEDE_PF_STAT(rx_128_to_255_byte_packets),
+	QEDE_PF_STAT(rx_256_to_511_byte_packets),
+	QEDE_PF_STAT(rx_512_to_1023_byte_packets),
+	QEDE_PF_STAT(rx_1024_to_1518_byte_packets),
+	QEDE_PF_STAT(rx_1519_to_1522_byte_packets),
+	QEDE_PF_STAT(rx_1519_to_2047_byte_packets),
+	QEDE_PF_STAT(rx_2048_to_4095_byte_packets),
+	QEDE_PF_STAT(rx_4096_to_9216_byte_packets),
+	QEDE_PF_STAT(rx_9217_to_16383_byte_packets),
 	QEDE_PF_STAT(tx_64_byte_packets),
 	QEDE_PF_STAT(tx_65_to_127_byte_packets),
 	QEDE_PF_STAT(tx_128_to_255_byte_packets),
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 197ef85684da..1e3ee49bae24 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1638,16 +1638,25 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	edev->stats.coalesced_bytes = stats.tpa_coalesced_bytes;
 
 	edev->stats.rx_64_byte_packets = stats.rx_64_byte_packets;
-	edev->stats.rx_127_byte_packets = stats.rx_127_byte_packets;
-	edev->stats.rx_255_byte_packets = stats.rx_255_byte_packets;
-	edev->stats.rx_511_byte_packets = stats.rx_511_byte_packets;
-	edev->stats.rx_1023_byte_packets = stats.rx_1023_byte_packets;
-	edev->stats.rx_1518_byte_packets = stats.rx_1518_byte_packets;
-	edev->stats.rx_1522_byte_packets = stats.rx_1522_byte_packets;
-	edev->stats.rx_2047_byte_packets = stats.rx_2047_byte_packets;
-	edev->stats.rx_4095_byte_packets = stats.rx_4095_byte_packets;
-	edev->stats.rx_9216_byte_packets = stats.rx_9216_byte_packets;
-	edev->stats.rx_16383_byte_packets = stats.rx_16383_byte_packets;
+	edev->stats.rx_65_to_127_byte_packets = stats.rx_65_to_127_byte_packets;
+	edev->stats.rx_128_to_255_byte_packets =
+				stats.rx_128_to_255_byte_packets;
+	edev->stats.rx_256_to_511_byte_packets =
+				stats.rx_256_to_511_byte_packets;
+	edev->stats.rx_512_to_1023_byte_packets =
+				stats.rx_512_to_1023_byte_packets;
+	edev->stats.rx_1024_to_1518_byte_packets =
+				stats.rx_1024_to_1518_byte_packets;
+	edev->stats.rx_1519_to_1522_byte_packets =
+				stats.rx_1519_to_1522_byte_packets;
+	edev->stats.rx_1519_to_2047_byte_packets =
+				stats.rx_1519_to_2047_byte_packets;
+	edev->stats.rx_2048_to_4095_byte_packets =
+				stats.rx_2048_to_4095_byte_packets;
+	edev->stats.rx_4096_to_9216_byte_packets =
+				stats.rx_4096_to_9216_byte_packets;
+	edev->stats.rx_9217_to_16383_byte_packets =
+				stats.rx_9217_to_16383_byte_packets;
 	edev->stats.rx_crc_errors = stats.rx_crc_errors;
 	edev->stats.rx_mac_crtl_frames = stats.rx_mac_crtl_frames;
 	edev->stats.rx_pause_frames = stats.rx_pause_frames;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 67e8c206b2c1..82a7fe011068 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -384,16 +384,16 @@ struct qed_eth_stats {
 
 	/* port */
 	u64	rx_64_byte_packets;
-	u64	rx_127_byte_packets;
-	u64	rx_255_byte_packets;
-	u64	rx_511_byte_packets;
-	u64	rx_1023_byte_packets;
-	u64	rx_1518_byte_packets;
-	u64	rx_1522_byte_packets;
-	u64	rx_2047_byte_packets;
-	u64	rx_4095_byte_packets;
-	u64	rx_9216_byte_packets;
-	u64	rx_16383_byte_packets;
+	u64	rx_65_to_127_byte_packets;
+	u64	rx_128_to_255_byte_packets;
+	u64	rx_256_to_511_byte_packets;
+	u64	rx_512_to_1023_byte_packets;
+	u64	rx_1024_to_1518_byte_packets;
+	u64	rx_1519_to_1522_byte_packets;
+	u64	rx_1519_to_2047_byte_packets;
+	u64	rx_2048_to_4095_byte_packets;
+	u64	rx_4096_to_9216_byte_packets;
+	u64	rx_9217_to_16383_byte_packets;
 	u64	rx_crc_errors;
 	u64	rx_mac_crtl_frames;
 	u64	rx_pause_frames;
-- 
cgit v1.2.3


From fe7cd2bfdac4d8739bc8665eef040e668e6b428f Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Fri, 22 Apr 2016 08:41:03 +0300
Subject: qed*: Conditions for changing link

There's some inconsistency in current logic determining whether the
link settings of a given interface can be changed; I.e., in all modes
other than the so-called `deault' mode the interfaces are forbidden from
changing the configuration - but even this rule is not applied to all
user APIs that may change the configuration.

Instead, let the core-module [qed] decide whether an interface can change
the configuration by supporting a new API function. We also revise the
current rule, allowing all interfaces to change their configurations while
laying the infrastructure for future modes where an interface would be
blocked from making such a configuration.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c      |  6 ++++++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 14 ++++++++++----
 include/linux/qed/qed_if.h                      | 10 ++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1e9f321f1ac4..d189871e8e23 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -915,6 +915,11 @@ static u32 qed_sb_release(struct qed_dev *cdev,
 	return rc;
 }
 
+static bool qed_can_link_change(struct qed_dev *cdev)
+{
+	return true;
+}
+
 static int qed_set_link(struct qed_dev *cdev,
 			struct qed_link_params *params)
 {
@@ -1177,6 +1182,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.sb_release = &qed_sb_release,
 	.simd_handler_config = &qed_simd_handler_config,
 	.simd_handler_clean = &qed_simd_handler_clean,
+	.can_link_change = &qed_can_link_change,
 	.set_link = &qed_set_link,
 	.get_link = &qed_get_current_link,
 	.drain = &qed_drain,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 2ac98d44c1e1..f1dd25ac5552 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -239,9 +239,9 @@ static int qede_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	struct qed_link_params params;
 	u32 speed;
 
-	if (!edev->dev_info.common.is_mf_default) {
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
 		DP_INFO(edev,
-			"Link parameters can not be changed in non-default mode\n");
+			"Link settings are not allowed to be changed\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -350,6 +350,12 @@ static int qede_nway_reset(struct net_device *dev)
 	struct qed_link_output current_link;
 	struct qed_link_params link_params;
 
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
+		DP_INFO(edev,
+			"Link settings are not allowed to be changed\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (!netif_running(dev))
 		return 0;
 
@@ -450,9 +456,9 @@ static int qede_set_pauseparam(struct net_device *dev,
 	struct qed_link_params params;
 	struct qed_link_output current_link;
 
-	if (!edev->dev_info.common.is_mf_default) {
+	if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) {
 		DP_INFO(edev,
-			"Pause parameters can not be updated in non-default mode\n");
+			"Pause settings are not allowed to be changed\n");
 		return -EOPNOTSUPP;
 	}
 
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 82a7fe011068..e5de42b62976 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -211,6 +211,16 @@ struct qed_common_ops {
 
 	void		(*simd_handler_clean)(struct qed_dev *cdev,
 					      int index);
+
+/**
+ * @brief can_link_change - can the instance change the link or not
+ *
+ * @param cdev
+ *
+ * @return true if link-change is allowed, false otherwise.
+ */
+	bool (*can_link_change)(struct qed_dev *cdev);
+
 /**
  * @brief set_link - set links according to params
  *
-- 
cgit v1.2.3


From 6fa01ccd883021105e9f8af7d04b9f156fa3494a Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 22 Apr 2016 18:36:35 -0700
Subject: skbuff: Add pskb_extract() helper function

A pattern of skb usage seen in modules such as RDS-TCP is to
extract `to_copy' bytes from the received TCP segment, starting
at some offset `off' into a new skb `clone'. This is done in
the ->data_ready callback, where the clone skb is queued up for rx on
the PF_RDS socket, while the parent TCP segment is returned unchanged
back to the TCP engine.

The existing code uses the sequence
	clone = skb_clone(..);
	pskb_pull(clone, off, ..);
	pskb_trim(clone, to_copy, ..);
with the intention of discarding the first `off' bytes. However,
skb_clone() + pskb_pull() implies pksb_expand_head(), which ends
up doing a redundant memcpy of bytes that will then get discarded
in __pskb_pull_tail().

To avoid this inefficiency, this commit adds pskb_extract() that
creates the clone, and memcpy's only the relevant header/frag/frag_list
to the start of `clone'. pskb_trim() is then invoked to trim clone
down to the requested to_copy bytes.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |   2 +
 net/core/skbuff.c      | 242 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index da0ace389fec..a1ce63979ad8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
+			     gfp_t gfp);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ff7788b0151..7a1d48983f81 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4622,3 +4622,245 @@ failure:
 	return NULL;
 }
 EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+				    const int headlen, gfp_t gfp_mask)
+{
+	int i;
+	int size = skb_end_offset(skb);
+	int new_hlen = headlen - off;
+	u8 *data;
+	int doff = 0;
+
+	size = SKB_DATA_ALIGN(size);
+
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size +
+			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
+	if (!data)
+		return -ENOMEM;
+
+	size = SKB_WITH_OVERHEAD(ksize(data));
+
+	/* Copy real data, and all frags */
+	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+	skb->len -= off;
+
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb),
+	       offsetof(struct skb_shared_info,
+			frags[skb_shinfo(skb)->nr_frags]));
+	if (skb_cloned(skb)) {
+		/* drop the old head gracefully */
+		if (skb_orphan_frags(skb, gfp_mask)) {
+			kfree(data);
+			return -ENOMEM;
+		}
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			skb_frag_ref(skb, i);
+		if (skb_has_frag_list(skb))
+			skb_clone_fraglist(skb);
+		skb_release_data(skb);
+	} else {
+		/* we can reuse existing recount- all we did was
+		 * relocate values
+		 */
+		skb_free_head(skb);
+	}
+
+	doff = (data - skb->head);
+	skb->head = data;
+	skb->data = data;
+	skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->end = size;
+	doff = 0;
+#else
+	skb->end = skb->head + size;
+#endif
+	skb_set_tail_pointer(skb, skb_headlen(skb));
+	skb_headers_offset_update(skb, 0);
+	skb->cloned = 0;
+	skb->hdr_len = 0;
+	skb->nohdr = 0;
+	atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+	return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+				struct skb_shared_info *shinfo, int eat,
+				gfp_t gfp_mask)
+{
+	struct sk_buff *list = shinfo->frag_list;
+	struct sk_buff *clone = NULL;
+	struct sk_buff *insp = NULL;
+
+	do {
+		if (!list) {
+			pr_err("Not enough bytes to eat. Want %d\n", eat);
+			return -EFAULT;
+		}
+		if (list->len <= eat) {
+			/* Eaten as whole. */
+			eat -= list->len;
+			list = list->next;
+			insp = list;
+		} else {
+			/* Eaten partially. */
+			if (skb_shared(list)) {
+				clone = skb_clone(list, gfp_mask);
+				if (!clone)
+					return -ENOMEM;
+				insp = list->next;
+				list = clone;
+			} else {
+				/* This may be pulled without problems. */
+				insp = list;
+			}
+			if (pskb_carve(list, eat, gfp_mask) < 0) {
+				kfree_skb(clone);
+				return -ENOMEM;
+			}
+			break;
+		}
+	} while (eat);
+
+	/* Free pulled out fragments. */
+	while ((list = shinfo->frag_list) != insp) {
+		shinfo->frag_list = list->next;
+		kfree_skb(list);
+	}
+	/* And insert new clone at head. */
+	if (clone) {
+		clone->next = list;
+		shinfo->frag_list = clone;
+	}
+	return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+				       int pos, gfp_t gfp_mask)
+{
+	int i, k = 0;
+	int size = skb_end_offset(skb);
+	u8 *data;
+	const int nfrags = skb_shinfo(skb)->nr_frags;
+	struct skb_shared_info *shinfo;
+	int doff = 0;
+
+	size = SKB_DATA_ALIGN(size);
+
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size +
+			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
+	if (!data)
+		return -ENOMEM;
+
+	size = SKB_WITH_OVERHEAD(ksize(data));
+
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb), offsetof(struct skb_shared_info,
+					 frags[skb_shinfo(skb)->nr_frags]));
+	if (skb_orphan_frags(skb, gfp_mask)) {
+		kfree(data);
+		return -ENOMEM;
+	}
+	shinfo = (struct skb_shared_info *)(data + size);
+	for (i = 0; i < nfrags; i++) {
+		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+		if (pos + fsize > off) {
+			shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+			if (pos < off) {
+				/* Split frag.
+				 * We have two variants in this case:
+				 * 1. Move all the frag to the second
+				 *    part, if it is possible. F.e.
+				 *    this approach is mandatory for TUX,
+				 *    where splitting is expensive.
+				 * 2. Split is accurately. We make this.
+				 */
+				shinfo->frags[0].page_offset += off - pos;
+				skb_frag_size_sub(&shinfo->frags[0], off - pos);
+			}
+			skb_frag_ref(skb, i);
+			k++;
+		}
+		pos += fsize;
+	}
+	shinfo->nr_frags = k;
+	if (skb_has_frag_list(skb))
+		skb_clone_fraglist(skb);
+
+	if (k == 0) {
+		/* split line is in frag list */
+		pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+	}
+	skb_release_data(skb);
+
+	doff = (data - skb->head);
+	skb->head = data;
+	skb->head_frag = 0;
+	skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->end = size;
+	doff = 0;
+#else
+	skb->end = skb->head + size;
+#endif
+	skb_reset_tail_pointer(skb);
+	skb_headers_offset_update(skb, 0);
+	skb->cloned   = 0;
+	skb->hdr_len  = 0;
+	skb->nohdr    = 0;
+	skb->len -= off;
+	skb->data_len = skb->len;
+	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+	int headlen = skb_headlen(skb);
+
+	if (len < headlen)
+		return pskb_carve_inside_header(skb, len, headlen, gfp);
+	else
+		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+			     int to_copy, gfp_t gfp)
+{
+	struct sk_buff  *clone = skb_clone(skb, gfp);
+
+	if (!clone)
+		return NULL;
+
+	if (pskb_carve(clone, off, gfp) < 0 ||
+	    pskb_trim(clone, to_copy)) {
+		kfree_skb(clone);
+		return NULL;
+	}
+	return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
-- 
cgit v1.2.3


From 21e8868e661ededd2c45c8ec27f6fd5354b88b71 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@marvell.com>
Date: Wed, 20 Apr 2016 11:20:28 +0100
Subject: drivers: firmware: psci: make two helper functions static

psci_power_state_loses_context() and psci_power_state_is_valid are only
used internally now, so make them static.

Signed-off-by: Jisheng Zhang <jszhang@marvell.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/firmware/psci.c | 4 ++--
 include/linux/psci.h    | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
index 0ab477ba6564..04f2ac54e5ab 100644
--- a/drivers/firmware/psci.c
+++ b/drivers/firmware/psci.c
@@ -91,7 +91,7 @@ static inline bool psci_has_ext_power_state(void)
 				PSCI_1_0_FEATURES_CPU_SUSPEND_PF_MASK;
 }
 
-bool psci_power_state_loses_context(u32 state)
+static bool psci_power_state_loses_context(u32 state)
 {
 	const u32 mask = psci_has_ext_power_state() ?
 					PSCI_1_0_EXT_POWER_STATE_TYPE_MASK :
@@ -100,7 +100,7 @@ bool psci_power_state_loses_context(u32 state)
 	return state & mask;
 }
 
-bool psci_power_state_is_valid(u32 state)
+static bool psci_power_state_is_valid(u32 state)
 {
 	const u32 valid_mask = psci_has_ext_power_state() ?
 			       PSCI_1_0_EXT_POWER_STATE_MASK :
diff --git a/include/linux/psci.h b/include/linux/psci.h
index 393efe2edf9a..bdea1cb5e1db 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -21,8 +21,6 @@
 #define PSCI_POWER_STATE_TYPE_POWER_DOWN	1
 
 bool psci_tos_resident_on(int cpu);
-bool psci_power_state_loses_context(u32 state);
-bool psci_power_state_is_valid(u32 state);
 
 int psci_cpu_init_idle(unsigned int cpu);
 int psci_cpu_suspend_enter(unsigned long index);
-- 
cgit v1.2.3


From 1fcbcc333f1fae6e11cc0839a6e72bc1a3e830bf Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 25 Apr 2016 16:50:14 -0700
Subject: block: copy NOMERGE flag from bio to request

bio might have NOMERGE flag set, for example blk_queue_split sets it.
When we initiate request, copy this flag too.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 86a38ea1823f..77e5d81f07aa 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -208,7 +208,7 @@ enum rq_flag_bits {
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
 	 REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
-	 REQ_SECURE | REQ_INTEGRITY)
+	 REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 #define BIO_NO_ADVANCE_ITER_MASK	(REQ_DISCARD|REQ_WRITE_SAME)
-- 
cgit v1.2.3


From 4899f78a3dccda41ffdaa1a2cbf78209753e0f70 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 6 Apr 2016 12:37:37 -0500
Subject: mailbox/omap: drop legacy platform device support

OMAP mailbox devices can no longer be created in legacy non-DT
mode, all the relevant code has been cleaned up. The OMAP mailbox
driver will only support devices created from DT going forward,
so drop the legacy platform device support from the driver.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/omap-mailbox.c             | 101 ++++++++++++-----------------
 include/linux/platform_data/mailbox-omap.h |  58 -----------------
 2 files changed, 41 insertions(+), 118 deletions(-)
 delete mode 100644 include/linux/platform_data/mailbox-omap.h

(limited to 'include/linux')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index b7f636f15cac..8754d810ef05 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -2,7 +2,7 @@
  * OMAP mailbox driver
  *
  * Copyright (C) 2006-2009 Nokia Corporation. All rights reserved.
- * Copyright (C) 2013-2014 Texas Instruments Inc.
+ * Copyright (C) 2013-2016 Texas Instruments Incorporated - http://www.ti.com
  *
  * Contact: Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
  *          Suman Anna <s-anna@ti.com>
@@ -33,7 +33,6 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
-#include <linux/platform_data/mailbox-omap.h>
 #include <linux/omap-mailbox.h>
 #include <linux/mailbox_controller.h>
 #include <linux/mailbox_client.h>
@@ -69,6 +68,10 @@
 #define MBOX_NR_REGS			(MBOX_REG_SIZE / sizeof(u32))
 #define OMAP4_MBOX_NR_REGS		(OMAP4_MBOX_REG_SIZE / sizeof(u32))
 
+/* Interrupt register configuration types */
+#define MBOX_INTR_CFG_TYPE1		0
+#define MBOX_INTR_CFG_TYPE2		1
+
 struct omap_mbox_fifo {
 	unsigned long msg;
 	unsigned long fifo_stat;
@@ -696,8 +699,6 @@ static int omap_mbox_probe(struct platform_device *pdev)
 	int ret;
 	struct mbox_chan *chnls;
 	struct omap_mbox **list, *mbox, *mboxblk;
-	struct omap_mbox_pdata *pdata = pdev->dev.platform_data;
-	struct omap_mbox_dev_info *info = NULL;
 	struct omap_mbox_fifo_info *finfo, *finfoblk;
 	struct omap_mbox_device *mdev;
 	struct omap_mbox_fifo *fifo;
@@ -710,36 +711,26 @@ static int omap_mbox_probe(struct platform_device *pdev)
 	u32 l;
 	int i;
 
-	if (!node && (!pdata || !pdata->info_cnt || !pdata->info)) {
-		pr_err("%s: platform not supported\n", __func__);
+	if (!node) {
+		pr_err("%s: only DT-based devices are supported\n", __func__);
 		return -ENODEV;
 	}
 
-	if (node) {
-		match = of_match_device(omap_mailbox_of_match, &pdev->dev);
-		if (!match)
-			return -ENODEV;
-		intr_type = (u32)match->data;
+	match = of_match_device(omap_mailbox_of_match, &pdev->dev);
+	if (!match)
+		return -ENODEV;
+	intr_type = (u32)match->data;
 
-		if (of_property_read_u32(node, "ti,mbox-num-users",
-					 &num_users))
-			return -ENODEV;
+	if (of_property_read_u32(node, "ti,mbox-num-users", &num_users))
+		return -ENODEV;
 
-		if (of_property_read_u32(node, "ti,mbox-num-fifos",
-					 &num_fifos))
-			return -ENODEV;
+	if (of_property_read_u32(node, "ti,mbox-num-fifos", &num_fifos))
+		return -ENODEV;
 
-		info_count = of_get_available_child_count(node);
-		if (!info_count) {
-			dev_err(&pdev->dev, "no available mbox devices found\n");
-			return -ENODEV;
-		}
-	} else { /* non-DT device creation */
-		info_count = pdata->info_cnt;
-		info = pdata->info;
-		intr_type = pdata->intr_type;
-		num_users = pdata->num_users;
-		num_fifos = pdata->num_fifos;
+	info_count = of_get_available_child_count(node);
+	if (!info_count) {
+		dev_err(&pdev->dev, "no available mbox devices found\n");
+		return -ENODEV;
 	}
 
 	finfoblk = devm_kzalloc(&pdev->dev, info_count * sizeof(*finfoblk),
@@ -750,38 +741,28 @@ static int omap_mbox_probe(struct platform_device *pdev)
 	finfo = finfoblk;
 	child = NULL;
 	for (i = 0; i < info_count; i++, finfo++) {
-		if (node) {
-			child = of_get_next_available_child(node, child);
-			ret = of_property_read_u32_array(child, "ti,mbox-tx",
-							 tmp, ARRAY_SIZE(tmp));
-			if (ret)
-				return ret;
-			finfo->tx_id = tmp[0];
-			finfo->tx_irq = tmp[1];
-			finfo->tx_usr = tmp[2];
-
-			ret = of_property_read_u32_array(child, "ti,mbox-rx",
-							 tmp, ARRAY_SIZE(tmp));
-			if (ret)
-				return ret;
-			finfo->rx_id = tmp[0];
-			finfo->rx_irq = tmp[1];
-			finfo->rx_usr = tmp[2];
-
-			finfo->name = child->name;
-
-			if (of_find_property(child, "ti,mbox-send-noirq", NULL))
-				finfo->send_no_irq = true;
-		} else {
-			finfo->tx_id = info->tx_id;
-			finfo->rx_id = info->rx_id;
-			finfo->tx_usr = info->usr_id;
-			finfo->tx_irq = info->irq_id;
-			finfo->rx_usr = info->usr_id;
-			finfo->rx_irq = info->irq_id;
-			finfo->name = info->name;
-			info++;
-		}
+		child = of_get_next_available_child(node, child);
+		ret = of_property_read_u32_array(child, "ti,mbox-tx", tmp,
+						 ARRAY_SIZE(tmp));
+		if (ret)
+			return ret;
+		finfo->tx_id = tmp[0];
+		finfo->tx_irq = tmp[1];
+		finfo->tx_usr = tmp[2];
+
+		ret = of_property_read_u32_array(child, "ti,mbox-rx", tmp,
+						 ARRAY_SIZE(tmp));
+		if (ret)
+			return ret;
+		finfo->rx_id = tmp[0];
+		finfo->rx_irq = tmp[1];
+		finfo->rx_usr = tmp[2];
+
+		finfo->name = child->name;
+
+		if (of_find_property(child, "ti,mbox-send-noirq", NULL))
+			finfo->send_no_irq = true;
+
 		if (finfo->tx_id >= num_fifos || finfo->rx_id >= num_fifos ||
 		    finfo->tx_usr >= num_users || finfo->rx_usr >= num_users)
 			return -EINVAL;
diff --git a/include/linux/platform_data/mailbox-omap.h b/include/linux/platform_data/mailbox-omap.h
deleted file mode 100644
index 4631dbb4255e..000000000000
--- a/include/linux/platform_data/mailbox-omap.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * mailbox-omap.h
- *
- * Copyright (C) 2013 Texas Instruments, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef _PLAT_MAILBOX_H
-#define _PLAT_MAILBOX_H
-
-/* Interrupt register configuration types */
-#define MBOX_INTR_CFG_TYPE1	(0)
-#define MBOX_INTR_CFG_TYPE2	(1)
-
-/**
- * struct omap_mbox_dev_info - OMAP mailbox device attribute info
- * @name:	name of the mailbox device
- * @tx_id:	mailbox queue id used for transmitting messages
- * @rx_id:	mailbox queue id on which messages are received
- * @irq_id:	irq identifier number to use from the hwmod data
- * @usr_id:	mailbox user id for identifying the interrupt into
- *			the MPU interrupt controller.
- */
-struct omap_mbox_dev_info {
-	const char *name;
-	u32 tx_id;
-	u32 rx_id;
-	u32 irq_id;
-	u32 usr_id;
-};
-
-/**
- * struct omap_mbox_pdata - OMAP mailbox platform data
- * @intr_type:	type of interrupt configuration registers used
-			while programming mailbox queue interrupts
- * @num_users:	number of users (processor devices) that the mailbox
- *			h/w block can interrupt
- * @num_fifos:	number of h/w fifos within the mailbox h/w block
- * @info_cnt:	number of mailbox devices for the platform
- * @info:	array of mailbox device attributes
- */
-struct omap_mbox_pdata {
-	u32 intr_type;
-	u32 num_users;
-	u32 num_fifos;
-	u32 info_cnt;
-	struct omap_mbox_dev_info *info;
-};
-
-#endif /* _PLAT_MAILBOX_H */
-- 
cgit v1.2.3


From dd28216528cf67339cd4f5854166fbd4eefd7fa8 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 6 Apr 2016 18:37:20 -0500
Subject: mailbox/omap: kill omap_mbox_{save/restore}_ctx() functions

The omap_mbox_save_ctx() and omap_mbox_restore_ctx() API were
previously provided to OMAP mailbox clients to save and restore
the mailbox context during system suspend/resume. The save and
restore functionality is now implemented through System PM driver
callbacks, and there is no need for these functions, so kill these
API.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/omap-mailbox.c | 51 ------------------------------------------
 include/linux/omap-mailbox.h   |  2 --
 2 files changed, 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index d8d3a4bc5262..c5e8b9cb170d 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -55,13 +55,6 @@
 #define MAILBOX_IRQ_NEWMSG(m)		(1 << (2 * (m)))
 #define MAILBOX_IRQ_NOTFULL(m)		(1 << (2 * (m) + 1))
 
-#define MBOX_REG_SIZE			0x120
-
-#define OMAP4_MBOX_REG_SIZE		0x130
-
-#define MBOX_NR_REGS			(MBOX_REG_SIZE / sizeof(u32))
-#define OMAP4_MBOX_NR_REGS		(OMAP4_MBOX_REG_SIZE / sizeof(u32))
-
 /* Interrupt register configuration types */
 #define MBOX_INTR_CFG_TYPE1		0
 #define MBOX_INTR_CFG_TYPE2		1
@@ -118,7 +111,6 @@ struct omap_mbox {
 	struct omap_mbox_device *parent;
 	struct omap_mbox_fifo	tx_fifo;
 	struct omap_mbox_fifo	rx_fifo;
-	u32			ctx[OMAP4_MBOX_NR_REGS];
 	u32			intr_type;
 	struct mbox_chan	*chan;
 	bool			send_no_irq;
@@ -209,49 +201,6 @@ static int is_mbox_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 	return (int)(enable & status & bit);
 }
 
-void omap_mbox_save_ctx(struct mbox_chan *chan)
-{
-	int i;
-	int nr_regs;
-	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
-
-	if (WARN_ON(!mbox))
-		return;
-
-	if (mbox->intr_type)
-		nr_regs = OMAP4_MBOX_NR_REGS;
-	else
-		nr_regs = MBOX_NR_REGS;
-	for (i = 0; i < nr_regs; i++) {
-		mbox->ctx[i] = mbox_read_reg(mbox->parent, i * sizeof(u32));
-
-		dev_dbg(mbox->dev, "%s: [%02x] %08x\n", __func__,
-			i, mbox->ctx[i]);
-	}
-}
-EXPORT_SYMBOL(omap_mbox_save_ctx);
-
-void omap_mbox_restore_ctx(struct mbox_chan *chan)
-{
-	int i;
-	int nr_regs;
-	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
-
-	if (WARN_ON(!mbox))
-		return;
-
-	if (mbox->intr_type)
-		nr_regs = OMAP4_MBOX_NR_REGS;
-	else
-		nr_regs = MBOX_NR_REGS;
-	for (i = 0; i < nr_regs; i++) {
-		mbox_write_reg(mbox->parent, mbox->ctx[i], i * sizeof(u32));
-		dev_dbg(mbox->dev, "%s: [%02x] %08x\n", __func__,
-			i, mbox->ctx[i]);
-	}
-}
-EXPORT_SYMBOL(omap_mbox_restore_ctx);
-
 static void _omap_mbox_enable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 {
 	u32 l;
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index 587bbdd31f5a..c726bd833761 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -21,8 +21,6 @@ struct mbox_client;
 struct mbox_chan *omap_mbox_request_channel(struct mbox_client *cl,
 					    const char *chan_name);
 
-void omap_mbox_save_ctx(struct mbox_chan *chan);
-void omap_mbox_restore_ctx(struct mbox_chan *chan);
 void omap_mbox_enable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq);
 void omap_mbox_disable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq);
 
-- 
cgit v1.2.3


From f0cdf76c103ffa34ca5ac87dcdef7edffc722cbf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 24 Apr 2016 21:38:14 +0200
Subject: net: remove NETDEV_TX_LOCKED support

No more users in the tree, remove NETDEV_TX_LOCKED support.
Adds another hole in softnet_stats struct, but better than keeping
the unused collision counter around.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-features.txt | 10 ++++-----
 Documentation/networking/netdevices.txt      |  9 +++-----
 include/linux/netdevice.h                    |  3 ---
 net/core/net-procfs.c                        |  3 ++-
 net/core/pktgen.c                            |  1 -
 net/sched/sch_generic.c                      | 32 ----------------------------
 6 files changed, 9 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt
index f310edec8a77..7413eb05223b 100644
--- a/Documentation/networking/netdev-features.txt
+++ b/Documentation/networking/netdev-features.txt
@@ -131,13 +131,11 @@ stack. Driver should not change behaviour based on them.
 
  * LLTX driver (deprecated for hardware drivers)
 
-NETIF_F_LLTX should be set in drivers that implement their own locking in
-transmit path or don't need locking at all (e.g. software tunnels).
-In ndo_start_xmit, it is recommended to use a try_lock and return
-NETDEV_TX_LOCKED when the spin lock fails.  The locking should also properly
-protect against other callbacks (the rules you need to find out).
+NETIF_F_LLTX is meant to be used by drivers that don't need locking at all,
+e.g. software tunnels.
 
-Don't use it for new drivers.
+This is also used in a few legacy drivers that implement their
+own locking, don't use it for new (hardware) drivers.
 
  * netns-local device
 
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index 0b1cf6b2a592..7fec2061a334 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -69,10 +69,9 @@ ndo_start_xmit:
 
 	When the driver sets NETIF_F_LLTX in dev->features this will be
 	called without holding netif_tx_lock. In this case the driver
-	has to lock by itself when needed. It is recommended to use a try lock
-	for this and return NETDEV_TX_LOCKED when the spin lock fails.
-	The locking there should also properly protect against 
-	set_rx_mode. Note that the use of NETIF_F_LLTX is deprecated.
+	has to lock by itself when needed.
+	The locking there should also properly protect against
+	set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated.
 	Don't use it for new drivers.
 
 	Context: Process with BHs disabled or BH (timer),
@@ -83,8 +82,6 @@ ndo_start_xmit:
 	o NETDEV_TX_BUSY Cannot transmit packet, try later 
 	  Usually a bug, means queue start/stop flow control is broken in
 	  the driver. Note: the driver must NOT put the skb in its DMA ring.
-	o NETDEV_TX_LOCKED Locking failed, please retry quickly.
-	  Only valid when NETIF_F_LLTX is set.
 
 ndo_tx_timeout:
 	Synchronization: netif_tx_lock spinlock; all TX queues frozen.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1f6d5db471a2..18d8394f2e5d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -106,7 +106,6 @@ enum netdev_tx {
 	__NETDEV_TX_MIN	 = INT_MIN,	/* make sure enum is signed */
 	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
 	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
-	NETDEV_TX_LOCKED = 0x20,	/* driver tx lock was already taken */
 };
 typedef enum netdev_tx netdev_tx_t;
 
@@ -831,7 +830,6 @@ struct tc_to_netdev {
  *	the queue before that can happen; it's for obsolete devices and weird
  *	corner cases, but the stack really does a non-trivial amount
  *	of useless work if you return NETDEV_TX_BUSY.
- *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
  *	Required; cannot be NULL.
  *
  * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
@@ -2737,7 +2735,6 @@ struct softnet_data {
 	/* stats */
 	unsigned int		processed;
 	unsigned int		time_squeeze;
-	unsigned int		cpu_collision;
 	unsigned int		received_rps;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 2bf83299600a..14d09345f00d 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps, flow_limit_count);
+		   0,	/* was cpu_collision */
+		   sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 20999aa596dd..8604ae245960 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3472,7 +3472,6 @@ xmit_more:
 				     pkt_dev->odevname, ret);
 		pkt_dev->errors++;
 		/* fallthru */
-	case NETDEV_TX_LOCKED:
 	case NETDEV_TX_BUSY:
 		/* Retry it next time */
 		atomic_dec(&(pkt_dev->skb->users));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80742edea96f..9c7756237904 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -108,35 +108,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	return skb;
 }
 
-static inline int handle_dev_cpu_collision(struct sk_buff *skb,
-					   struct netdev_queue *dev_queue,
-					   struct Qdisc *q)
-{
-	int ret;
-
-	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
-		/*
-		 * Same CPU holding the lock. It may be a transient
-		 * configuration error, when hard_start_xmit() recurses. We
-		 * detect it by checking xmit owner and drop the packet when
-		 * deadloop is detected. Return OK to try the next skb.
-		 */
-		kfree_skb_list(skb);
-		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
-				     dev_queue->dev->name);
-		ret = qdisc_qlen(q);
-	} else {
-		/*
-		 * Another cpu is holding lock, requeue & delay xmits for
-		 * some time.
-		 */
-		__this_cpu_inc(softnet_data.cpu_collision);
-		ret = dev_requeue_skb(skb, q);
-	}
-
-	return ret;
-}
-
 /*
  * Transmit possibly several skbs, and handle the return status as
  * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
@@ -174,9 +145,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
 		ret = qdisc_qlen(q);
-	} else if (ret == NETDEV_TX_LOCKED) {
-		/* Driver try lock failed */
-		ret = handle_dev_cpu_collision(skb, txq, q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
-- 
cgit v1.2.3


From 9218b44dcc059e08e249f6f7614b8e391eb041d8 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:47 +0300
Subject: net/mlx5e: Statistics handling refactoring

Redesign ethtool statistics handling and reporting in the driver:
1. Move counters to a separate file (en_stats.h).
2. Remove unnecessary dependencies between stats and strings.
3. Use counter descriptors which hold a name and offset for each counter,
   and will be used to decide which counters will be exposed.

For example when adding a new software counter to ethtool, instead of:
1. Add to stats struct.
2. Add to strings struct in the same order.
3. Change macro defining number of software counters.
The only thing needed is to link the new counter to a counter descriptor.

VPort counters are a set of hardware traffic counters created automatically
for each virtual port opened.
PPort counters are a set of counters describing per physical port
performance statistics.
These counters are gathered from hardware register and divided to groups
according to different protocols.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 240 +---------------
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 132 ++++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 232 +++++----------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 318 +++++++++++++++++++++
 include/linux/mlx5/device.h                        |   1 +
 5 files changed, 483 insertions(+), 440 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6e24e821a1d8..e903eff2574f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -46,6 +46,7 @@
 #include <linux/rhashtable.h>
 #include "wq.h"
 #include "mlx5_core.h"
+#include "en_stats.h"
 
 #define MLX5E_MAX_NUM_TC	8
 
@@ -148,245 +149,6 @@ struct mlx5e_umr_wqe {
 #define MLX5E_MIN_BW_ALLOC 1   /* Min percentage of BW allocation */
 #endif
 
-static const char vport_strings[][ETH_GSTRING_LEN] = {
-	/* vport statistics */
-	"rx_packets",
-	"rx_bytes",
-	"tx_packets",
-	"tx_bytes",
-	"rx_error_packets",
-	"rx_error_bytes",
-	"tx_error_packets",
-	"tx_error_bytes",
-	"rx_unicast_packets",
-	"rx_unicast_bytes",
-	"tx_unicast_packets",
-	"tx_unicast_bytes",
-	"rx_multicast_packets",
-	"rx_multicast_bytes",
-	"tx_multicast_packets",
-	"tx_multicast_bytes",
-	"rx_broadcast_packets",
-	"rx_broadcast_bytes",
-	"tx_broadcast_packets",
-	"tx_broadcast_bytes",
-
-	/* SW counters */
-	"tso_packets",
-	"tso_bytes",
-	"tso_inner_packets",
-	"tso_inner_bytes",
-	"lro_packets",
-	"lro_bytes",
-	"rx_csum_good",
-	"rx_csum_none",
-	"rx_csum_sw",
-	"tx_csum_offload",
-	"tx_csum_inner",
-	"tx_queue_stopped",
-	"tx_queue_wake",
-	"tx_queue_dropped",
-	"rx_wqe_err",
-	"rx_mpwqe_filler",
-	"rx_mpwqe_frag",
-	"rx_buff_alloc_err",
-};
-
-struct mlx5e_vport_stats {
-	/* HW counters */
-	u64 rx_packets;
-	u64 rx_bytes;
-	u64 tx_packets;
-	u64 tx_bytes;
-	u64 rx_error_packets;
-	u64 rx_error_bytes;
-	u64 tx_error_packets;
-	u64 tx_error_bytes;
-	u64 rx_unicast_packets;
-	u64 rx_unicast_bytes;
-	u64 tx_unicast_packets;
-	u64 tx_unicast_bytes;
-	u64 rx_multicast_packets;
-	u64 rx_multicast_bytes;
-	u64 tx_multicast_packets;
-	u64 tx_multicast_bytes;
-	u64 rx_broadcast_packets;
-	u64 rx_broadcast_bytes;
-	u64 tx_broadcast_packets;
-	u64 tx_broadcast_bytes;
-
-	/* SW counters */
-	u64 tso_packets;
-	u64 tso_bytes;
-	u64 tso_inner_packets;
-	u64 tso_inner_bytes;
-	u64 lro_packets;
-	u64 lro_bytes;
-	u64 rx_csum_good;
-	u64 rx_csum_none;
-	u64 rx_csum_sw;
-	u64 tx_csum_offload;
-	u64 tx_csum_inner;
-	u64 tx_queue_stopped;
-	u64 tx_queue_wake;
-	u64 tx_queue_dropped;
-	u64 rx_wqe_err;
-	u64 rx_mpwqe_filler;
-	u64 rx_mpwqe_frag;
-	u64 rx_buff_alloc_err;
-
-#define NUM_VPORT_COUNTERS     38
-};
-
-static const char pport_strings[][ETH_GSTRING_LEN] = {
-	/* IEEE802.3 counters */
-	"frames_tx",
-	"frames_rx",
-	"check_seq_err",
-	"alignment_err",
-	"octets_tx",
-	"octets_received",
-	"multicast_xmitted",
-	"broadcast_xmitted",
-	"multicast_rx",
-	"broadcast_rx",
-	"in_range_len_errors",
-	"out_of_range_len",
-	"too_long_errors",
-	"symbol_err",
-	"mac_control_tx",
-	"mac_control_rx",
-	"unsupported_op_rx",
-	"pause_ctrl_rx",
-	"pause_ctrl_tx",
-
-	/* RFC2863 counters */
-	"in_octets",
-	"in_ucast_pkts",
-	"in_discards",
-	"in_errors",
-	"in_unknown_protos",
-	"out_octets",
-	"out_ucast_pkts",
-	"out_discards",
-	"out_errors",
-	"in_multicast_pkts",
-	"in_broadcast_pkts",
-	"out_multicast_pkts",
-	"out_broadcast_pkts",
-
-	/* RFC2819 counters */
-	"drop_events",
-	"octets",
-	"pkts",
-	"broadcast_pkts",
-	"multicast_pkts",
-	"crc_align_errors",
-	"undersize_pkts",
-	"oversize_pkts",
-	"fragments",
-	"jabbers",
-	"collisions",
-	"p64octets",
-	"p65to127octets",
-	"p128to255octets",
-	"p256to511octets",
-	"p512to1023octets",
-	"p1024to1518octets",
-	"p1519to2047octets",
-	"p2048to4095octets",
-	"p4096to8191octets",
-	"p8192to10239octets",
-};
-
-#define NUM_IEEE_802_3_COUNTERS		19
-#define NUM_RFC_2863_COUNTERS		13
-#define NUM_RFC_2819_COUNTERS		21
-#define NUM_PPORT_COUNTERS		(NUM_IEEE_802_3_COUNTERS + \
-					 NUM_RFC_2863_COUNTERS + \
-					 NUM_RFC_2819_COUNTERS)
-
-struct mlx5e_pport_stats {
-	__be64 IEEE_802_3_counters[NUM_IEEE_802_3_COUNTERS];
-	__be64 RFC_2863_counters[NUM_RFC_2863_COUNTERS];
-	__be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS];
-};
-
-static const char qcounter_stats_strings[][ETH_GSTRING_LEN] = {
-	"rx_out_of_buffer",
-};
-
-struct mlx5e_qcounter_stats {
-	u32 rx_out_of_buffer;
-#define NUM_Q_COUNTERS 1
-};
-
-static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",
-	"bytes",
-	"csum_none",
-	"csum_sw",
-	"lro_packets",
-	"lro_bytes",
-	"wqe_err",
-	"mpwqe_filler",
-	"mpwqe_frag",
-	"buff_alloc_err",
-};
-
-struct mlx5e_rq_stats {
-	u64 packets;
-	u64 bytes;
-	u64 csum_none;
-	u64 csum_sw;
-	u64 lro_packets;
-	u64 lro_bytes;
-	u64 wqe_err;
-	u64 mpwqe_filler;
-	u64 mpwqe_frag;
-	u64 buff_alloc_err;
-#define NUM_RQ_STATS 10
-};
-
-static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",
-	"bytes",
-	"tso_packets",
-	"tso_bytes",
-	"tso_inner_packets",
-	"tso_inner_bytes",
-	"csum_offload_inner",
-	"nop",
-	"csum_offload_none",
-	"stopped",
-	"wake",
-	"dropped",
-};
-
-struct mlx5e_sq_stats {
-	/* commonly accessed in data path */
-	u64 packets;
-	u64 bytes;
-	u64 tso_packets;
-	u64 tso_bytes;
-	u64 tso_inner_packets;
-	u64 tso_inner_bytes;
-	u64 csum_offload_inner;
-	u64 nop;
-	/* less likely accessed in data path */
-	u64 csum_offload_none;
-	u64 stopped;
-	u64 wake;
-	u64 dropped;
-#define NUM_SQ_STATS 12
-};
-
-struct mlx5e_stats {
-	struct mlx5e_vport_stats   vport;
-	struct mlx5e_pport_stats   pport;
-	struct mlx5e_qcounter_stats qcnt;
-};
-
 struct mlx5e_params {
 	u8  log_sq_size;
 	u8  rq_wq_type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 4077856aab76..f1649d543475 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -166,6 +166,12 @@ static const struct {
 };
 
 #define MLX5E_NUM_Q_CNTRS(priv) (NUM_Q_COUNTERS * (!!priv->q_counter))
+#define MLX5E_NUM_RQ_STATS(priv) \
+	(NUM_RQ_STATS * priv->params.num_channels * \
+	 test_bit(MLX5E_STATE_OPENED, &priv->state))
+#define MLX5E_NUM_SQ_STATS(priv) \
+	(NUM_SQ_STATS * priv->params.num_channels * priv->params.num_tc * \
+	 test_bit(MLX5E_STATE_OPENED, &priv->state))
 
 static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 {
@@ -173,21 +179,68 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 
 	switch (sset) {
 	case ETH_SS_STATS:
-		return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
+		return NUM_SW_COUNTERS +
 		       MLX5E_NUM_Q_CNTRS(priv) +
-		       priv->params.num_channels * NUM_RQ_STATS +
-		       priv->params.num_channels * priv->params.num_tc *
-						   NUM_SQ_STATS;
+		       NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
+		       MLX5E_NUM_RQ_STATS(priv) +
+		       MLX5E_NUM_SQ_STATS(priv);
 	/* fallthrough */
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
+static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, uint8_t *data)
+{
+	int i, j, tc, idx = 0;
+
+	/* SW counters */
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, sw_stats_desc[i].name);
+
+	/* Q counters */
+	for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, q_stats_desc[i].name);
+
+	/* VPORT counters */
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       vport_stats_desc[i].name);
+
+	/* PPORT counters */
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_802_3_stats_desc[i].name);
+
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_2863_stats_desc[i].name);
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_2819_stats_desc[i].name);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return;
+
+	/* per channel counters */
+	for (i = 0; i < priv->params.num_channels; i++)
+		for (j = 0; j < NUM_RQ_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN, "rx%d_%s", i,
+				rq_stats_desc[j].name);
+
+	for (tc = 0; tc < priv->params.num_tc; tc++)
+		for (i = 0; i < priv->params.num_channels; i++)
+			for (j = 0; j < NUM_SQ_STATS; j++)
+				sprintf(data + (idx++) * ETH_GSTRING_LEN,
+					"tx%d_%s",
+					priv->channeltc_to_txq_map[i][tc],
+					sq_stats_desc[j].name);
+}
+
 static void mlx5e_get_strings(struct net_device *dev,
 			      uint32_t stringset, uint8_t *data)
 {
-	int i, j, tc, idx = 0;
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
 	switch (stringset) {
@@ -198,35 +251,7 @@ static void mlx5e_get_strings(struct net_device *dev,
 		break;
 
 	case ETH_SS_STATS:
-		/* VPORT counters */
-		for (i = 0; i < NUM_VPORT_COUNTERS; i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       vport_strings[i]);
-
-		/* Q counters */
-		for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       qcounter_stats_strings[i]);
-
-		/* PPORT counters */
-		for (i = 0; i < NUM_PPORT_COUNTERS; i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       pport_strings[i]);
-
-		/* per channel counters */
-		for (i = 0; i < priv->params.num_channels; i++)
-			for (j = 0; j < NUM_RQ_STATS; j++)
-				sprintf(data + (idx++) * ETH_GSTRING_LEN,
-					"rx%d_%s", i, rq_stats_strings[j]);
-
-		for (tc = 0; tc < priv->params.num_tc; tc++)
-			for (i = 0; i < priv->params.num_channels; i++)
-				for (j = 0; j < NUM_SQ_STATS; j++)
-					sprintf(data +
-					      (idx++) * ETH_GSTRING_LEN,
-					      "tx%d_%s",
-					      priv->channeltc_to_txq_map[i][tc],
-					      sq_stats_strings[j]);
+		mlx5e_fill_stats_strings(priv, data);
 		break;
 	}
 }
@@ -245,28 +270,45 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
 		mlx5e_update_stats(priv);
 	mutex_unlock(&priv->state_lock);
 
-	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
-		data[idx++] = ((u64 *)&priv->stats.vport)[i];
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw,
+						   sw_stats_desc, i);
 
 	for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
-		data[idx++] = ((u32 *)&priv->stats.qcnt)[i];
+		data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt,
+						   q_stats_desc, i);
+
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vport.query_vport_out,
+						  vport_stats_desc, i);
+
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.IEEE_802_3_counters,
+						  pport_802_3_stats_desc, i);
 
-	for (i = 0; i < NUM_PPORT_COUNTERS; i++)
-		data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2863_counters,
+						  pport_2863_stats_desc, i);
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2819_counters,
+						  pport_2819_stats_desc, i);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return;
 
 	/* per channel counters */
 	for (i = 0; i < priv->params.num_channels; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
-			data[idx++] = !test_bit(MLX5E_STATE_OPENED,
-						&priv->state) ? 0 :
-				       ((u64 *)&priv->channel[i]->rq.stats)[j];
+			data[idx++] =
+			       MLX5E_READ_CTR64_CPU(&priv->channel[i]->rq.stats,
+						    rq_stats_desc, j);
 
 	for (tc = 0; tc < priv->params.num_tc; tc++)
 		for (i = 0; i < priv->params.num_channels; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
-				data[idx++] = !test_bit(MLX5E_STATE_OPENED,
-							&priv->state) ? 0 :
-				((u64 *)&priv->channel[i]->sq[tc].stats)[j];
+				data[idx++] = MLX5E_READ_CTR64_CPU(&priv->channel[i]->sq[tc].stats,
+								   sq_stats_desc, j);
 }
 
 static void mlx5e_get_ringparam(struct net_device *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6270f8d539db..0c532367ff13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -91,96 +91,15 @@ static void mlx5e_update_carrier_work(struct work_struct *work)
 	mutex_unlock(&priv->state_lock);
 }
 
-static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_pport_stats *s = &priv->stats.pport;
-	u32 *in;
-	u32 *out;
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-
-	in  = mlx5_vzalloc(sz);
-	out = mlx5_vzalloc(sz);
-	if (!in || !out)
-		goto free_out;
-
-	MLX5_SET(ppcnt_reg, in, local_port, 1);
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->IEEE_802_3_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->IEEE_802_3_counters));
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->RFC_2863_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->RFC_2863_counters));
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
-	mlx5_core_access_reg(mdev, in, sz, out,
-			     sz, MLX5_REG_PPCNT, 0, 0);
-	memcpy(s->RFC_2819_counters,
-	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
-	       sizeof(s->RFC_2819_counters));
-
-free_out:
-	kvfree(in);
-	kvfree(out);
-}
-
-static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
-{
-	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
-
-	if (!priv->q_counter)
-		return;
-
-	mlx5_core_query_out_of_buffer(priv->mdev, priv->q_counter,
-				      &qcnt->rx_out_of_buffer);
-}
-
-void mlx5e_update_stats(struct mlx5e_priv *priv)
+static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 {
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_vport_stats *s = &priv->stats.vport;
+	struct mlx5e_sw_stats *s = &priv->stats.sw;
 	struct mlx5e_rq_stats *rq_stats;
 	struct mlx5e_sq_stats *sq_stats;
-	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
-	u32 *out;
-	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
-	u64 tx_offload_none;
+	u64 tx_offload_none = 0;
 	int i, j;
 
-	out = mlx5_vzalloc(outlen);
-	if (!out)
-		return;
-
-	/* Collect firts the SW counters and then HW for consistency */
-	s->rx_packets		= 0;
-	s->rx_bytes		= 0;
-	s->tx_packets		= 0;
-	s->tx_bytes		= 0;
-	s->tso_packets		= 0;
-	s->tso_bytes		= 0;
-	s->tso_inner_packets	= 0;
-	s->tso_inner_bytes	= 0;
-	s->tx_queue_stopped	= 0;
-	s->tx_queue_wake	= 0;
-	s->tx_queue_dropped	= 0;
-	s->tx_csum_inner	= 0;
-	tx_offload_none		= 0;
-	s->lro_packets		= 0;
-	s->lro_bytes		= 0;
-	s->rx_csum_none		= 0;
-	s->rx_csum_sw		= 0;
-	s->rx_wqe_err		= 0;
-	s->rx_mpwqe_filler	= 0;
-	s->rx_mpwqe_frag	= 0;
-	s->rx_buff_alloc_err	= 0;
+	memset(s, 0, sizeof(*s));
 	for (i = 0; i < priv->params.num_channels; i++) {
 		rq_stats = &priv->channel[i]->rq.stats;
 
@@ -212,7 +131,19 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 		}
 	}
 
-	/* HW counters */
+	/* Update calculated offload counters */
+	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
+	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
+			     s->rx_csum_sw;
+}
+
+static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+	u32 *out = (u32 *)priv->stats.vport.query_vport_out;
+	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
+	struct mlx5_core_dev *mdev = priv->mdev;
+
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
@@ -222,58 +153,56 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 
 	memset(out, 0, outlen);
 
-	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen))
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+	u32 *in;
+
+	in = mlx5_vzalloc(sz);
+	if (!in)
 		goto free_out;
 
-#define MLX5_GET_CTR(p, x) \
-	MLX5_GET64(query_vport_counter_out, p, x)
-
-	s->rx_error_packets     =
-		MLX5_GET_CTR(out, received_errors.packets);
-	s->rx_error_bytes       =
-		MLX5_GET_CTR(out, received_errors.octets);
-	s->tx_error_packets     =
-		MLX5_GET_CTR(out, transmit_errors.packets);
-	s->tx_error_bytes       =
-		MLX5_GET_CTR(out, transmit_errors.octets);
-
-	s->rx_unicast_packets   =
-		MLX5_GET_CTR(out, received_eth_unicast.packets);
-	s->rx_unicast_bytes     =
-		MLX5_GET_CTR(out, received_eth_unicast.octets);
-	s->tx_unicast_packets   =
-		MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
-	s->tx_unicast_bytes     =
-		MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
-
-	s->rx_multicast_packets =
-		MLX5_GET_CTR(out, received_eth_multicast.packets);
-	s->rx_multicast_bytes   =
-		MLX5_GET_CTR(out, received_eth_multicast.octets);
-	s->tx_multicast_packets =
-		MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
-	s->tx_multicast_bytes   =
-		MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
-
-	s->rx_broadcast_packets =
-		MLX5_GET_CTR(out, received_eth_broadcast.packets);
-	s->rx_broadcast_bytes   =
-		MLX5_GET_CTR(out, received_eth_broadcast.octets);
-	s->tx_broadcast_packets =
-		MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
-	s->tx_broadcast_bytes   =
-		MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
-	/* Update calculated offload counters */
-	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
-	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
-			       s->rx_csum_sw;
+	out = pstats->IEEE_802_3_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
-	mlx5e_update_pport_counters(priv);
-	mlx5e_update_q_counter(priv);
+	out = pstats->RFC_2863_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+
+	out = pstats->RFC_2819_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 free_out:
-	kvfree(out);
+	kvfree(in);
+}
+
+static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
+
+	if (!priv->q_counter)
+		return;
+
+	mlx5_core_query_out_of_buffer(priv->mdev, priv->q_counter,
+				      &qcnt->rx_out_of_buffer);
+}
+
+void mlx5e_update_stats(struct mlx5e_priv *priv)
+{
+	mlx5e_update_sw_counters(priv);
+	mlx5e_update_q_counter(priv);
+	mlx5e_update_vport_counters(priv);
+	mlx5e_update_pport_counters(priv);
 }
 
 static void mlx5e_update_stats_work(struct work_struct *work)
@@ -2073,37 +2002,28 @@ static struct rtnl_link_stats64 *
 mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_sw_stats *sstats = &priv->stats.sw;
 	struct mlx5e_vport_stats *vstats = &priv->stats.vport;
 	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
 
-	stats->rx_packets = vstats->rx_packets;
-	stats->rx_bytes   = vstats->rx_bytes;
-	stats->tx_packets = vstats->tx_packets;
-	stats->tx_bytes   = vstats->tx_bytes;
-
-#define PPCNT_GET_802_3_CTR(fld)                            \
-	(MLX5_GET64(eth_802_3_cntrs_grp_data_layout,        \
-			pstats->IEEE_802_3_counters, fld##_high))
-
-#define PPCNT_GET_2863_CTR(fld)                             \
-	(MLX5_GET64(eth_2863_cntrs_grp_data_layout,         \
-			pstats->RFC_2863_counters, fld##_high))
+	stats->rx_packets = sstats->rx_packets;
+	stats->rx_bytes   = sstats->rx_bytes;
+	stats->tx_packets = sstats->tx_packets;
+	stats->tx_bytes   = sstats->tx_bytes;
 
 	stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer;
-	stats->tx_dropped = vstats->tx_queue_dropped;
+	stats->tx_dropped = sstats->tx_queue_dropped;
 
 	stats->rx_length_errors =
-		PPCNT_GET_802_3_CTR(a_in_range_length_errors) +
-		PPCNT_GET_802_3_CTR(a_out_of_range_length_field) +
-		PPCNT_GET_802_3_CTR(a_frame_too_long_errors);
+		PPORT_802_3_GET(pstats, a_in_range_length_errors) +
+		PPORT_802_3_GET(pstats, a_out_of_range_length_field) +
+		PPORT_802_3_GET(pstats, a_frame_too_long_errors);
 	stats->rx_crc_errors =
-		PPCNT_GET_802_3_CTR(a_frame_check_sequence_errors);
-	stats->rx_frame_errors =
-		PPCNT_GET_802_3_CTR(a_alignment_errors);
-	stats->tx_aborted_errors =
-		PPCNT_GET_2863_CTR(if_out_discards);
+		PPORT_802_3_GET(pstats, a_frame_check_sequence_errors);
+	stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors);
+	stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards);
 	stats->tx_carrier_errors =
-		PPCNT_GET_802_3_CTR(a_symbol_error_during_carrier);
+		PPORT_802_3_GET(pstats, a_symbol_error_during_carrier);
 	stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors +
 			   stats->rx_frame_errors;
 	stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
@@ -2111,8 +2031,8 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	/* vport multicast also counts packets that are dropped due to steering
 	 * or rx out of buffer
 	 */
-	stats->multicast = vstats->rx_multicast_packets;
-
+	stats->multicast =
+		VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
 
 	return stats;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
new file mode 100644
index 000000000000..116320d8fc42
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_STATS_H__
+#define __MLX5_EN_STATS_H__
+
+#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \
+	(*(u64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \
+	be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \
+	(*(u32 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \
+	be64_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset))
+
+#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld)
+
+struct counter_desc {
+	char		name[ETH_GSTRING_LEN];
+	int		offset; /* Byte offset */
+};
+
+struct mlx5e_sw_stats {
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 rx_csum_good;
+	u64 rx_csum_none;
+	u64 rx_csum_sw;
+	u64 tx_csum_offload;
+	u64 tx_csum_inner;
+	u64 tx_queue_stopped;
+	u64 tx_queue_wake;
+	u64 tx_queue_dropped;
+	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler;
+	u64 rx_mpwqe_frag;
+	u64 rx_buff_alloc_err;
+};
+
+static const struct counter_desc sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, lro_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, lro_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_good) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_offload) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+};
+
+struct mlx5e_qcounter_stats {
+	u32 rx_out_of_buffer;
+};
+
+static const struct counter_desc q_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_out_of_buffer) },
+};
+
+#define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c)
+#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \
+						vstats->query_vport_out, c)
+
+struct mlx5e_vport_stats {
+	__be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)];
+};
+
+static const struct counter_desc vport_stats_desc[] = {
+	{ "rx_error_packets", VPORT_COUNTER_OFF(received_errors.packets) },
+	{ "rx_error_bytes", VPORT_COUNTER_OFF(received_errors.octets) },
+	{ "tx_error_packets", VPORT_COUNTER_OFF(transmit_errors.packets) },
+	{ "tx_error_bytes", VPORT_COUNTER_OFF(transmit_errors.octets) },
+	{ "rx_unicast_packets",
+		VPORT_COUNTER_OFF(received_eth_unicast.packets) },
+	{ "rx_unicast_bytes", VPORT_COUNTER_OFF(received_eth_unicast.octets) },
+	{ "tx_unicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.packets) },
+	{ "tx_unicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.octets) },
+	{ "rx_multicast_packets",
+		VPORT_COUNTER_OFF(received_eth_multicast.packets) },
+	{ "rx_multicast_bytes",
+		VPORT_COUNTER_OFF(received_eth_multicast.octets) },
+	{ "tx_multicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.packets) },
+	{ "tx_multicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.octets) },
+	{ "rx_broadcast_packets",
+		VPORT_COUNTER_OFF(received_eth_broadcast.packets) },
+	{ "rx_broadcast_bytes",
+		VPORT_COUNTER_OFF(received_eth_broadcast.octets) },
+	{ "tx_broadcast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.packets) },
+	{ "tx_broadcast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.octets) },
+};
+
+#define PPORT_802_3_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+#define PPORT_802_3_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \
+		   counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+#define PPORT_2863_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+#define PPORT_2863_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \
+		   counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+#define PPORT_2819_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+#define PPORT_2819_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \
+		   counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+
+struct mlx5e_pport_stats {
+	__be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+};
+
+static const struct counter_desc pport_802_3_stats_desc[] = {
+	{ "frames_tx", PPORT_802_3_OFF(a_frames_transmitted_ok) },
+	{ "frames_rx", PPORT_802_3_OFF(a_frames_received_ok) },
+	{ "check_seq_err", PPORT_802_3_OFF(a_frame_check_sequence_errors) },
+	{ "alignment_err", PPORT_802_3_OFF(a_alignment_errors) },
+	{ "octets_tx", PPORT_802_3_OFF(a_octets_transmitted_ok) },
+	{ "octets_received", PPORT_802_3_OFF(a_octets_received_ok) },
+	{ "multicast_xmitted", PPORT_802_3_OFF(a_multicast_frames_xmitted_ok) },
+	{ "broadcast_xmitted", PPORT_802_3_OFF(a_broadcast_frames_xmitted_ok) },
+	{ "multicast_rx", PPORT_802_3_OFF(a_multicast_frames_received_ok) },
+	{ "broadcast_rx", PPORT_802_3_OFF(a_broadcast_frames_received_ok) },
+	{ "in_range_len_errors", PPORT_802_3_OFF(a_in_range_length_errors) },
+	{ "out_of_range_len", PPORT_802_3_OFF(a_out_of_range_length_field) },
+	{ "too_long_errors", PPORT_802_3_OFF(a_frame_too_long_errors) },
+	{ "symbol_err", PPORT_802_3_OFF(a_symbol_error_during_carrier) },
+	{ "mac_control_tx", PPORT_802_3_OFF(a_mac_control_frames_transmitted) },
+	{ "mac_control_rx", PPORT_802_3_OFF(a_mac_control_frames_received) },
+	{ "unsupported_op_rx",
+		PPORT_802_3_OFF(a_unsupported_opcodes_received) },
+	{ "pause_ctrl_rx", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_received) },
+	{ "pause_ctrl_tx",
+		PPORT_802_3_OFF(a_pause_mac_ctrl_frames_transmitted) },
+};
+
+static const struct counter_desc pport_2863_stats_desc[] = {
+	{ "in_octets", PPORT_2863_OFF(if_in_octets) },
+	{ "in_ucast_pkts", PPORT_2863_OFF(if_in_ucast_pkts) },
+	{ "in_discards", PPORT_2863_OFF(if_in_discards) },
+	{ "in_errors", PPORT_2863_OFF(if_in_errors) },
+	{ "in_unknown_protos", PPORT_2863_OFF(if_in_unknown_protos) },
+	{ "out_octets", PPORT_2863_OFF(if_out_octets) },
+	{ "out_ucast_pkts", PPORT_2863_OFF(if_out_ucast_pkts) },
+	{ "out_discards", PPORT_2863_OFF(if_out_discards) },
+	{ "out_errors", PPORT_2863_OFF(if_out_errors) },
+	{ "in_multicast_pkts", PPORT_2863_OFF(if_in_multicast_pkts) },
+	{ "in_broadcast_pkts", PPORT_2863_OFF(if_in_broadcast_pkts) },
+	{ "out_multicast_pkts", PPORT_2863_OFF(if_out_multicast_pkts) },
+	{ "out_broadcast_pkts", PPORT_2863_OFF(if_out_broadcast_pkts) },
+};
+
+static const struct counter_desc pport_2819_stats_desc[] = {
+	{ "drop_events", PPORT_2819_OFF(ether_stats_drop_events) },
+	{ "octets", PPORT_2819_OFF(ether_stats_octets) },
+	{ "pkts", PPORT_2819_OFF(ether_stats_pkts) },
+	{ "broadcast_pkts", PPORT_2819_OFF(ether_stats_broadcast_pkts) },
+	{ "multicast_pkts", PPORT_2819_OFF(ether_stats_multicast_pkts) },
+	{ "crc_align_errors", PPORT_2819_OFF(ether_stats_crc_align_errors) },
+	{ "undersize_pkts", PPORT_2819_OFF(ether_stats_undersize_pkts) },
+	{ "oversize_pkts", PPORT_2819_OFF(ether_stats_oversize_pkts) },
+	{ "fragments", PPORT_2819_OFF(ether_stats_fragments) },
+	{ "jabbers", PPORT_2819_OFF(ether_stats_jabbers) },
+	{ "collisions", PPORT_2819_OFF(ether_stats_collisions) },
+	{ "p64octets", PPORT_2819_OFF(ether_stats_pkts64octets) },
+	{ "p65to127octets", PPORT_2819_OFF(ether_stats_pkts65to127octets) },
+	{ "p128to255octets", PPORT_2819_OFF(ether_stats_pkts128to255octets) },
+	{ "p256to511octets", PPORT_2819_OFF(ether_stats_pkts256to511octets) },
+	{ "p512to1023octets", PPORT_2819_OFF(ether_stats_pkts512to1023octets) },
+	{ "p1024to1518octets",
+		PPORT_2819_OFF(ether_stats_pkts1024to1518octets) },
+	{ "p1519to2047octets",
+		PPORT_2819_OFF(ether_stats_pkts1519to2047octets) },
+	{ "p2048to4095octets",
+		PPORT_2819_OFF(ether_stats_pkts2048to4095octets) },
+	{ "p4096to8191octets",
+		PPORT_2819_OFF(ether_stats_pkts4096to8191octets) },
+	{ "p8192to10239octets",
+		PPORT_2819_OFF(ether_stats_pkts8192to10239octets) },
+};
+
+struct mlx5e_rq_stats {
+	u64 packets;
+	u64 bytes;
+	u64 csum_none;
+	u64 csum_sw;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 wqe_err;
+	u64 mpwqe_filler;
+	u64 mpwqe_frag;
+	u64 buff_alloc_err;
+};
+
+static const struct counter_desc rq_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, wqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_filler) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_frag) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+};
+
+struct mlx5e_sq_stats {
+	/* commonly accessed in data path */
+	u64 packets;
+	u64 bytes;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 csum_offload_inner;
+	u64 nop;
+	/* less likely accessed in data path */
+	u64 csum_offload_none;
+	u64 stopped;
+	u64 wake;
+	u64 dropped;
+};
+
+static const struct counter_desc sq_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, csum_offload_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, nop) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, csum_offload_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, stopped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sq_stats, dropped) },
+};
+
+#define NUM_SW_COUNTERS			ARRAY_SIZE(sw_stats_desc)
+#define NUM_Q_COUNTERS			ARRAY_SIZE(q_stats_desc)
+#define NUM_VPORT_COUNTERS		ARRAY_SIZE(vport_stats_desc)
+#define NUM_PPORT_802_3_COUNTERS	ARRAY_SIZE(pport_802_3_stats_desc)
+#define NUM_PPORT_2863_COUNTERS		ARRAY_SIZE(pport_2863_stats_desc)
+#define NUM_PPORT_2819_COUNTERS		ARRAY_SIZE(pport_2819_stats_desc)
+#define NUM_PPORT_COUNTERS		(NUM_PPORT_802_3_COUNTERS + \
+					 NUM_PPORT_2863_COUNTERS  + \
+					 NUM_PPORT_2819_COUNTERS)
+#define NUM_RQ_STATS			ARRAY_SIZE(rq_stats_desc)
+#define NUM_SQ_STATS			ARRAY_SIZE(sq_stats_desc)
+
+struct mlx5e_stats {
+	struct mlx5e_sw_stats sw;
+	struct mlx5e_qcounter_stats qcnt;
+	struct mlx5e_vport_stats vport;
+	struct mlx5e_pport_stats pport;
+};
+
+#endif /* __MLX5_EN_STATS_H__ */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 03f8d719b680..8be44ca777ed 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -59,6 +59,7 @@
 #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
 #define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64)
 #define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
 #define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
-- 
cgit v1.2.3


From 121fcdc84d8240d4dfe1f737befd5814b12623ee Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:50 +0300
Subject: net/mlx5e: Add link down events counter

Expose link_down_events counter through ethtool -S.
This counter is read from PPort statistics, then proccessed and stored as
a special handling software counter.
This counter is stored along software counters since it is the only PPort
counter that it's size is not 64 bits.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 10 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  5 +++++
 include/linux/mlx5/device.h                        |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ef66ba65f5cd..61e261c3d247 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -135,6 +135,10 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 	s->tx_csum_offload = s->tx_packets - tx_offload_none - s->tx_csum_inner;
 	s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
 			     s->rx_csum_sw;
+
+	s->link_down_events = MLX5_GET(ppcnt_reg,
+				priv->stats.pport.phy_counters,
+				counter_set.phys_layer_cntrs.link_down_events);
 }
 
 static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
@@ -183,6 +187,10 @@ static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
+	out = pstats->phy_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
 		out = pstats->per_prio_counters[prio];
@@ -208,10 +216,10 @@ static void mlx5e_update_q_counter(struct mlx5e_priv *priv)
 
 void mlx5e_update_stats(struct mlx5e_priv *priv)
 {
-	mlx5e_update_sw_counters(priv);
 	mlx5e_update_q_counter(priv);
 	mlx5e_update_vport_counters(priv);
 	mlx5e_update_pport_counters(priv);
+	mlx5e_update_sw_counters(priv);
 }
 
 static void mlx5e_update_stats_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index de27eeaa3bd2..7cd8cb44b2ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -71,6 +71,9 @@ struct mlx5e_sw_stats {
 	u64 rx_mpwqe_filler;
 	u64 rx_mpwqe_frag;
 	u64 rx_buff_alloc_err;
+
+	/* Special handling counters */
+	u64 link_down_events;
 };
 
 static const struct counter_desc sw_stats_desc[] = {
@@ -96,6 +99,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events) },
 };
 
 struct mlx5e_qcounter_stats {
@@ -178,6 +182,7 @@ struct mlx5e_pport_stats {
 	__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 };
 
 static const struct counter_desc pport_802_3_stats_desc[] = {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8be44ca777ed..942bccacd7b7 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1369,6 +1369,7 @@ enum {
 	MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5,
 	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
 	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11,
+	MLX5_PHYSICAL_LAYER_COUNTERS_GROUP    = 0x12,
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
-- 
cgit v1.2.3


From 94cb1ebbafd509210887eea6ced55c40da7b4baa Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:52 +0300
Subject: net/mlx5e: Add support for RXALL netdev feature

Introduce new access register named Ports Check Mask Register (PCMR) to
control all HW checks on port. With this register, the driver can
enable/disable Hardware FCS validation.

When RXALL is enabled/disabled using ndo_set_features, enable/disable
fcs check at HW.
User can change HW configuration using rx-all flag at ethtool.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 20 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c    | 49 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                       |  1 +
 include/linux/mlx5/port.h                         |  4 ++
 4 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d82bc6b697f9..ad0cb4aa593b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2139,6 +2139,14 @@ static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
 	return 0;
 }
 
+static int set_feature_rx_all(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_set_port_fcs(mdev, !enable);
+}
+
 static int mlx5e_handle_feature(struct net_device *netdev,
 				netdev_features_t wanted_features,
 				netdev_features_t feature,
@@ -2174,6 +2182,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 				    set_feature_vlan_filter);
 	err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_TC,
 				    set_feature_tc_num_filters);
+	err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL,
+				    set_feature_rx_all);
 
 	return err ? -EINVAL : 0;
 }
@@ -2564,6 +2574,8 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
+	bool fcs_supported;
+	bool fcs_enabled;
 
 	SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
 
@@ -2607,10 +2619,18 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 		netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
 	}
 
+	mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled);
+
+	if (fcs_supported)
+		netdev->hw_features |= NETIF_F_RXALL;
+
 	netdev->features          = netdev->hw_features;
 	if (!priv->params.lro_en)
 		netdev->features  &= ~NETIF_F_LRO;
 
+	if (fcs_enabled)
+		netdev->features  &= ~NETIF_F_RXALL;
+
 #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f)
 	if (FT_CAP(flow_modify_en) &&
 	    FT_CAP(modify_root) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index ae378c575deb..c37740f30fbe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -607,3 +607,52 @@ int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode)
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_wol);
+
+static int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out,
+				  int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    outlen, MLX5_REG_PCMR, 0, 0);
+}
+
+static int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	return mlx5_core_access_reg(mdev, in, inlen, out,
+				    sizeof(out), MLX5_REG_PCMR, 0, 1);
+}
+
+int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+	MLX5_SET(pcmr_reg, in, fcs_chk, enable);
+
+	return mlx5_set_ports_check(mdev, in, sizeof(in));
+}
+
+void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
+			 bool *enabled)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+	/* Default values for FW which do not support MLX5_REG_PCMR */
+	*supported = false;
+	*enabled = true;
+
+	if (!MLX5_CAP_GEN(mdev, ports_check))
+		return;
+
+	if (mlx5_query_ports_check(mdev, out, sizeof(out)))
+		return;
+
+	*supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap));
+	*enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk));
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index dcd5ac8d3b14..497a4dbd91b0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -112,6 +112,7 @@ enum {
 	MLX5_REG_PMPE		 = 0x5010,
 	MLX5_REG_PELC		 = 0x500e,
 	MLX5_REG_PVLC		 = 0x500f,
+	MLX5_REG_PCMR		 = 0x5041,
 	MLX5_REG_PMLP		 = 0, /* TBD */
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index a1d145abd4eb..577e953d0aa7 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -84,4 +84,8 @@ int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode);
 int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode);
 
+int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable);
+void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
+			 bool *enabled);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From da54d24ec3ef736de04c61a01653776a9750334f Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:53 +0300
Subject: net/mlx5e: Add ethtool support for interface identify (LED blinking)

Add the needed hardware command and mlx5_ifc structs for managing LED
control.
Add set_phys_id ethtool callback to support ethtool -p flag.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 25 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 13 +++++++++++
 include/linux/mlx5/driver.h                        |  1 +
 include/linux/mlx5/port.h                          |  6 ++++++
 4 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 522d584bc05f..a2c444ec191b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1135,6 +1135,30 @@ static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 	return mlx5_set_port_wol(mdev, mlx5_wol_mode);
 }
 
+static int mlx5e_set_phys_id(struct net_device *dev,
+			     enum ethtool_phys_id_state state)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 beacon_duration;
+
+	if (!MLX5_CAP_GEN(mdev, beacon_led))
+		return -EOPNOTSUPP;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_INF;
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_OFF;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return mlx5_set_port_beacon(mdev, beacon_duration);
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -1159,6 +1183,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_pauseparam    = mlx5e_get_pauseparam,
 	.set_pauseparam    = mlx5e_set_pauseparam,
 	.get_ts_info       = mlx5e_get_ts_info,
+	.set_phys_id       = mlx5e_set_phys_id,
 	.get_wol	   = mlx5e_get_wol,
 	.set_wol	   = mlx5e_set_wol,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index c37740f30fbe..446549f63b5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -115,6 +115,19 @@ int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_ptys);
 
+int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration)
+{
+	u32 out[MLX5_ST_SZ_DW(mlcr_reg)];
+	u32 in[MLX5_ST_SZ_DW(mlcr_reg)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(mlcr_reg, in, local_port, 1);
+	MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MLCR, 0, 1);
+}
+
 int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 			      u32 *proto_cap, int proto_mask)
 {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 497a4dbd91b0..2e8758d1b19e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -116,6 +116,7 @@ enum {
 	MLX5_REG_PMLP		 = 0, /* TBD */
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MLCR		 = 0x902b,
 };
 
 enum {
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 577e953d0aa7..a364ab1737a0 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -35,6 +35,11 @@
 
 #include <linux/mlx5/driver.h>
 
+enum mlx5_beacon_duration {
+	MLX5_BEACON_DURATION_OFF = 0x0,
+	MLX5_BEACON_DURATION_INF = 0xffff,
+};
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
@@ -53,6 +58,7 @@ int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
 			       enum mlx5_port_status status);
 int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
 				 enum mlx5_port_status *status);
+int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration);
 
 int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
 void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-- 
cgit v1.2.3


From bb64143eee8c036a89b31daa4e9bf8360a8bded1 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:54 +0300
Subject: net/mlx5e: Add ethtool support for dump module EEPROM

Add query MCIA, PMLP registers infrastructure and commands.
Add ethtool support for get_module_info() and get_module_eeprom()
callbacks.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 80 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 76 ++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  3 +-
 include/linux/mlx5/port.h                          | 15 ++++
 4 files changed, 173 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index a2c444ec191b..0518c8658507 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1159,6 +1159,84 @@ static int mlx5e_set_phys_id(struct net_device *dev,
 	return mlx5_set_port_beacon(mdev, beacon_duration);
 }
 
+static int mlx5e_get_module_info(struct net_device *netdev,
+				 struct ethtool_modinfo *modinfo)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *dev = priv->mdev;
+	int size_read = 0;
+	u8 data[4];
+
+	size_read = mlx5_query_module_eeprom(dev, 0, 2, data);
+	if (size_read < 2)
+		return -EIO;
+
+	/* data[0] = identifier byte */
+	switch (data[0]) {
+	case MLX5_MODULE_ID_QSFP:
+		modinfo->type       = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLX5_MODULE_ID_QSFP_PLUS:
+	case MLX5_MODULE_ID_QSFP28:
+		/* data[1] = revision id */
+		if (data[0] == MLX5_MODULE_ID_QSFP28 || data[1] >= 0x3) {
+			modinfo->type       = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type       = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLX5_MODULE_ID_SFP:
+		modinfo->type       = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		netdev_err(priv->netdev, "%s: cable type not recognized:0x%x\n",
+			   __func__, data[0]);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5e_get_module_eeprom(struct net_device *netdev,
+				   struct ethtool_eeprom *ee,
+				   u8 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int offset = ee->offset;
+	int size_read;
+	int i = 0;
+
+	if (!ee->len)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i,
+						     data + i);
+
+		if (!size_read)
+			/* Done reading */
+			return 0;
+
+		if (size_read < 0) {
+			netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n",
+				   __func__, size_read);
+			return 0;
+		}
+
+		i += size_read;
+		offset += size_read;
+	}
+
+	return 0;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -1186,4 +1264,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.set_phys_id       = mlx5e_set_phys_id,
 	.get_wol	   = mlx5e_get_wol,
 	.set_wol	   = mlx5e_set_wol,
+	.get_module_info   = mlx5e_get_module_info,
+	.get_module_eeprom = mlx5e_get_module_eeprom,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 446549f63b5c..4cb2a44510fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -310,6 +310,82 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
 
+static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num)
+{
+	u32 out[MLX5_ST_SZ_DW(pmlp_reg)];
+	u32 in[MLX5_ST_SZ_DW(pmlp_reg)];
+	int module_mapping;
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(pmlp_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_PMLP, 0, 0);
+	if (err)
+		return err;
+
+	module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping);
+	*module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK;
+
+	return 0;
+}
+
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data)
+{
+	u32 out[MLX5_ST_SZ_DW(mcia_reg)];
+	u32 in[MLX5_ST_SZ_DW(mcia_reg)];
+	int module_num;
+	u16 i2c_addr;
+	int status;
+	int err;
+	void *ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
+
+	err = mlx5_query_module_num(dev, &module_num);
+	if (err)
+		return err;
+
+	memset(in, 0, sizeof(in));
+	size = min_t(int, size, MLX5_EEPROM_MAX_BYTES);
+
+	if (offset < MLX5_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLX5_EEPROM_PAGE_LENGTH)
+		/* Cross pages read, read until offset 256 in low page */
+		size -= offset + size - MLX5_EEPROM_PAGE_LENGTH;
+
+	i2c_addr = MLX5_I2C_ADDR_LOW;
+	if (offset >= MLX5_EEPROM_PAGE_LENGTH) {
+		i2c_addr = MLX5_I2C_ADDR_HIGH;
+		offset -= MLX5_EEPROM_PAGE_LENGTH;
+	}
+
+	MLX5_SET(mcia_reg, in, l, 0);
+	MLX5_SET(mcia_reg, in, module, module_num);
+	MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr);
+	MLX5_SET(mcia_reg, in, page_number, 0);
+	MLX5_SET(mcia_reg, in, device_address, offset);
+	MLX5_SET(mcia_reg, in, size, size);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCIA, 0, 0);
+	if (err)
+		return err;
+
+	status = MLX5_GET(mcia_reg, out, status);
+	if (status) {
+		mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
+			      status);
+		return -EIO;
+	}
+
+	memcpy(data, ptr, size);
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom);
+
 static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
 				int pvlc_size,  u8 local_port)
 {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2e8758d1b19e..1a170672c656 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -113,9 +113,10 @@ enum {
 	MLX5_REG_PELC		 = 0x500e,
 	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PCMR		 = 0x5041,
-	MLX5_REG_PMLP		 = 0, /* TBD */
+	MLX5_REG_PMLP		 = 0x5002,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
 };
 
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index a364ab1737a0..7391eb833253 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -40,6 +40,19 @@ enum mlx5_beacon_duration {
 	MLX5_BEACON_DURATION_INF = 0xffff,
 };
 
+enum mlx5_module_id {
+	MLX5_MODULE_ID_SFP              = 0x3,
+	MLX5_MODULE_ID_QSFP             = 0xC,
+	MLX5_MODULE_ID_QSFP_PLUS        = 0xD,
+	MLX5_MODULE_ID_QSFP28           = 0x11,
+};
+
+#define MLX5_EEPROM_MAX_BYTES			32
+#define MLX5_EEPROM_IDENTIFIER_BYTE_MASK	0x000000ff
+#define MLX5_I2C_ADDR_LOW		0x50
+#define MLX5_I2C_ADDR_HIGH		0x51
+#define MLX5_EEPROM_PAGE_LENGTH		256
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
@@ -93,5 +106,7 @@ int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode);
 int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable);
 void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 			 bool *enabled);
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data);
 
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 363501145e3faa650193722fe7047b767ed87172 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:55 +0300
Subject: net/mlx5e: Add ethtool support for rxvlan-offload (vlan stripping)

Use ethtool -K <interface> rxvlan <on/off> to enable/disable
C-TAG vlan stripping by hardware.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 74 ++++++++++++++++++++++-
 include/linux/mlx5/driver.h                       |  4 ++
 3 files changed, 78 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e903eff2574f..8abc289ac1fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -166,6 +166,7 @@ struct mlx5e_params {
 	u8  rss_hfunc;
 	u8  toeplitz_hash_key[40];
 	u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE];
+	bool vlan_strip_disable;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	struct ieee_ets ets;
 #endif
@@ -575,6 +576,8 @@ int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto,
 void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
 void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
 
+int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd);
+
 int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix);
 void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ad0cb4aa593b..6c9c10c131a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -388,6 +388,7 @@ static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 	MLX5_SET(rqc,  rqc, cqn,		rq->cq.mcq.cqn);
 	MLX5_SET(rqc,  rqc, state,		MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc,  rqc, flush_in_error_en,	1);
+	MLX5_SET(rqc,  rqc, vsd, priv->params.vlan_strip_disable);
 	MLX5_SET(wq,   wq,  log_wq_pg_sz,	rq->wq_ctrl.buf.page_shift -
 						MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
@@ -402,7 +403,8 @@ static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 	return err;
 }
 
-static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
+static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state,
+				 int next_state)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
@@ -430,6 +432,36 @@ static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 	return err;
 }
 
+static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY);
+	MLX5_SET64(modify_rq_in, in, modify_bitmask, MLX5_RQ_BITMASK_VSD);
+	MLX5_SET(rqc, rqc, vsd, vsd);
+	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY);
+
+	err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 	mlx5_core_destroy_rq(rq->priv->mdev, rq->rqn);
@@ -468,7 +500,7 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 	if (err)
 		goto err_destroy_rq;
 
-	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err)
 		goto err_disable_rq;
 
@@ -493,7 +525,7 @@ static void mlx5e_close_rq(struct mlx5e_rq *rq)
 	clear_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state);
 	napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */
 
-	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
+	mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	while (!mlx5_wq_ll_is_empty(&rq->wq))
 		msleep(20);
 
@@ -1963,6 +1995,23 @@ static void mlx5e_destroy_tirs(struct mlx5e_priv *priv)
 		mlx5e_destroy_tir(priv, i);
 }
 
+int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd)
+{
+	int err = 0;
+	int i;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	for (i = 0; i < priv->params.num_channels; i++) {
+		err = mlx5e_modify_rq_vsd(&priv->channel[i]->rq, vsd);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int mlx5e_setup_tc(struct net_device *netdev, u8 tc)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -2147,6 +2196,23 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable)
 	return mlx5_set_port_fcs(mdev, !enable);
 }
 
+static int set_feature_rx_vlan(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->params.vlan_strip_disable = !enable;
+	err = mlx5e_modify_rqs_vsd(priv, !enable);
+	if (err)
+		priv->params.vlan_strip_disable = enable;
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
 static int mlx5e_handle_feature(struct net_device *netdev,
 				netdev_features_t wanted_features,
 				netdev_features_t feature,
@@ -2184,6 +2250,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 				    set_feature_tc_num_filters);
 	err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL,
 				    set_feature_rx_all);
+	err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_VLAN_CTAG_RX,
+				    set_feature_rx_vlan);
 
 	return err ? -EINVAL : 0;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1a170672c656..2cc5e9fd5913 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -45,6 +45,10 @@
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
 
+enum {
+	MLX5_RQ_BITMASK_VSD = 1 << 1,
+};
+
 enum {
 	MLX5_BOARD_ID_LEN = 64,
 	MLX5_MAX_NAME_LEN = 16,
-- 
cgit v1.2.3


From 1b223dd391622fde05e03829d813c3c6cc998685 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Sun, 24 Apr 2016 22:51:56 +0300
Subject: net/mlx5e: Fix checksum handling for non-stripped vlan packets

Now as rx-vlan offload can be disabled, packets can be received
with vlan tag not stripped, which means is_first_ethertype_ip will
return false, for that we need to check if the hardware reported
csum OK so we will report CHECKSUM_UNNECESSARY for those packets.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 20 +++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  8 ++++++--
 include/linux/mlx5/device.h                        | 21 ++++++++++++++++-----
 4 files changed, 38 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6c9c10c131a0..5bad17d37d7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -109,6 +109,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 		s->lro_bytes	+= rq_stats->lro_bytes;
 		s->rx_csum_none	+= rq_stats->csum_none;
 		s->rx_csum_sw	+= rq_stats->csum_sw;
+		s->rx_csum_inner += rq_stats->csum_inner;
 		s->rx_wqe_err   += rq_stats->wqe_err;
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 918b7c7fd74f..23adfe2fcba9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -543,16 +543,26 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 
 	if (lro) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	} else if (likely(is_first_ethertype_ip(skb))) {
+		return;
+	}
+
+	if (is_first_ethertype_ip(skb)) {
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
 		rq->stats.csum_sw++;
-	} else {
-		goto csum_none;
+		return;
 	}
 
-	return;
-
+	if (likely((cqe->hds_ip_ext & CQE_L3_OK) &&
+		   (cqe->hds_ip_ext & CQE_L4_OK))) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (cqe_is_tunneled(cqe)) {
+			skb->csum_level = 1;
+			skb->encapsulation = 1;
+			rq->stats.csum_inner++;
+		}
+		return;
+	}
 csum_none:
 	skb->ip_summed = CHECKSUM_NONE;
 	rq->stats.csum_none++;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 7cd8cb44b2ab..115752b53d85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -62,6 +62,7 @@ struct mlx5e_sw_stats {
 	u64 rx_csum_good;
 	u64 rx_csum_none;
 	u64 rx_csum_sw;
+	u64 rx_csum_inner;
 	u64 tx_csum_offload;
 	u64 tx_csum_inner;
 	u64 tx_queue_stopped;
@@ -90,6 +91,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_good) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_offload) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
@@ -272,8 +274,9 @@ static const struct counter_desc pport_per_prio_pfc_stats_desc[] = {
 struct mlx5e_rq_stats {
 	u64 packets;
 	u64 bytes;
-	u64 csum_none;
 	u64 csum_sw;
+	u64 csum_inner;
+	u64 csum_none;
 	u64 lro_packets;
 	u64 lro_bytes;
 	u64 wqe_err;
@@ -285,8 +288,9 @@ struct mlx5e_rq_stats {
 static const struct counter_desc rq_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, bytes) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_sw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, csum_none) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, lro_bytes) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, wqe_err) },
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 942bccacd7b7..6bd429b53b77 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -645,8 +645,9 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8              rsvd0[2];
-	__be16          wqe_id;
+	u8		outer_l3_tunneled;
+	u8		rsvd0;
+	__be16		wqe_id;
 	u8		lro_tcppsh_abort_dupack;
 	u8		lro_min_ttl;
 	__be16		lro_tcp_win;
@@ -659,7 +660,7 @@ struct mlx5_cqe64 {
 	__be16		slid;
 	__be32		flags_rqpn;
 	u8		hds_ip_ext;
-	u8		l4_hdr_type_etc;
+	u8		l4_l3_hdr_type;
 	__be16		vlan_info;
 	__be32		srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */
 	__be32		imm_inval_pkey;
@@ -680,12 +681,22 @@ static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 
 static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe)
 {
-	return (cqe->l4_hdr_type_etc >> 4) & 0x7;
+	return (cqe->l4_l3_hdr_type >> 4) & 0x7;
+}
+
+static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->l4_l3_hdr_type >> 2) & 0x3;
+}
+
+static inline u8 cqe_is_tunneled(struct mlx5_cqe64 *cqe)
+{
+	return cqe->outer_l3_tunneled & 0x1;
 }
 
 static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe)
 {
-	return !!(cqe->l4_hdr_type_etc & 0x1);
+	return !!(cqe->l4_l3_hdr_type & 0x1);
 }
 
 static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
-- 
cgit v1.2.3


From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 21 Apr 2016 14:14:01 -0400
Subject: audit: add tty field to LOGIN event

The tty field was missing from AUDIT_LOGIN events.

Refactor code to create a new function audit_get_tty(), using it to
replace the call in audit_log_task_info() and to add it to
audit_log_set_loginuid().  Lock and bump the kref to protect it, adding
audit_put_tty() alias to decrement it.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 24 ++++++++++++++++++++++++
 kernel/audit.c        | 18 +++++-------------
 kernel/auditsc.c      |  8 ++++++--
 3 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b40ed5df5542..32cdafb312d8 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,6 +26,7 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
+#include <linux/tty.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
@@ -343,6 +344,23 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->sessionid;
 }
 
+static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+static inline void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -500,6 +518,12 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
 	return -1;
 }
+static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	return NULL;
+}
+static inline void audit_put_tty(struct tty_struct *tty)
+{ }
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
diff --git a/kernel/audit.c b/kernel/audit.c
index f52fbefede09..384374a1d232 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -64,7 +64,6 @@
 #include <linux/security.h>
 #endif
 #include <linux/freezer.h>
-#include <linux/tty.h>
 #include <linux/pid_namespace.h>
 #include <net/netns/generic.h>
 
@@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
 	char comm[sizeof(tsk->comm)];
-	char *tty;
+	struct tty_struct *tty;
 
 	if (!ab)
 		return;
 
 	/* tsk == current */
 	cred = current_cred();
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
-		tty = tsk->signal->tty->name;
-	else
-		tty = "(none)";
-	spin_unlock_irq(&tsk->sighand->siglock);
-
+	tty = audit_get_tty(tsk);
 	audit_log_format(ab,
 			 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
 			 " euid=%u suid=%u fsuid=%u"
@@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 			 from_kgid(&init_user_ns, cred->egid),
 			 from_kgid(&init_user_ns, cred->sgid),
 			 from_kgid(&init_user_ns, cred->fsgid),
-			 tty, audit_get_sessionid(tsk));
-
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(tsk));
+	audit_put_tty(tty);
 	audit_log_format(ab, " comm=");
 	audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-
 	audit_log_d_path_exe(ab, tsk->mm);
 	audit_log_task_context(ab);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 195ffaee50b9..71e14d836e69 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 {
 	struct audit_buffer *ab;
 	uid_t uid, oldloginuid, loginuid;
+	struct tty_struct *tty;
 
 	if (!audit_enabled)
 		return;
@@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
+	tty = audit_get_tty(current);
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 	if (!ab)
 		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
-	audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
-			 oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+			 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+			 oldsessionid, sessionid, !rc);
+	audit_put_tty(tty);
 	audit_log_end(ab);
 }
 
-- 
cgit v1.2.3


From dd80b54b18db3d76a43558daaa6ea3aa67f5aacd Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Wed, 20 Apr 2016 15:39:11 +0200
Subject: USB: LTM also for USB 3.1

LTM is also defined for SS+. The correct test is to check for anything
slower than SS not exactly SS.

Signed-off-by: Oliver Neukum <ONeukum@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 6a9a0c28415d..29aba76017ee 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -720,7 +720,7 @@ extern void usb_enable_ltm(struct usb_device *udev);
 
 static inline bool usb_device_supports_ltm(struct usb_device *udev)
 {
-	if (udev->speed != USB_SPEED_SUPER || !udev->bos || !udev->bos->ss_cap)
+	if (udev->speed < USB_SPEED_SUPER || !udev->bos || !udev->bos->ss_cap)
 		return false;
 	return udev->bos->ss_cap->bmAttributes & USB_LTM_SUPPORT;
 }
-- 
cgit v1.2.3


From fca504f6054f2a62d966afde23a0cfbf9e3dc32f Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Wed, 20 Apr 2016 15:39:12 +0200
Subject: USB: correct intervals for SS+

SS+ also expresses intervals in units of 125ms. Testing must
be for SS or faster, not SS exactly.

Signed-off-by: Oliver neukum <oneukum@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 29aba76017ee..7824f4557d50 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1569,7 +1569,7 @@ static inline void usb_fill_bulk_urb(struct urb *urb,
  * Initializes a interrupt urb with the proper information needed to submit
  * it to a device.
  *
- * Note that High Speed and SuperSpeed interrupt endpoints use a logarithmic
+ * Note that High Speed and SuperSpeed(+) interrupt endpoints use a logarithmic
  * encoding of the endpoint interval, and express polling intervals in
  * microframes (eight per millisecond) rather than in frames (one per
  * millisecond).
@@ -1595,7 +1595,7 @@ static inline void usb_fill_int_urb(struct urb *urb,
 	urb->complete = complete_fn;
 	urb->context = context;
 
-	if (dev->speed == USB_SPEED_HIGH || dev->speed == USB_SPEED_SUPER) {
+	if (dev->speed == USB_SPEED_HIGH || dev->speed >= USB_SPEED_SUPER) {
 		/* make sure interval is within allowed range */
 		interval = clamp(interval, 1, 16);
 
-- 
cgit v1.2.3


From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Apr 2016 21:22:22 -0400
Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header

The functions event_trigger_unlock_commit() and
event_trigger_unlock_commit_regs() are no longer used outside the tracing
system. Move them out of the generic headers and into the local one.

Along with __event_trigger_test_discard() that is only used by them.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 94 --------------------------------------------
 kernel/trace/trace.h         | 94 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5f89a5b0c7e6..70a181cb3585 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -452,100 +452,6 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 	return false;
 }
 
-/*
- * Helper function for event_trigger_unlock_commit{_regs}().
- * If there are event triggers attached to this event that requires
- * filtering against its fields, then they wil be called as the
- * entry already holds the field information of the current event.
- *
- * It also checks if the event should be discarded or not.
- * It is to be discarded if the event is soft disabled and the
- * event was only recorded to process triggers, or if the event
- * filter is active and this event did not match the filters.
- *
- * Returns true if the event is discarded, false otherwise.
- */
-static inline bool
-__event_trigger_test_discard(struct trace_event_file *file,
-			     struct ring_buffer *buffer,
-			     struct ring_buffer_event *event,
-			     void *entry,
-			     enum event_trigger_type *tt)
-{
-	unsigned long eflags = file->flags;
-
-	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry);
-
-	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
-		ring_buffer_discard_commit(buffer, event);
-	else if (!filter_check_discard(file, entry, buffer, event))
-		return false;
-
-	return true;
-}
-
-/**
- * event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- */
-static inline void
-event_trigger_unlock_commit(struct trace_event_file *file,
-			    struct ring_buffer *buffer,
-			    struct ring_buffer_event *event,
-			    void *entry, unsigned long irq_flags, int pc)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
-
-	if (tt)
-		event_triggers_post_call(file, tt, entry);
-}
-
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
-				 struct ring_buffer *buffer,
-				 struct ring_buffer_event *event,
-				 void *entry, unsigned long irq_flags, int pc,
-				 struct pt_regs *regs)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
-						irq_flags, pc, regs);
-
-	if (tt)
-		event_triggers_post_call(file, tt, entry);
-}
-
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
 #else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 727a3d28bce5..c0eac7b1e5a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,100 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that requires
+ * filtering against its fields, then they wil be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct trace_event_file *file,
+			     struct ring_buffer *buffer,
+			     struct ring_buffer_event *event,
+			     void *entry,
+			     enum event_trigger_type *tt)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
+		*tt = event_triggers_call(file, entry);
+
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
+		ring_buffer_discard_commit(buffer, event);
+	else if (!filter_check_discard(file, entry, buffer, event))
+		return false;
+
+	return true;
+}
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ */
+static inline void
+event_trigger_unlock_commit(struct trace_event_file *file,
+			    struct ring_buffer *buffer,
+			    struct ring_buffer_event *event,
+			    void *entry, unsigned long irq_flags, int pc)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct trace_event_file *file,
+				 struct ring_buffer *buffer,
+				 struct ring_buffer_event *event,
+				 void *entry, unsigned long irq_flags, int pc,
+				 struct pt_regs *regs)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
+						irq_flags, pc, regs);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
 #define FILTER_PRED_INVALID	((unsigned short)-1)
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
-- 
cgit v1.2.3


From da20dfe6b50ea4c1a82797b7ee8655a370535d73 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 21 Apr 2016 12:53:29 -0700
Subject: fs: fix over-zealous use of "const"

When I was fixing up const recommendations from checkpatch.pl, I went
overboard. This fixes the warning (during a W=1 build):

include/linux/fs.h:2627:74: warning: type qualifiers ignored on function return type [-Wignored-qualifiers]
static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id)

Reported-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 90477550b935..9847d5c49a0e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2600,7 +2600,7 @@ static const char * const kernel_read_file_str[] = {
 	__kernel_read_file_id(__fid_stringify)
 };
 
-static inline const char * const kernel_read_file_id_str(enum kernel_read_file_id id)
+static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 {
 	if (id < 0 || id >= READING_MAX_ID)
 		return kernel_read_file_str[READING_UNKNOWN];
-- 
cgit v1.2.3


From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2016 10:13:46 -0400
Subject: tracing: Make filter_check_discard() local

Nothing outside of the tracing directory calls filter_check_discard() or
check_filter_check_discard(). They should not be called by modules. Move
their prototypes into the local tracing header and remove their
EXPORT_SYMBOL() macros.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 6 ------
 kernel/trace/trace.c         | 2 --
 kernel/trace/trace.h         | 6 ++++++
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 70a181cb3585..bb383af35cc7 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -413,12 +413,6 @@ enum event_trigger_type {
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
-extern int filter_check_discard(struct trace_event_file *file, void *rec,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event);
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
-				     struct ring_buffer *buffer,
-				     struct ring_buffer_event *event);
 extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
 						   void *rec);
 extern void event_triggers_post_call(struct trace_event_file *file,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46028d47d252..02f5a5f51d49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(filter_check_discard);
 
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
@@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
 
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c0eac7b1e5a6..ee8691c66bfe 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,12 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+extern int filter_check_discard(struct trace_event_file *file, void *rec,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From 188e3c5cd2b672620291e64a21f1598fe91e40b6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 27 Apr 2016 11:56:04 +0200
Subject: tty: provide tty_name() even without CONFIG_TTY

The audit subsystem just started printing the name of the tty,
but that causes a build failure when CONFIG_TTY is disabled:

kernel/built-in.o: In function `audit_log_task_info':
memremap.c:(.text+0x5e34c): undefined reference to `tty_name'
kernel/built-in.o: In function `audit_set_loginuid':
memremap.c:(.text+0x63b34): undefined reference to `tty_name'

This adds tty_name() to the list of functions that are provided
as trivial stubs in that configuration.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: db0a6fb5d97a ("audit: add tty field to LOGIN event")
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/tty.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index d9fb4b043f56..a93cce297832 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -371,6 +371,7 @@ extern void proc_clear_tty(struct task_struct *p);
 extern struct tty_struct *get_current_tty(void);
 /* tty_io.c */
 extern int __init tty_init(void);
+extern const char *tty_name(const struct tty_struct *tty);
 #else
 static inline void console_init(void)
 { }
@@ -391,6 +392,8 @@ static inline struct tty_struct *get_current_tty(void)
 /* tty_io.c */
 static inline int __init tty_init(void)
 { return 0; }
+static inline const char *tty_name(const struct tty_struct *tty)
+{ return "(none)"; }
 #endif
 
 extern struct ktermios tty_std_termios;
@@ -415,7 +418,6 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 	return tty;
 }
 
-extern const char *tty_name(const struct tty_struct *tty);
 extern const char *tty_driver_name(const struct tty_struct *tty);
 extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
 extern int __tty_check_change(struct tty_struct *tty, int sig);
-- 
cgit v1.2.3


From 501e7ef569f4ea2dc7e50773cf6a5d757c94f9b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Apr 2016 15:30:07 -0700
Subject: net-rfs: fix false sharing accessing sd->input_queue_head

sd->input_queue_head is incremented for each processed packet
in process_backlog(), and read from other cpus performing
Out Of Order avoidance in get_rps_cpu()

Moving this field in a separate cache line keeps it mostly
hot for the cpu in process_backlog(), as other cpus will
only read it.

In a stress test, process_backlog() was consuming 6.80 % of cpu cycles,
and the patch reduced the cost to 0.65 %

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 18d8394f2e5d..934ca866562d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2747,11 +2747,15 @@ struct softnet_data {
 	struct sk_buff		*completion_queue;
 
 #ifdef CONFIG_RPS
-	/* Elements below can be accessed between CPUs for RPS */
+	/* input_queue_head should be written by cpu owning this struct,
+	 * and only read by other cpus. Worth using a cache line.
+	 */
+	unsigned int		input_queue_head ____cacheline_aligned_in_smp;
+
+	/* Elements below can be accessed between CPUs for RPS/RFS */
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
-	unsigned int		input_queue_head;
 	unsigned int		input_queue_tail;
 #endif
 	unsigned int		dropped;
-- 
cgit v1.2.3


From 3358d2d9f47af86bdd71edb24b361f72a54ec04e Mon Sep 17 00:00:00 2001
From: Andrew Bresticker <abrestic@chromium.org>
Date: Thu, 18 Jun 2015 17:28:40 -0400
Subject: clk: tegra: Add interface to enable hardware control of SATA/XUSB
 PLLs

On Tegra210, hardware control of the SATA and XUSB pad PLLs must be
done during the UPHY enable sequence rather than the PLLE enable
sequence.  Export functions to do this so that hardware control can
be enabled from the XUSB padctl driver.

Signed-off-by: Andrew Bresticker <abrestic@chromium.org>
Signed-off-by: Rhyland Klein <rklein@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/clk/tegra/clk-tegra210.c | 58 ++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/tegra.h        |  5 ++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c
index 637041fd53ad..3d0edee1f9fe 100644
--- a/drivers/clk/tegra/clk-tegra210.c
+++ b/drivers/clk/tegra/clk-tegra210.c
@@ -175,6 +175,19 @@
 #define UTMIP_PLL_CFG1_FORCE_PLL_ENABLE_POWERDOWN BIT(14)
 #define UTMIP_PLL_CFG1_FORCE_PLL_ACTIVE_POWERDOWN BIT(12)
 
+#define SATA_PLL_CFG0				0x490
+#define SATA_PLL_CFG0_PADPLL_RESET_SWCTL	BIT(0)
+#define SATA_PLL_CFG0_PADPLL_USE_LOCKDET	BIT(2)
+#define SATA_PLL_CFG0_PADPLL_SLEEP_IDDQ		BIT(13)
+#define SATA_PLL_CFG0_SEQ_ENABLE		BIT(24)
+
+#define XUSBIO_PLL_CFG0				0x51c
+#define XUSBIO_PLL_CFG0_PADPLL_RESET_SWCTL	BIT(0)
+#define XUSBIO_PLL_CFG0_CLK_ENABLE_SWCTL	BIT(2)
+#define XUSBIO_PLL_CFG0_PADPLL_USE_LOCKDET	BIT(6)
+#define XUSBIO_PLL_CFG0_PADPLL_SLEEP_IDDQ	BIT(13)
+#define XUSBIO_PLL_CFG0_SEQ_ENABLE		BIT(24)
+
 #define UTMIPLL_HW_PWRDN_CFG0			0x52c
 #define UTMIPLL_HW_PWRDN_CFG0_UTMIPLL_LOCK	BIT(31)
 #define UTMIPLL_HW_PWRDN_CFG0_SEQ_START_STATE	BIT(25)
@@ -416,6 +429,51 @@ static const char *mux_pllmcp_clkm[] = {
 #define PLLU_MISC0_WRITE_MASK		0xbfffffff
 #define PLLU_MISC1_WRITE_MASK		0x00000007
 
+void tegra210_xusb_pll_hw_control_enable(void)
+{
+	u32 val;
+
+	val = readl_relaxed(clk_base + XUSBIO_PLL_CFG0);
+	val &= ~(XUSBIO_PLL_CFG0_CLK_ENABLE_SWCTL |
+		 XUSBIO_PLL_CFG0_PADPLL_RESET_SWCTL);
+	val |= XUSBIO_PLL_CFG0_PADPLL_USE_LOCKDET |
+	       XUSBIO_PLL_CFG0_PADPLL_SLEEP_IDDQ;
+	writel_relaxed(val, clk_base + XUSBIO_PLL_CFG0);
+}
+EXPORT_SYMBOL_GPL(tegra210_xusb_pll_hw_control_enable);
+
+void tegra210_xusb_pll_hw_sequence_start(void)
+{
+	u32 val;
+
+	val = readl_relaxed(clk_base + XUSBIO_PLL_CFG0);
+	val |= XUSBIO_PLL_CFG0_SEQ_ENABLE;
+	writel_relaxed(val, clk_base + XUSBIO_PLL_CFG0);
+}
+EXPORT_SYMBOL_GPL(tegra210_xusb_pll_hw_sequence_start);
+
+void tegra210_sata_pll_hw_control_enable(void)
+{
+	u32 val;
+
+	val = readl_relaxed(clk_base + SATA_PLL_CFG0);
+	val &= ~SATA_PLL_CFG0_PADPLL_RESET_SWCTL;
+	val |= SATA_PLL_CFG0_PADPLL_USE_LOCKDET |
+	       SATA_PLL_CFG0_PADPLL_SLEEP_IDDQ;
+	writel_relaxed(val, clk_base + SATA_PLL_CFG0);
+}
+EXPORT_SYMBOL_GPL(tegra210_sata_pll_hw_control_enable);
+
+void tegra210_sata_pll_hw_sequence_start(void)
+{
+	u32 val;
+
+	val = readl_relaxed(clk_base + SATA_PLL_CFG0);
+	val |= SATA_PLL_CFG0_SEQ_ENABLE;
+	writel_relaxed(val, clk_base + SATA_PLL_CFG0);
+}
+EXPORT_SYMBOL_GPL(tegra210_sata_pll_hw_sequence_start);
+
 static inline void _pll_misc_chk_default(void __iomem *base,
 					struct tegra_clk_pll_params *params,
 					u8 misc_num, u32 default_val, u32 mask)
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 57bf7aab4516..7007a5f48080 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -121,4 +121,9 @@ static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
+extern void tegra210_xusb_pll_hw_control_enable(void);
+extern void tegra210_xusb_pll_hw_sequence_start(void);
+extern void tegra210_sata_pll_hw_control_enable(void);
+extern void tegra210_sata_pll_hw_sequence_start(void);
+
 #endif /* __LINUX_CLK_TEGRA_H_ */
-- 
cgit v1.2.3


From feb26ac31a2a5cb88d86680d9a94916a6343e9e6 Mon Sep 17 00:00:00 2001
From: Chris Bainbridge <chris.bainbridge@gmail.com>
Date: Mon, 25 Apr 2016 13:48:38 +0100
Subject: usb: core: hub: hub_port_init lock controller instead of bus

The XHCI controller presents two USB buses to the system - one for USB2
and one for USB3. The hub init code (hub_port_init) is reentrant but
only locks one bus per thread, leading to a race condition failure when
two threads attempt to simultaneously initialise a USB2 and USB3 device:

[    8.034843] xhci_hcd 0000:00:14.0: Timeout while waiting for setup device command
[   13.183701] usb 3-3: device descriptor read/all, error -110

On a test system this failure occurred on 6% of all boots.

The call traces at the point of failure are:

Call Trace:
 [<ffffffff81b9bab7>] schedule+0x37/0x90
 [<ffffffff817da7cd>] usb_kill_urb+0x8d/0xd0
 [<ffffffff8111e5e0>] ? wake_up_atomic_t+0x30/0x30
 [<ffffffff817dafbe>] usb_start_wait_urb+0xbe/0x150
 [<ffffffff817db10c>] usb_control_msg+0xbc/0xf0
 [<ffffffff817d07de>] hub_port_init+0x51e/0xb70
 [<ffffffff817d4697>] hub_event+0x817/0x1570
 [<ffffffff810f3e6f>] process_one_work+0x1ff/0x620
 [<ffffffff810f3dcf>] ? process_one_work+0x15f/0x620
 [<ffffffff810f4684>] worker_thread+0x64/0x4b0
 [<ffffffff810f4620>] ? rescuer_thread+0x390/0x390
 [<ffffffff810fa7f5>] kthread+0x105/0x120
 [<ffffffff810fa6f0>] ? kthread_create_on_node+0x200/0x200
 [<ffffffff81ba183f>] ret_from_fork+0x3f/0x70
 [<ffffffff810fa6f0>] ? kthread_create_on_node+0x200/0x200

Call Trace:
 [<ffffffff817fd36d>] xhci_setup_device+0x53d/0xa40
 [<ffffffff817fd87e>] xhci_address_device+0xe/0x10
 [<ffffffff817d047f>] hub_port_init+0x1bf/0xb70
 [<ffffffff811247ed>] ? trace_hardirqs_on+0xd/0x10
 [<ffffffff817d4697>] hub_event+0x817/0x1570
 [<ffffffff810f3e6f>] process_one_work+0x1ff/0x620
 [<ffffffff810f3dcf>] ? process_one_work+0x15f/0x620
 [<ffffffff810f4684>] worker_thread+0x64/0x4b0
 [<ffffffff810f4620>] ? rescuer_thread+0x390/0x390
 [<ffffffff810fa7f5>] kthread+0x105/0x120
 [<ffffffff810fa6f0>] ? kthread_create_on_node+0x200/0x200
 [<ffffffff81ba183f>] ret_from_fork+0x3f/0x70
 [<ffffffff810fa6f0>] ? kthread_create_on_node+0x200/0x200

Which results from the two call chains:

hub_port_init
 usb_get_device_descriptor
  usb_get_descriptor
   usb_control_msg
    usb_internal_control_msg
     usb_start_wait_urb
      usb_submit_urb / wait_for_completion_timeout / usb_kill_urb

hub_port_init
 hub_set_address
  xhci_address_device
   xhci_setup_device

Mathias Nyman explains the current behaviour violates the XHCI spec:

 hub_port_reset() will end up moving the corresponding xhci device slot
 to default state.

 As hub_port_reset() is called several times in hub_port_init() it
 sounds reasonable that we could end up with two threads having their
 xhci device slots in default state at the same time, which according to
 xhci 4.5.3 specs still is a big no no:

 "Note: Software shall not transition more than one Device Slot to the
  Default State at a time"

 So both threads fail at their next task after this.
 One fails to read the descriptor, and the other fails addressing the
 device.

Fix this in hub_port_init by locking the USB controller (instead of an
individual bus) to prevent simultaneous initialisation of both buses.

Fixes: 638139eb95d2 ("usb: hub: allow to process more usb hub events in parallel")
Link: https://lkml.org/lkml/2016/2/8/312
Link: https://lkml.org/lkml/2016/2/4/748
Signed-off-by: Chris Bainbridge <chris.bainbridge@gmail.com>
Cc: stable <stable@vger.kernel.org>
Acked-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c  | 15 +++++++++++++--
 drivers/usb/core/hub.c  |  8 ++++----
 include/linux/usb.h     |  3 +--
 include/linux/usb/hcd.h |  1 +
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 2ca2cef7f681..980fc5774151 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -994,7 +994,7 @@ static void usb_bus_init (struct usb_bus *bus)
 	bus->bandwidth_allocated = 0;
 	bus->bandwidth_int_reqs  = 0;
 	bus->bandwidth_isoc_reqs = 0;
-	mutex_init(&bus->usb_address0_mutex);
+	mutex_init(&bus->devnum_next_mutex);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -2521,6 +2521,14 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver,
 		return NULL;
 	}
 	if (primary_hcd == NULL) {
+		hcd->address0_mutex = kmalloc(sizeof(*hcd->address0_mutex),
+				GFP_KERNEL);
+		if (!hcd->address0_mutex) {
+			kfree(hcd);
+			dev_dbg(dev, "hcd address0 mutex alloc failed\n");
+			return NULL;
+		}
+		mutex_init(hcd->address0_mutex);
 		hcd->bandwidth_mutex = kmalloc(sizeof(*hcd->bandwidth_mutex),
 				GFP_KERNEL);
 		if (!hcd->bandwidth_mutex) {
@@ -2532,6 +2540,7 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver,
 		dev_set_drvdata(dev, hcd);
 	} else {
 		mutex_lock(&usb_port_peer_mutex);
+		hcd->address0_mutex = primary_hcd->address0_mutex;
 		hcd->bandwidth_mutex = primary_hcd->bandwidth_mutex;
 		hcd->primary_hcd = primary_hcd;
 		primary_hcd->primary_hcd = primary_hcd;
@@ -2598,8 +2607,10 @@ static void hcd_release(struct kref *kref)
 	struct usb_hcd *hcd = container_of (kref, struct usb_hcd, kref);
 
 	mutex_lock(&usb_port_peer_mutex);
-	if (usb_hcd_is_primary_hcd(hcd))
+	if (usb_hcd_is_primary_hcd(hcd)) {
+		kfree(hcd->address0_mutex);
 		kfree(hcd->bandwidth_mutex);
+	}
 	if (hcd->shared_hcd) {
 		struct usb_hcd *peer = hcd->shared_hcd;
 
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index c2270d8fac12..bee13517676f 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2082,7 +2082,7 @@ static void choose_devnum(struct usb_device *udev)
 	struct usb_bus	*bus = udev->bus;
 
 	/* be safe when more hub events are proceed in parallel */
-	mutex_lock(&bus->usb_address0_mutex);
+	mutex_lock(&bus->devnum_next_mutex);
 	if (udev->wusb) {
 		devnum = udev->portnum + 1;
 		BUG_ON(test_bit(devnum, bus->devmap.devicemap));
@@ -2100,7 +2100,7 @@ static void choose_devnum(struct usb_device *udev)
 		set_bit(devnum, bus->devmap.devicemap);
 		udev->devnum = devnum;
 	}
-	mutex_unlock(&bus->usb_address0_mutex);
+	mutex_unlock(&bus->devnum_next_mutex);
 }
 
 static void release_devnum(struct usb_device *udev)
@@ -4366,7 +4366,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
 	if (oldspeed == USB_SPEED_LOW)
 		delay = HUB_LONG_RESET_TIME;
 
-	mutex_lock(&hdev->bus->usb_address0_mutex);
+	mutex_lock(hcd->address0_mutex);
 
 	/* Reset the device; full speed may morph to high speed */
 	/* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
@@ -4652,7 +4652,7 @@ fail:
 		hub_port_disable(hub, port1, 0);
 		update_devnum(udev, devnum);	/* for disconnect processing */
 	}
-	mutex_unlock(&hdev->bus->usb_address0_mutex);
+	mutex_unlock(hcd->address0_mutex);
 	return retval;
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 7824f4557d50..01b6c61cf9bb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -374,13 +374,12 @@ struct usb_bus {
 
 	int devnum_next;		/* Next open device number in
 					 * round-robin allocation */
+	struct mutex devnum_next_mutex; /* devnum_next mutex */
 
 	struct usb_devmap devmap;	/* device address allocation map */
 	struct usb_device *root_hub;	/* Root hub */
 	struct usb_bus *hs_companion;	/* Companion EHCI bus, if any */
 
-	struct mutex usb_address0_mutex; /* unaddressed device mutex */
-
 	int bandwidth_allocated;	/* on this bus: how much of the time
 					 * reserved for periodic (intr/iso)
 					 * requests is used, on average?
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index b98f831dcda3..66fc13705ab7 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -181,6 +181,7 @@ struct usb_hcd {
 	 * bandwidth_mutex should be dropped after a successful control message
 	 * to the device, or resetting the bandwidth after a failed attempt.
 	 */
+	struct mutex		*address0_mutex;
 	struct mutex		*bandwidth_mutex;
 	struct usb_hcd		*shared_hcd;
 	struct usb_hcd		*primary_hcd;
-- 
cgit v1.2.3


From 0a2cf20c3fb62ad4717276b5303bf831f7b29d54 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Wed, 27 Apr 2016 23:39:01 -0400
Subject: tcp: remove SKBTX_ACK_TSTAMP since it is redundant

The SKBTX_ACK_TSTAMP flag is set in skb_shinfo->tx_flags when
the timestamp of the TCP acknowledgement should be reported on
error queue. Since accessing skb_shinfo is likely to incur a
cache-line miss at the time of receiving the ack, the
txstamp_ack bit was added in tcp_skb_cb, which is set iff
the SKBTX_ACK_TSTAMP flag is set for an skb. This makes
SKBTX_ACK_TSTAMP flag redundant.

Remove the SKBTX_ACK_TSTAMP and instead use the txstamp_ack bit
everywhere.

Note that this frees one bit in shinfo->tx_flags.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Suggested-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  6 +-----
 net/ipv4/tcp.c         |  5 +++--
 net/ipv4/tcp_input.c   |  3 +--
 net/ipv4/tcp_output.c  | 17 +++++++++++------
 net/socket.c           |  3 ---
 5 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a1ce63979ad8..c84a5a1078c5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -382,14 +382,10 @@ enum {
 
 	/* generate software time stamp when entering packet scheduling */
 	SKBTX_SCHED_TSTAMP = 1 << 6,
-
-	/* generate software timestamp on peer data acknowledgment */
-	SKBTX_ACK_TSTAMP = 1 << 7,
 };
 
 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
-				 SKBTX_SCHED_TSTAMP | \
-				 SKBTX_ACK_TSTAMP)
+				 SKBTX_SCHED_TSTAMP)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53890a730ff4..91993782a947 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -435,9 +435,10 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
-		if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+			tcb->txstamp_ack = 1;
+		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
 			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
-		tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d5239c283cb..70c370b93762 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3087,8 +3087,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 		return;
 
 	shinfo = skb_shinfo(skb);
-	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-	    !before(shinfo->tskey, prior_snd_una) &&
+	if (!before(shinfo->tskey, prior_snd_una) &&
 	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
 		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1b2045ac3a9..b3a31b4df57c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1111,11 +1111,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->txstamp_ack ||
+		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
 static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	if (unlikely(tcp_has_tx_tstamp(skb)) &&
 	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
 		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
 		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -2446,13 +2452,12 @@ u32 __tcp_select_window(struct sock *sk)
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb)
 {
-	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
-	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
-	if (unlikely(tsflags)) {
+	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+		const struct skb_shared_info *next_shinfo =
+			skb_shinfo(next_skb);
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		shinfo->tx_flags |= tsflags;
+		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
 		shinfo->tskey = next_shinfo->tskey;
 		TCP_SKB_CB(skb)->txstamp_ack |=
 			TCP_SKB_CB(next_skb)->txstamp_ack;
diff --git a/net/socket.c b/net/socket.c
index 5dbb0bbe12a7..7789d79609dd 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -600,9 +600,6 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
 	if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
 		flags |= SKBTX_SCHED_TSTAMP;
 
-	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
-		flags |= SKBTX_ACK_TSTAMP;
-
 	*tx_flags = flags;
 }
 EXPORT_SYMBOL(__sock_tx_timestamp);
-- 
cgit v1.2.3


From 8453c5cafd32c4d6bd13ec4a62d4b639f4edb222 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 28 Apr 2016 08:21:03 -0700
Subject: ARM: OMAP2+: Add more functions to pwm pdata for ir-rx51

Before we start removing omap3 legacy booting support, let's make n900
DT booting behave the same way for ir-rx51 as the legacy booting does.

For now, we need to pass pdata to the ir-rx51 driver. This means that
the n900 tree can move to using DT based booting without having to carry
all the legacy platform data with it when it gets dropped from the mainline
tree.

Note that the ir-rx51 driver is currently disabled because of the
dependency to !ARCH_MULTIPLATFORM. This will get sorted out later
with the help of drivers/pwm/pwm-omap-dmtimer.c. But first we need
to add chained IRQ support to dmtimer code to avoid introducing new
custom frameworks.

So let's just pass the necessary dmtimer functions to ir-rx51 so we
can get it working in the following patch.

Cc: Neil Armstrong <narmstrong@baylibre.com>
Tested-by: Ivaylo Dimitrov <ivo.g.dimitrov.75@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/board-rx51-peripherals.c   | 35 ++++++++++++++++++++++++--
 arch/arm/mach-omap2/pdata-quirks.c             | 33 +++++++++++++++++++++++-
 include/linux/platform_data/media/ir-rx51.h    |  1 +
 include/linux/platform_data/pwm_omap_dmtimer.h | 21 ++++++++++++++++
 4 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/board-rx51-peripherals.c b/arch/arm/mach-omap2/board-rx51-peripherals.c
index da174c0d603b..9a7073949d1d 100644
--- a/arch/arm/mach-omap2/board-rx51-peripherals.c
+++ b/arch/arm/mach-omap2/board-rx51-peripherals.c
@@ -30,6 +30,8 @@
 #include <linux/platform_data/spi-omap2-mcspi.h>
 #include <linux/platform_data/mtd-onenand-omap2.h>
 
+#include <plat/dmtimer.h>
+
 #include <asm/system_info.h>
 
 #include "common.h"
@@ -47,9 +49,8 @@
 
 #include <video/omap-panel-data.h>
 
-#if defined(CONFIG_IR_RX51) || defined(CONFIG_IR_RX51_MODULE)
+#include <linux/platform_data/pwm_omap_dmtimer.h>
 #include <linux/platform_data/media/ir-rx51.h>
-#endif
 
 #include "mux.h"
 #include "omap-pm.h"
@@ -1212,10 +1213,40 @@ static void __init rx51_init_tsc2005(void)
 				gpio_to_irq(RX51_TSC2005_IRQ_GPIO);
 }
 
+#if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
+static struct pwm_omap_dmtimer_pdata __maybe_unused pwm_dmtimer_pdata = {
+	.request_by_node = omap_dm_timer_request_by_node,
+	.request_specific = omap_dm_timer_request_specific,
+	.request = omap_dm_timer_request,
+	.set_source = omap_dm_timer_set_source,
+	.get_irq = omap_dm_timer_get_irq,
+	.set_int_enable = omap_dm_timer_set_int_enable,
+	.set_int_disable = omap_dm_timer_set_int_disable,
+	.free = omap_dm_timer_free,
+	.enable = omap_dm_timer_enable,
+	.disable = omap_dm_timer_disable,
+	.get_fclk = omap_dm_timer_get_fclk,
+	.start = omap_dm_timer_start,
+	.stop = omap_dm_timer_stop,
+	.set_load = omap_dm_timer_set_load,
+	.set_match = omap_dm_timer_set_match,
+	.set_pwm = omap_dm_timer_set_pwm,
+	.set_prescaler = omap_dm_timer_set_prescaler,
+	.read_counter = omap_dm_timer_read_counter,
+	.write_counter = omap_dm_timer_write_counter,
+	.read_status = omap_dm_timer_read_status,
+	.write_status = omap_dm_timer_write_status,
+};
+#endif
+
 #if defined(CONFIG_IR_RX51) || defined(CONFIG_IR_RX51_MODULE)
 static struct lirc_rx51_platform_data rx51_lirc_data = {
 	.set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat,
 	.pwm_timer = 9, /* Use GPT 9 for CIR */
+#if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
+	.dmtimer = &pwm_dmtimer_pdata,
+#endif
+
 };
 
 static struct platform_device rx51_lirc_device = {
diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index a935d28443da..f8d389d031ba 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -24,6 +24,7 @@
 #include <linux/platform_data/iommu-omap.h>
 #include <linux/platform_data/wkup_m3.h>
 #include <linux/platform_data/pwm_omap_dmtimer.h>
+#include <linux/platform_data/media/ir-rx51.h>
 #include <plat/dmtimer.h>
 
 #include "common.h"
@@ -31,6 +32,7 @@
 #include "dss-common.h"
 #include "control.h"
 #include "omap_device.h"
+#include "omap-pm.h"
 #include "omap-secure.h"
 #include "soc.h"
 #include "hsmmc.h"
@@ -268,6 +270,8 @@ static struct platform_device omap3_rom_rng_device = {
 	},
 };
 
+static struct platform_device rx51_lirc_device;
+
 static void __init nokia_n900_legacy_init(void)
 {
 	hsmmc2_internal_input_clk();
@@ -286,6 +290,8 @@ static void __init nokia_n900_legacy_init(void)
 		platform_device_register(&omap3_rom_rng_device);
 
 	}
+
+	platform_device_register(&rx51_lirc_device);
 }
 
 static void __init omap3_tao3530_legacy_init(void)
@@ -453,8 +459,14 @@ void omap_auxdata_legacy_init(struct device *dev)
 
 /* Dual mode timer PWM callbacks platdata */
 #if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
-struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = {
+static struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = {
 	.request_by_node = omap_dm_timer_request_by_node,
+	.request_specific = omap_dm_timer_request_specific,
+	.request = omap_dm_timer_request,
+	.set_source = omap_dm_timer_set_source,
+	.get_irq = omap_dm_timer_get_irq,
+	.set_int_enable = omap_dm_timer_set_int_enable,
+	.set_int_disable = omap_dm_timer_set_int_disable,
 	.free = omap_dm_timer_free,
 	.enable = omap_dm_timer_enable,
 	.disable = omap_dm_timer_disable,
@@ -465,10 +477,29 @@ struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = {
 	.set_match = omap_dm_timer_set_match,
 	.set_pwm = omap_dm_timer_set_pwm,
 	.set_prescaler = omap_dm_timer_set_prescaler,
+	.read_counter = omap_dm_timer_read_counter,
 	.write_counter = omap_dm_timer_write_counter,
+	.read_status = omap_dm_timer_read_status,
+	.write_status = omap_dm_timer_write_status,
 };
 #endif
 
+static struct lirc_rx51_platform_data __maybe_unused rx51_lirc_data = {
+	.set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat,
+	.pwm_timer = 9, /* Use GPT 9 for CIR */
+#if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
+	.dmtimer = &pwm_dmtimer_pdata,
+#endif
+};
+
+static struct platform_device __maybe_unused rx51_lirc_device = {
+	.name           = "lirc_rx51",
+	.id             = -1,
+	.dev            = {
+		.platform_data = &rx51_lirc_data,
+	},
+};
+
 /*
  * Few boards still need auxdata populated before we populate
  * the dev entries in of_platform_populate().
diff --git a/include/linux/platform_data/media/ir-rx51.h b/include/linux/platform_data/media/ir-rx51.h
index 104aa892f31b..3038120ca46e 100644
--- a/include/linux/platform_data/media/ir-rx51.h
+++ b/include/linux/platform_data/media/ir-rx51.h
@@ -5,6 +5,7 @@ struct lirc_rx51_platform_data {
 	int pwm_timer;
 
 	int(*set_max_mpu_wakeup_lat)(struct device *dev, long t);
+	struct pwm_omap_dmtimer_pdata *dmtimer;
 };
 
 #endif
diff --git a/include/linux/platform_data/pwm_omap_dmtimer.h b/include/linux/platform_data/pwm_omap_dmtimer.h
index 59384217208f..e7d521e48855 100644
--- a/include/linux/platform_data/pwm_omap_dmtimer.h
+++ b/include/linux/platform_data/pwm_omap_dmtimer.h
@@ -35,6 +35,16 @@
 #ifndef __PWM_OMAP_DMTIMER_PDATA_H
 #define __PWM_OMAP_DMTIMER_PDATA_H
 
+/* clock sources */
+#define PWM_OMAP_DMTIMER_SRC_SYS_CLK			0x00
+#define PWM_OMAP_DMTIMER_SRC_32_KHZ			0x01
+#define PWM_OMAP_DMTIMER_SRC_EXT_CLK			0x02
+
+/* timer interrupt enable bits */
+#define PWM_OMAP_DMTIMER_INT_CAPTURE			(1 << 2)
+#define PWM_OMAP_DMTIMER_INT_OVERFLOW			(1 << 1)
+#define PWM_OMAP_DMTIMER_INT_MATCH			(1 << 0)
+
 /* trigger types */
 #define PWM_OMAP_DMTIMER_TRIGGER_NONE			0x00
 #define PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW		0x01
@@ -45,15 +55,23 @@ typedef struct omap_dm_timer pwm_omap_dmtimer;
 
 struct pwm_omap_dmtimer_pdata {
 	pwm_omap_dmtimer *(*request_by_node)(struct device_node *np);
+	pwm_omap_dmtimer *(*request_specific)(int timer_id);
+	pwm_omap_dmtimer *(*request)(void);
+
 	int	(*free)(pwm_omap_dmtimer *timer);
 
 	void	(*enable)(pwm_omap_dmtimer *timer);
 	void	(*disable)(pwm_omap_dmtimer *timer);
 
+	int	(*get_irq)(pwm_omap_dmtimer *timer);
+	int	(*set_int_enable)(pwm_omap_dmtimer *timer, unsigned int value);
+	int	(*set_int_disable)(pwm_omap_dmtimer *timer, u32 mask);
+
 	struct clk *(*get_fclk)(pwm_omap_dmtimer *timer);
 
 	int	(*start)(pwm_omap_dmtimer *timer);
 	int	(*stop)(pwm_omap_dmtimer *timer);
+	int	(*set_source)(pwm_omap_dmtimer *timer, int source);
 
 	int	(*set_load)(pwm_omap_dmtimer *timer, int autoreload,
 			unsigned int value);
@@ -63,7 +81,10 @@ struct pwm_omap_dmtimer_pdata {
 			int toggle, int trigger);
 	int	(*set_prescaler)(pwm_omap_dmtimer *timer, int prescaler);
 
+	unsigned int (*read_counter)(pwm_omap_dmtimer *timer);
 	int	(*write_counter)(pwm_omap_dmtimer *timer, unsigned int value);
+	unsigned int (*read_status)(pwm_omap_dmtimer *timer);
+	int	(*write_status)(pwm_omap_dmtimer *timer, unsigned int value);
 };
 
 #endif /* __PWM_OMAP_DMTIMER_PDATA_H */
-- 
cgit v1.2.3


From 40abf9be8f52d440e442206182916e3dcc68f722 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Mon, 11 Apr 2016 15:02:28 -0700
Subject: libnvdimm: increase max envelope size for ioctl

nd_ioctl() must first read in the fixed sized portion of an ioctl so
that it can then determine the size of the variable part.

Prepare for ND_CMD_CALL calls which have larger fixed portion
envelope.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/libnvdimm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 833867b9ddc2..af31d1c6fdd7 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -27,7 +27,7 @@ enum {
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,
 	ND_CMD_MAX_ELEM = 5,
-	ND_CMD_MAX_ENVELOPE = 16,
+	ND_CMD_MAX_ENVELOPE = 256,
 	ND_MAX_MAPPINGS = 32,
 
 	/* region flag indicating to direct-map persistent memory by default */
-- 
cgit v1.2.3


From e3654eca70d63704c94a60a2aafc0b3c7b46a00b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 16:17:07 -0700
Subject: nfit, libnvdimm: clarify "commands" vs "_DSMs"

Clarify the distinction between "commands", the ioctls userspace calls
to request the kernel take some action on a given dimm device, and
"_DSMs", the actual function numbers used in the firmware interface to
the DIMM.  _DSMs are ACPI specific whereas commands are Linux kernel
generic.

This is in preparation for breaking the 1:1 implicit relationship
between the kernel ioctl number space and the firmware specific function
numbers.

Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c              | 21 +++++++++++++--------
 drivers/acpi/nfit.h              |  4 ++--
 drivers/nvdimm/bus.c             |  8 ++++----
 drivers/nvdimm/core.c            |  2 +-
 drivers/nvdimm/dimm_devs.c       | 18 ++++++++++++------
 drivers/nvdimm/nd-core.h         |  2 +-
 include/linux/libnvdimm.h        |  5 +++--
 tools/testing/nvdimm/test/nfit.c | 27 ++++++++++++++-------------
 8 files changed, 50 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index d0f35e63640b..1b98e9dc6138 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -175,7 +175,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	union acpi_object in_obj, in_buf, *out_obj;
 	struct device *dev = acpi_desc->dev;
 	const char *cmd_name, *dimm_name;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	acpi_handle handle;
 	const u8 *uuid;
 	u32 offset;
@@ -189,7 +189,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			return -ENOTTY;
 		dimm_name = nvdimm_name(nvdimm);
 		cmd_name = nvdimm_cmd_name(cmd);
-		dsm_mask = nfit_mem->dsm_mask;
+		cmd_mask = nvdimm_cmd_mask(nvdimm);
 		desc = nd_cmd_dimm_desc(cmd);
 		uuid = to_nfit_uuid(NFIT_DEV_DIMM);
 		handle = adev->handle;
@@ -197,7 +197,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct acpi_device *adev = to_acpi_dev(acpi_desc);
 
 		cmd_name = nvdimm_bus_cmd_name(cmd);
-		dsm_mask = nd_desc->dsm_mask;
+		cmd_mask = nd_desc->cmd_mask;
 		desc = nd_cmd_bus_desc(cmd);
 		uuid = to_nfit_uuid(NFIT_DEV_BUS);
 		handle = adev->handle;
@@ -207,7 +207,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
 		return -ENOTTY;
 
-	if (!test_bit(cmd, &dsm_mask))
+	if (!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
 
 	in_obj.type = ACPI_TYPE_PACKAGE;
@@ -926,7 +926,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
 	int i;
 
-	nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
+	/* nfit test assumes 1:1 relationship between commands and dsms */
+	nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return 0;
@@ -976,9 +977,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (rc)
 			continue;
 
+		/*
+		 * For now there is 1:1 relationship between cmd_mask and
+		 * dsm_mask.
+		 */
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
 				acpi_nfit_dimm_attribute_groups,
-				flags, &nfit_mem->dsm_mask);
+				flags, nfit_mem->dsm_mask);
 		if (!nvdimm)
 			return -ENOMEM;
 
@@ -1007,14 +1012,14 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	struct acpi_device *adev;
 	int i;
 
-	nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en;
+	nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return;
 
 	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
 		if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
-			set_bit(i, &nd_desc->dsm_mask);
+			set_bit(i, &nd_desc->cmd_mask);
 }
 
 static ssize_t range_index_show(struct device *dev,
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c75576b2d50e..332ee6f01662 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -132,8 +132,8 @@ struct acpi_nfit_desc {
 	size_t ars_status_size;
 	struct work_struct work;
 	unsigned int cancel:1;
-	unsigned long dimm_dsm_force_en;
-	unsigned long bus_dsm_force_en;
+	unsigned long dimm_cmd_force_en;
+	unsigned long bus_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 19f822d7f652..cb2042a12b76 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -589,24 +589,24 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	void __user *p = (void __user *) arg;
 	struct device *dev = &nvdimm_bus->dev;
 	const char *cmd_name, *dimm_name;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	void *buf;
 	int rc, i;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
 		cmd_name = nvdimm_cmd_name(cmd);
-		dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0;
+		cmd_mask = nvdimm->cmd_mask;
 		dimm_name = dev_name(&nvdimm->dev);
 	} else {
 		desc = nd_cmd_bus_desc(cmd);
 		cmd_name = nvdimm_bus_cmd_name(cmd);
-		dsm_mask = nd_desc->dsm_mask;
+		cmd_mask = nd_desc->cmd_mask;
 		dimm_name = "bus";
 	}
 
 	if (!desc || (desc->out_num + desc->in_num == 0) ||
-			!test_bit(cmd, &dsm_mask))
+			!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
 
 	/* fail write commands (when read-only) */
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 182a93fe3712..e8688a13cf4f 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -251,7 +251,7 @@ static ssize_t commands_show(struct device *dev,
 	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
 
-	for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG)
+	for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG)
 		len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
 	len += sprintf(buf + len, "\n");
 	return len;
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index c56f88217924..79a35a02053c 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -37,9 +37,9 @@ static int __validate_dimm(struct nvdimm_drvdata *ndd)
 
 	nvdimm = to_nvdimm(ndd->dev);
 
-	if (!nvdimm->dsm_mask)
+	if (!nvdimm->cmd_mask)
 		return -ENXIO;
-	if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask))
+	if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask))
 		return -ENXIO;
 
 	return 0;
@@ -263,6 +263,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm)
 }
 EXPORT_SYMBOL_GPL(nvdimm_name);
 
+unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm)
+{
+	return nvdimm->cmd_mask;
+}
+EXPORT_SYMBOL_GPL(nvdimm_cmd_mask);
+
 void *nvdimm_provider_data(struct nvdimm *nvdimm)
 {
 	if (nvdimm)
@@ -277,10 +283,10 @@ static ssize_t commands_show(struct device *dev,
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 	int cmd, len = 0;
 
-	if (!nvdimm->dsm_mask)
+	if (!nvdimm->cmd_mask)
 		return sprintf(buf, "\n");
 
-	for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG)
+	for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG)
 		len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
 	len += sprintf(buf + len, "\n");
 	return len;
@@ -340,7 +346,7 @@ EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
 
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long *dsm_mask)
+		unsigned long cmd_mask)
 {
 	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
 	struct device *dev;
@@ -355,7 +361,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 	}
 	nvdimm->provider_data = provider_data;
 	nvdimm->flags = flags;
-	nvdimm->dsm_mask = dsm_mask;
+	nvdimm->cmd_mask = cmd_mask;
 	atomic_set(&nvdimm->busy, 0);
 	dev = &nvdimm->dev;
 	dev_set_name(dev, "nmem%d", nvdimm->id);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 1d1500f3d8b5..da0d322ed7cb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -37,7 +37,7 @@ struct nvdimm_bus {
 struct nvdimm {
 	unsigned long flags;
 	void *provider_data;
-	unsigned long *dsm_mask;
+	unsigned long cmd_mask;
 	struct device dev;
 	atomic_t busy;
 	int id;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index af31d1c6fdd7..0c3c30cbbea5 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -68,7 +68,7 @@ struct nd_mapping {
 
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	char *provider_name;
 	ndctl_fn ndctl;
 	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
@@ -130,10 +130,11 @@ struct nd_region *to_nd_region(struct device *dev);
 struct nd_blk_region *to_nd_blk_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
+unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long *dsm_mask);
+		unsigned long cmd_mask);
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
 u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 3187322eeed7..ed899a411c22 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -344,8 +344,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 
 	if (nvdimm) {
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+		unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
 
-		if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
+		if (!nfit_mem || !test_bit(cmd, &cmd_mask))
 			return -ENOTTY;
 
 		/* lookup label space for the given dimm */
@@ -374,7 +375,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	} else {
 		struct ars_state *ars_state = &t->ars_state;
 
-		if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask))
+		if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
 			return -ENOTTY;
 
 		switch (cmd) {
@@ -1251,13 +1252,13 @@ static void nfit_test0_setup(struct nfit_test *t)
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
 
 	acpi_desc = &t->acpi_desc;
-	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1315,10 +1316,10 @@ static void nfit_test1_setup(struct nfit_test *t)
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
 
 	acpi_desc = &t->acpi_desc;
-	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-- 
cgit v1.2.3


From 92b4423e3a0bc5d43ecde4bcad871f8b5ba04efd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 29 Apr 2016 10:39:34 +0200
Subject: netfilter: fix IS_ERR_VALUE usage

This is a forward-port of the original patch from Andrzej Hajda,
he said:

"IS_ERR_VALUE should be used only with unsigned long type.
Otherwise it can work incorrectly. To achieve this function
xt_percpu_counter_alloc is modified to return unsigned long,
and its result is assigned to temporary variable to perform
error checking, before assigning to .pcnt field.

The patch follows conclusion from discussion on LKML [1][2].

[1]: http://permalink.gmane.org/gmane.linux.kernel/2120927
[2]: http://permalink.gmane.org/gmane.linux.kernel/2150581"

Original patch from Andrzej is here:

http://patchwork.ozlabs.org/patch/582970/

This patch has clashed with input validation fixes for x_tables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 6 +++---
 net/ipv4/netfilter/arp_tables.c    | 6 ++++--
 net/ipv4/netfilter/ip_tables.c     | 6 ++++--
 net/ipv6/netfilter/ip6_tables.c    | 6 ++++--
 4 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 4dd9306c9d56..dc4f58a3cdcc 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -380,16 +380,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
  * allows us to return 0 for single core systems without forcing
  * callers to deal with SMP vs. NONSMP issues.
  */
-static inline u64 xt_percpu_counter_alloc(void)
+static inline unsigned long xt_percpu_counter_alloc(void)
 {
 	if (nr_cpu_ids > 1) {
 		void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
 						    sizeof(struct xt_counters));
 
 		if (res == NULL)
-			return (u64) -ENOMEM;
+			return -ENOMEM;
 
-		return (u64) (__force unsigned long) res;
+		return (__force unsigned long) res;
 	}
 
 	return 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 60f5161abcb4..3355ed72051d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -513,11 +513,13 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
+	unsigned long pcnt;
 	int ret;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 735d1ee8c1ab..21ccc19e1e6f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -656,10 +656,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
+	unsigned long pcnt;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73e606c719ef..17874e83a950 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -669,10 +669,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
+	unsigned long pcnt;
 
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
+	pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(pcnt))
 		return -ENOMEM;
+	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
-- 
cgit v1.2.3


From e81591815de05572ed28cbdca631d4d97f0bd059 Mon Sep 17 00:00:00 2001
From: Jiang Qiu <qiujiang@huawei.com>
Date: Thu, 28 Apr 2016 17:32:01 +0800
Subject: gpio: dwapb: remove name from dwapb_port_property

This patch removed the name property from dwapb_port_property.
The name property is redundant, since we can get this info
from dwapb_gpio dev node.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Jiang Qiu <qiujiang@huawei.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-dwapb.c                | 24 +++++++++++-------------
 drivers/mfd/intel_quark_i2c_gpio.c       |  1 -
 include/linux/platform_data/gpio-dwapb.h |  1 -
 3 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c
index 597de1ef497b..772d74383253 100644
--- a/drivers/gpio/gpio-dwapb.c
+++ b/drivers/gpio/gpio-dwapb.c
@@ -409,8 +409,8 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio,
 	err = bgpio_init(&port->gc, gpio->dev, 4, dat, set, NULL, dirout,
 			 NULL, false);
 	if (err) {
-		dev_err(gpio->dev, "failed to init gpio chip for %s\n",
-			pp->name);
+		dev_err(gpio->dev, "failed to init gpio chip for port%d\n",
+			port->idx);
 		return err;
 	}
 
@@ -429,8 +429,8 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio,
 
 	err = gpiochip_add_data(&port->gc, port);
 	if (err)
-		dev_err(gpio->dev, "failed to register gpiochip for %s\n",
-			pp->name);
+		dev_err(gpio->dev, "failed to register gpiochip for port%d\n",
+			port->idx);
 	else
 		port->is_registered = true;
 
@@ -480,15 +480,16 @@ dwapb_gpio_get_pdata_of(struct device *dev)
 
 		if (of_property_read_u32(port_np, "reg", &pp->idx) ||
 		    pp->idx >= DWAPB_MAX_PORTS) {
-			dev_err(dev, "missing/invalid port index for %s\n",
-				port_np->full_name);
+			dev_err(dev,
+				"missing/invalid port index for port%d\n", i);
 			return ERR_PTR(-EINVAL);
 		}
 
 		if (of_property_read_u32(port_np, "snps,nr-gpios",
 					 &pp->ngpio)) {
-			dev_info(dev, "failed to get number of gpios for %s\n",
-				 port_np->full_name);
+			dev_info(dev,
+				 "failed to get number of gpios for port%d\n",
+				 i);
 			pp->ngpio = 32;
 		}
 
@@ -499,15 +500,12 @@ dwapb_gpio_get_pdata_of(struct device *dev)
 		if (pp->idx == 0 &&
 		    of_property_read_bool(port_np, "interrupt-controller")) {
 			pp->irq = irq_of_parse_and_map(port_np, 0);
-			if (!pp->irq) {
-				dev_warn(dev, "no irq for bank %s\n",
-					 port_np->full_name);
-			}
+			if (!pp->irq)
+				dev_warn(dev, "no irq for port%d\n", pp->idx);
 		}
 
 		pp->irq_shared	= false;
 		pp->gpio_base	= -1;
-		pp->name	= port_np->full_name;
 	}
 
 	return pdata;
diff --git a/drivers/mfd/intel_quark_i2c_gpio.c b/drivers/mfd/intel_quark_i2c_gpio.c
index bdc5e27222c0..a4ef99b88924 100644
--- a/drivers/mfd/intel_quark_i2c_gpio.c
+++ b/drivers/mfd/intel_quark_i2c_gpio.c
@@ -220,7 +220,6 @@ static int intel_quark_gpio_setup(struct pci_dev *pdev, struct mfd_cell *cell)
 
 	/* Set the properties for portA */
 	pdata->properties->node		= NULL;
-	pdata->properties->name		= "intel-quark-x1000-gpio-portA";
 	pdata->properties->idx		= 0;
 	pdata->properties->ngpio	= INTEL_QUARK_MFD_NGPIO;
 	pdata->properties->gpio_base	= INTEL_QUARK_MFD_GPIO_BASE;
diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h
index 28702c849af1..955b5790d24a 100644
--- a/include/linux/platform_data/gpio-dwapb.h
+++ b/include/linux/platform_data/gpio-dwapb.h
@@ -16,7 +16,6 @@
 
 struct dwapb_port_property {
 	struct device_node *node;
-	const char	*name;
 	unsigned int	idx;
 	unsigned int	ngpio;
 	unsigned int	gpio_base;
-- 
cgit v1.2.3


From 4ba8cfa79f44a9489b1d562430cb70fb53200adb Mon Sep 17 00:00:00 2001
From: Jiang Qiu <qiujiang@huawei.com>
Date: Thu, 28 Apr 2016 17:32:02 +0800
Subject: gpio: dwapb: convert device node to fwnode

This patch converts device node to fwnode for dwapb driver, so
as to provide a unified fwnode for DT and ACPI bindings.

Tested-by: Alan Tull <delicious.quinoa@gmail.com>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Jiang Qiu <qiujiang@huawei.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-dwapb.c                | 36 +++++++++++++++-----------------
 drivers/mfd/intel_quark_i2c_gpio.c       |  2 +-
 include/linux/platform_data/gpio-dwapb.h |  2 +-
 3 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c
index 772d74383253..7517c2fcba56 100644
--- a/drivers/gpio/gpio-dwapb.c
+++ b/drivers/gpio/gpio-dwapb.c
@@ -22,6 +22,7 @@
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/platform_device.h>
+#include <linux/property.h>
 #include <linux/spinlock.h>
 #include <linux/platform_data/gpio-dwapb.h>
 #include <linux/slab.h>
@@ -290,14 +291,14 @@ static void dwapb_configure_irqs(struct dwapb_gpio *gpio,
 				 struct dwapb_port_property *pp)
 {
 	struct gpio_chip *gc = &port->gc;
-	struct device_node *node = pp->node;
+	struct fwnode_handle  *fwnode = pp->fwnode;
 	struct irq_chip_generic	*irq_gc = NULL;
 	unsigned int hwirq, ngpio = gc->ngpio;
 	struct irq_chip_type *ct;
 	int err, i;
 
-	gpio->domain = irq_domain_add_linear(node, ngpio,
-					     &irq_generic_chip_ops, gpio);
+	gpio->domain = irq_domain_create_linear(fwnode, ngpio,
+						 &irq_generic_chip_ops, gpio);
 	if (!gpio->domain)
 		return;
 
@@ -415,7 +416,7 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio,
 	}
 
 #ifdef CONFIG_OF_GPIO
-	port->gc.of_node = pp->node;
+	port->gc.of_node = to_of_node(pp->fwnode);
 #endif
 	port->gc.ngpio = pp->ngpio;
 	port->gc.base = pp->gpio_base;
@@ -447,19 +448,15 @@ static void dwapb_gpio_unregister(struct dwapb_gpio *gpio)
 }
 
 static struct dwapb_platform_data *
-dwapb_gpio_get_pdata_of(struct device *dev)
+dwapb_gpio_get_pdata(struct device *dev)
 {
-	struct device_node *node, *port_np;
+	struct fwnode_handle *fwnode;
 	struct dwapb_platform_data *pdata;
 	struct dwapb_port_property *pp;
 	int nports;
 	int i;
 
-	node = dev->of_node;
-	if (!IS_ENABLED(CONFIG_OF_GPIO) || !node)
-		return ERR_PTR(-ENODEV);
-
-	nports = of_get_child_count(node);
+	nports = device_get_child_node_count(dev);
 	if (nports == 0)
 		return ERR_PTR(-ENODEV);
 
@@ -474,18 +471,18 @@ dwapb_gpio_get_pdata_of(struct device *dev)
 	pdata->nports = nports;
 
 	i = 0;
-	for_each_child_of_node(node, port_np) {
+	device_for_each_child_node(dev, fwnode)  {
 		pp = &pdata->properties[i++];
-		pp->node = port_np;
+		pp->fwnode = fwnode;
 
-		if (of_property_read_u32(port_np, "reg", &pp->idx) ||
+		if (fwnode_property_read_u32(fwnode, "reg", &pp->idx) ||
 		    pp->idx >= DWAPB_MAX_PORTS) {
 			dev_err(dev,
 				"missing/invalid port index for port%d\n", i);
 			return ERR_PTR(-EINVAL);
 		}
 
-		if (of_property_read_u32(port_np, "snps,nr-gpios",
+		if (fwnode_property_read_u32(fwnode, "snps,nr-gpios",
 					 &pp->ngpio)) {
 			dev_info(dev,
 				 "failed to get number of gpios for port%d\n",
@@ -497,9 +494,10 @@ dwapb_gpio_get_pdata_of(struct device *dev)
 		 * Only port A can provide interrupts in all configurations of
 		 * the IP.
 		 */
-		if (pp->idx == 0 &&
-		    of_property_read_bool(port_np, "interrupt-controller")) {
-			pp->irq = irq_of_parse_and_map(port_np, 0);
+		if (dev->of_node && pp->idx == 0 &&
+			fwnode_property_read_bool(fwnode,
+						  "interrupt-controller")) {
+			pp->irq = irq_of_parse_and_map(to_of_node(fwnode), 0);
 			if (!pp->irq)
 				dev_warn(dev, "no irq for port%d\n", pp->idx);
 		}
@@ -521,7 +519,7 @@ static int dwapb_gpio_probe(struct platform_device *pdev)
 	struct dwapb_platform_data *pdata = dev_get_platdata(dev);
 
 	if (!pdata) {
-		pdata = dwapb_gpio_get_pdata_of(dev);
+		pdata = dwapb_gpio_get_pdata(dev);
 		if (IS_ERR(pdata))
 			return PTR_ERR(pdata);
 	}
diff --git a/drivers/mfd/intel_quark_i2c_gpio.c b/drivers/mfd/intel_quark_i2c_gpio.c
index a4ef99b88924..a24b35fc2b5b 100644
--- a/drivers/mfd/intel_quark_i2c_gpio.c
+++ b/drivers/mfd/intel_quark_i2c_gpio.c
@@ -219,7 +219,7 @@ static int intel_quark_gpio_setup(struct pci_dev *pdev, struct mfd_cell *cell)
 		return -ENOMEM;
 
 	/* Set the properties for portA */
-	pdata->properties->node		= NULL;
+	pdata->properties->fwnode	= NULL;
 	pdata->properties->idx		= 0;
 	pdata->properties->ngpio	= INTEL_QUARK_MFD_NGPIO;
 	pdata->properties->gpio_base	= INTEL_QUARK_MFD_GPIO_BASE;
diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h
index 955b5790d24a..2dc7f4a8ab09 100644
--- a/include/linux/platform_data/gpio-dwapb.h
+++ b/include/linux/platform_data/gpio-dwapb.h
@@ -15,7 +15,7 @@
 #define GPIO_DW_APB_H
 
 struct dwapb_port_property {
-	struct device_node *node;
+	struct fwnode_handle *fwnode;
 	unsigned int	idx;
 	unsigned int	ngpio;
 	unsigned int	gpio_base;
-- 
cgit v1.2.3


From 1140f7c8994a3a2a0d7c4972509d98b792617d39 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 5 Apr 2016 17:17:34 +0200
Subject: phy: core: Allow children node to be overridden

In order to more flexibly support device tree bindings, allow drivers to
override the container of the child nodes. By default the device node of
the PHY provider is assumed to be the parent for children, but bindings
may decide to add additional levels for better organization.

Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 Documentation/phy.txt   | 16 ++++++++++++++--
 drivers/phy/phy-core.c  | 50 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/phy/phy.h | 31 ++++++++++++++++++++----------
 3 files changed, 79 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/phy.txt b/Documentation/phy.txt
index b388c5af9e72..0aa994bd9a91 100644
--- a/Documentation/phy.txt
+++ b/Documentation/phy.txt
@@ -31,16 +31,28 @@ should provide its own implementation of of_xlate. of_xlate is used only for
 dt boot case.
 
 #define of_phy_provider_register(dev, xlate)    \
-        __of_phy_provider_register((dev), THIS_MODULE, (xlate))
+        __of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
 
 #define devm_of_phy_provider_register(dev, xlate)       \
-        __devm_of_phy_provider_register((dev), THIS_MODULE, (xlate))
+        __devm_of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
 
 of_phy_provider_register and devm_of_phy_provider_register macros can be used to
 register the phy_provider and it takes device and of_xlate as
 arguments. For the dt boot case, all PHY providers should use one of the above
 2 macros to register the PHY provider.
 
+Often the device tree nodes associated with a PHY provider will contain a set
+of children that each represent a single PHY. Some bindings may nest the child
+nodes within extra levels for context and extensibility, in which case the low
+level of_phy_provider_register_full() and devm_of_phy_provider_register_full()
+macros can be used to override the node containing the children.
+
+#define of_phy_provider_register_full(dev, children, xlate) \
+	__of_phy_provider_register(dev, children, THIS_MODULE, xlate)
+
+#define devm_of_phy_provider_register_full(dev, children, xlate) \
+	__devm_of_phy_provider_register_full(dev, children, THIS_MODULE, xlate)
+
 void devm_of_phy_provider_unregister(struct device *dev,
 	struct phy_provider *phy_provider);
 void of_phy_provider_unregister(struct phy_provider *phy_provider);
diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index e7e574dc667a..b72e9a3b6429 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -141,7 +141,7 @@ static struct phy_provider *of_phy_provider_lookup(struct device_node *node)
 		if (phy_provider->dev->of_node == node)
 			return phy_provider;
 
-		for_each_child_of_node(phy_provider->dev->of_node, child)
+		for_each_child_of_node(phy_provider->children, child)
 			if (child == node)
 				return phy_provider;
 	}
@@ -811,24 +811,59 @@ EXPORT_SYMBOL_GPL(devm_phy_destroy);
 /**
  * __of_phy_provider_register() - create/register phy provider with the framework
  * @dev: struct device of the phy provider
+ * @children: device node containing children (if different from dev->of_node)
  * @owner: the module owner containing of_xlate
  * @of_xlate: function pointer to obtain phy instance from phy provider
  *
  * Creates struct phy_provider from dev and of_xlate function pointer.
  * This is used in the case of dt boot for finding the phy instance from
  * phy provider.
+ *
+ * If the PHY provider doesn't nest children directly but uses a separate
+ * child node to contain the individual children, the @children parameter
+ * can be used to override the default. If NULL, the default (dev->of_node)
+ * will be used. If non-NULL, the device node must be a child (or further
+ * descendant) of dev->of_node. Otherwise an ERR_PTR()-encoded -EINVAL
+ * error code is returned.
  */
 struct phy_provider *__of_phy_provider_register(struct device *dev,
-	struct module *owner, struct phy * (*of_xlate)(struct device *dev,
-	struct of_phandle_args *args))
+	struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args))
 {
 	struct phy_provider *phy_provider;
 
+	/*
+	 * If specified, the device node containing the children must itself
+	 * be the provider's device node or a child (or further descendant)
+	 * thereof.
+	 */
+	if (children) {
+		struct device_node *parent = of_node_get(children), *next;
+
+		while (parent) {
+			if (parent == dev->of_node)
+				break;
+
+			next = of_get_parent(parent);
+			of_node_put(parent);
+			parent = next;
+		}
+
+		if (!parent)
+			return ERR_PTR(-EINVAL);
+
+		of_node_put(parent);
+	} else {
+		children = dev->of_node;
+	}
+
 	phy_provider = kzalloc(sizeof(*phy_provider), GFP_KERNEL);
 	if (!phy_provider)
 		return ERR_PTR(-ENOMEM);
 
 	phy_provider->dev = dev;
+	phy_provider->children = of_node_get(children);
 	phy_provider->owner = owner;
 	phy_provider->of_xlate = of_xlate;
 
@@ -854,8 +889,9 @@ EXPORT_SYMBOL_GPL(__of_phy_provider_register);
  * on the devres data, then, devres data is freed.
  */
 struct phy_provider *__devm_of_phy_provider_register(struct device *dev,
-	struct module *owner, struct phy * (*of_xlate)(struct device *dev,
-	struct of_phandle_args *args))
+	struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args))
 {
 	struct phy_provider **ptr, *phy_provider;
 
@@ -863,7 +899,8 @@ struct phy_provider *__devm_of_phy_provider_register(struct device *dev,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	phy_provider = __of_phy_provider_register(dev, owner, of_xlate);
+	phy_provider = __of_phy_provider_register(dev, children, owner,
+						  of_xlate);
 	if (!IS_ERR(phy_provider)) {
 		*ptr = phy_provider;
 		devres_add(dev, ptr);
@@ -888,6 +925,7 @@ void of_phy_provider_unregister(struct phy_provider *phy_provider)
 
 	mutex_lock(&phy_provider_mutex);
 	list_del(&phy_provider->list);
+	of_node_put(phy_provider->children);
 	kfree(phy_provider);
 	mutex_unlock(&phy_provider_mutex);
 }
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 8cf05e341cff..a810f2a18842 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -77,6 +77,7 @@ struct phy {
  */
 struct phy_provider {
 	struct device		*dev;
+	struct device_node	*children;
 	struct module		*owner;
 	struct list_head	list;
 	struct phy * (*of_xlate)(struct device *dev,
@@ -93,10 +94,16 @@ struct phy_lookup {
 #define	to_phy(a)	(container_of((a), struct phy, dev))
 
 #define	of_phy_provider_register(dev, xlate)	\
-	__of_phy_provider_register((dev), THIS_MODULE, (xlate))
+	__of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
 
 #define	devm_of_phy_provider_register(dev, xlate)	\
-	__devm_of_phy_provider_register((dev), THIS_MODULE, (xlate))
+	__devm_of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
+
+#define of_phy_provider_register_full(dev, children, xlate) \
+	__of_phy_provider_register(dev, children, THIS_MODULE, xlate)
+
+#define devm_of_phy_provider_register_full(dev, children, xlate) \
+	__devm_of_phy_provider_register(dev, children, THIS_MODULE, xlate)
 
 static inline void phy_set_drvdata(struct phy *phy, void *data)
 {
@@ -147,11 +154,13 @@ struct phy *devm_phy_create(struct device *dev, struct device_node *node,
 void phy_destroy(struct phy *phy);
 void devm_phy_destroy(struct device *dev, struct phy *phy);
 struct phy_provider *__of_phy_provider_register(struct device *dev,
-	struct module *owner, struct phy * (*of_xlate)(struct device *dev,
-	struct of_phandle_args *args));
+	struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args));
 struct phy_provider *__devm_of_phy_provider_register(struct device *dev,
-	struct module *owner, struct phy * (*of_xlate)(struct device *dev,
-	struct of_phandle_args *args));
+	struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args));
 void of_phy_provider_unregister(struct phy_provider *phy_provider);
 void devm_of_phy_provider_unregister(struct device *dev,
 	struct phy_provider *phy_provider);
@@ -312,15 +321,17 @@ static inline void devm_phy_destroy(struct device *dev, struct phy *phy)
 }
 
 static inline struct phy_provider *__of_phy_provider_register(
-	struct device *dev, struct module *owner, struct phy * (*of_xlate)(
-	struct device *dev, struct of_phandle_args *args))
+	struct device *dev, struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args))
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct phy_provider *__devm_of_phy_provider_register(struct device
-	*dev, struct module *owner, struct phy * (*of_xlate)(struct device *dev,
-	struct of_phandle_args *args))
+	*dev, struct device_node *children, struct module *owner,
+	struct phy * (*of_xlate)(struct device *dev,
+				 struct of_phandle_args *args))
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From 53d2a715c24034ee4017f3c15c82bb4a53a07da5 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 11 Nov 2015 18:24:21 +0100
Subject: phy: Add Tegra XUSB pad controller support

Add a new driver for the XUSB pad controller found on NVIDIA Tegra SoCs.
This hardware block used to be exposed as a pin controller, but it turns
out that this isn't a good fit. The new driver and DT binding much more
accurately describe the hardware and are more flexible in supporting new
SoC generations.

Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/phy/Kconfig                        |    2 +
 drivers/phy/Makefile                       |    2 +
 drivers/phy/tegra/Kconfig                  |    8 +
 drivers/phy/tegra/Makefile                 |    5 +
 drivers/phy/tegra/xusb-tegra124.c          | 1752 ++++++++++++++++++++++++++++
 drivers/phy/tegra/xusb.c                   | 1021 ++++++++++++++++
 drivers/phy/tegra/xusb.h                   |  421 +++++++
 drivers/pinctrl/tegra/pinctrl-tegra-xusb.c |   20 +-
 include/linux/phy/tegra/xusb.h             |   30 +
 9 files changed, 3245 insertions(+), 16 deletions(-)
 create mode 100644 drivers/phy/tegra/Kconfig
 create mode 100644 drivers/phy/tegra/Makefile
 create mode 100644 drivers/phy/tegra/xusb-tegra124.c
 create mode 100644 drivers/phy/tegra/xusb.c
 create mode 100644 drivers/phy/tegra/xusb.h
 create mode 100644 include/linux/phy/tegra/xusb.h

(limited to 'include/linux')

diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index 26566db09de0..27e5f6ee9a2a 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -421,4 +421,6 @@ config PHY_CYGNUS_PCIE
 	  Enable this to support the Broadcom Cygnus PCIe PHY.
 	  If unsure, say N.
 
+source "drivers/phy/tegra/Kconfig"
+
 endmenu
diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile
index 24596a96a887..d4f06e69fd9a 100644
--- a/drivers/phy/Makefile
+++ b/drivers/phy/Makefile
@@ -52,3 +52,5 @@ obj-$(CONFIG_PHY_TUSB1210)		+= phy-tusb1210.o
 obj-$(CONFIG_PHY_BRCMSTB_SATA)		+= phy-brcmstb-sata.o
 obj-$(CONFIG_PHY_PISTACHIO_USB)		+= phy-pistachio-usb.o
 obj-$(CONFIG_PHY_CYGNUS_PCIE)		+= phy-bcm-cygnus-pcie.o
+
+obj-$(CONFIG_ARCH_TEGRA) += tegra/
diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig
new file mode 100644
index 000000000000..a3b1de953fb7
--- /dev/null
+++ b/drivers/phy/tegra/Kconfig
@@ -0,0 +1,8 @@
+config PHY_TEGRA_XUSB
+	tristate "NVIDIA Tegra XUSB pad controller driver"
+	depends on ARCH_TEGRA
+	help
+	  Choose this option if you have an NVIDIA Tegra SoC.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called phy-tegra-xusb.
diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile
new file mode 100644
index 000000000000..31150b4337cd
--- /dev/null
+++ b/drivers/phy/tegra/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_PHY_TEGRA_XUSB) += phy-tegra-xusb.o
+
+phy-tegra-xusb-y += xusb.o
+phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_124_SOC) += xusb-tegra124.o
+phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o
diff --git a/drivers/phy/tegra/xusb-tegra124.c b/drivers/phy/tegra/xusb-tegra124.c
new file mode 100644
index 000000000000..119957249a51
--- /dev/null
+++ b/drivers/phy/tegra/xusb-tegra124.c
@@ -0,0 +1,1752 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/mailbox_client.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/reset.h>
+#include <linux/slab.h>
+
+#include <soc/tegra/fuse.h>
+
+#include "xusb.h"
+
+#define FUSE_SKU_CALIB_HS_CURR_LEVEL_PADX_SHIFT(x) ((x) ? 15 : 0)
+#define FUSE_SKU_CALIB_HS_CURR_LEVEL_PAD_MASK 0x3f
+#define FUSE_SKU_CALIB_HS_IREF_CAP_SHIFT 13
+#define FUSE_SKU_CALIB_HS_IREF_CAP_MASK 0x3
+#define FUSE_SKU_CALIB_HS_SQUELCH_LEVEL_SHIFT 11
+#define FUSE_SKU_CALIB_HS_SQUELCH_LEVEL_MASK 0x3
+#define FUSE_SKU_CALIB_HS_TERM_RANGE_ADJ_SHIFT 7
+#define FUSE_SKU_CALIB_HS_TERM_RANGE_ADJ_MASK 0xf
+
+#define XUSB_PADCTL_USB2_PORT_CAP 0x008
+#define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_SHIFT(x) ((x) * 4)
+#define XUSB_PADCTL_USB2_PORT_CAP_PORT_CAP_MASK 0x3
+#define XUSB_PADCTL_USB2_PORT_CAP_DISABLED 0x0
+#define XUSB_PADCTL_USB2_PORT_CAP_HOST 0x1
+#define XUSB_PADCTL_USB2_PORT_CAP_DEVICE 0x2
+#define XUSB_PADCTL_USB2_PORT_CAP_OTG 0x3
+
+#define XUSB_PADCTL_SS_PORT_MAP 0x014
+#define XUSB_PADCTL_SS_PORT_MAP_PORTX_INTERNAL(x) (1 << (((x) * 4) + 3))
+#define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_SHIFT(x) ((x) * 4)
+#define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_MASK(x) (0x7 << ((x) * 4))
+#define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(x, v) (((v) & 0x7) << ((x) * 4))
+#define XUSB_PADCTL_SS_PORT_MAP_PORT_MAP_MASK 0x7
+
+#define XUSB_PADCTL_ELPG_PROGRAM 0x01c
+#define XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_VCORE_DOWN (1 << 26)
+#define XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN_EARLY (1 << 25)
+#define XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN (1 << 24)
+#define XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_VCORE_DOWN(x) (1 << (18 + (x) * 4))
+#define XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN_EARLY(x) \
+							(1 << (17 + (x) * 4))
+#define XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN(x) (1 << (16 + (x) * 4))
+
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL1 0x040
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL1_PLL0_LOCKDET (1 << 19)
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL1_REFCLK_SEL_MASK (0xf << 12)
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL1_PLL_RST (1 << 1)
+
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL2 0x044
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL2_REFCLKBUF_EN (1 << 6)
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL2_TXCLKREF_EN (1 << 5)
+#define XUSB_PADCTL_IOPHY_PLL_P0_CTL2_TXCLKREF_SEL (1 << 4)
+
+#define XUSB_PADCTL_IOPHY_USB3_PADX_CTL2(x) (0x058 + (x) * 4)
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_SHIFT 24
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_MASK 0xff
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_VAL 0x24
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_SHIFT 16
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_MASK 0x3f
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_SHIFT 8
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_MASK 0x3f
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_SHIFT 8
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_MASK 0xffff
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_VAL 0xf070
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_SHIFT 4
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_MASK 0xf
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_VAL 0xf
+
+#define XUSB_PADCTL_IOPHY_USB3_PADX_CTL4(x) (0x068 + (x) * 4)
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_SHIFT 24
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_MASK 0x1f
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_SHIFT 16
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_MASK 0x7f
+#define XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_VAL 0x002008ee
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL2(x) ((x) < 2 ? 0x078 + (x) * 4 : \
+					       0x0f8 + (x) * 4)
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_SHIFT 28
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_MASK 0x3
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_VAL 0x1
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL5(x) ((x) < 2 ? 0x090 + (x) * 4 : \
+					       0x11c + (x) * 4)
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL5_RX_QEYE_EN (1 << 8)
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL6(x) ((x) < 2 ? 0x098 + (x) * 4 : \
+					       0x128 + (x) * 4)
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SHIFT 24
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_G_Z_MASK 0x3f
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_TAP_MASK 0x1f
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_AMP_MASK 0x7f
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT 16
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK 0xff
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_G_Z 0x21
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_TAP 0x32
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_AMP 0x33
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_CTLE_Z 0x48
+#define XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_LATCH_G_Z 0xa1
+
+#define XUSB_PADCTL_USB2_OTG_PADX_CTL0(x) (0x0a0 + (x) * 4)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD_ZI (1 << 21)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD2 (1 << 20)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD (1 << 19)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_SHIFT 14
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_MASK 0x3
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_VAL(x) ((x) ? 0x0 : 0x3)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_SHIFT 6
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_MASK 0x3f
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_VAL 0x0e
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_CURR_LEVEL_SHIFT 0
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_CURR_LEVEL_MASK 0x3f
+
+#define XUSB_PADCTL_USB2_OTG_PADX_CTL1(x) (0x0ac + (x) * 4)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_HS_IREF_CAP_SHIFT 9
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_HS_IREF_CAP_MASK 0x3
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_TERM_RANGE_ADJ_SHIFT 3
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_TERM_RANGE_ADJ_MASK 0x7
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_DR (1 << 2)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_DISC_FORCE_POWERUP (1 << 1)
+#define XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_CHRP_FORCE_POWERUP (1 << 0)
+
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0 0x0b8
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_PD (1 << 12)
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_SHIFT 2
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_MASK 0x7
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_VAL 0x5
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_SHIFT 0
+#define XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_MASK 0x3
+
+#define XUSB_PADCTL_HSIC_PADX_CTL0(x) (0x0c0 + (x) * 4)
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWN_SHIFT 12
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWN_MASK 0x7
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWP_SHIFT 8
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWP_MASK 0x7
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEN_SHIFT 4
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEN_MASK 0x7
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEP_SHIFT 0
+#define XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEP_MASK 0x7
+
+#define XUSB_PADCTL_HSIC_PADX_CTL1(x) (0x0c8 + (x) * 4)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_RPU_STROBE (1 << 10)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_RPU_DATA (1 << 9)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_RPD_STROBE (1 << 8)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_RPD_DATA (1 << 7)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_PD_ZI (1 << 5)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_PD_RX (1 << 4)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_PD_TRX (1 << 3)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_PD_TX (1 << 2)
+#define XUSB_PADCTL_HSIC_PAD_CTL1_AUTO_TERM_EN (1 << 0)
+
+#define XUSB_PADCTL_HSIC_PADX_CTL2(x) (0x0d0 + (x) * 4)
+#define XUSB_PADCTL_HSIC_PAD_CTL2_RX_STROBE_TRIM_SHIFT 4
+#define XUSB_PADCTL_HSIC_PAD_CTL2_RX_STROBE_TRIM_MASK 0x7
+#define XUSB_PADCTL_HSIC_PAD_CTL2_RX_DATA_TRIM_SHIFT 0
+#define XUSB_PADCTL_HSIC_PAD_CTL2_RX_DATA_TRIM_MASK 0x7
+
+#define XUSB_PADCTL_HSIC_STRB_TRIM_CONTROL 0x0e0
+#define XUSB_PADCTL_HSIC_STRB_TRIM_CONTROL_STRB_TRIM_MASK 0x1f
+
+#define XUSB_PADCTL_USB3_PAD_MUX 0x134
+#define XUSB_PADCTL_USB3_PAD_MUX_PCIE_IDDQ_DISABLE(x) (1 << (1 + (x)))
+#define XUSB_PADCTL_USB3_PAD_MUX_SATA_IDDQ_DISABLE(x) (1 << (6 + (x)))
+
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1 0x138
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL1_LOCKDET (1 << 27)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL1_MODE (1 << 24)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL0_REFCLK_NDIV_SHIFT 20
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL0_REFCLK_NDIV_MASK 0x3
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_PWR_OVRD (1 << 3)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_RST (1 << 1)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_IDDQ (1 << 0)
+
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2 0x13c
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL1_CP_CNTL_SHIFT 20
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL1_CP_CNTL_MASK 0xf
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL0_CP_CNTL_SHIFT 16
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL0_CP_CNTL_MASK 0xf
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_TCLKOUT_EN (1 << 12)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_TXCLKREF_SEL (1 << 4)
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_XDIGCLK_SEL_SHIFT 0
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL2_XDIGCLK_SEL_MASK 0x7
+
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL3 0x140
+#define XUSB_PADCTL_IOPHY_PLL_S0_CTL3_RCAL_BYPASS (1 << 7)
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1 0x148
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ_OVRD (1 << 1)
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ (1 << 0)
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL2 0x14c
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL5 0x158
+
+#define XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL6 0x15c
+
+struct tegra124_xusb_fuse_calibration {
+	u32 hs_curr_level[3];
+	u32 hs_iref_cap;
+	u32 hs_term_range_adj;
+	u32 hs_squelch_level;
+};
+
+struct tegra124_xusb_padctl {
+	struct tegra_xusb_padctl base;
+
+	struct tegra124_xusb_fuse_calibration fuse;
+};
+
+static inline struct tegra124_xusb_padctl *
+to_tegra124_xusb_padctl(struct tegra_xusb_padctl *padctl)
+{
+	return container_of(padctl, struct tegra124_xusb_padctl, base);
+}
+
+static int tegra124_xusb_padctl_enable(struct tegra_xusb_padctl *padctl)
+{
+	u32 value;
+
+	mutex_lock(&padctl->lock);
+
+	if (padctl->enable++ > 0)
+		goto out;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN_EARLY;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_VCORE_DOWN;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+out:
+	mutex_unlock(&padctl->lock);
+	return 0;
+}
+
+static int tegra124_xusb_padctl_disable(struct tegra_xusb_padctl *padctl)
+{
+	u32 value;
+
+	mutex_lock(&padctl->lock);
+
+	if (WARN_ON(padctl->enable == 0))
+		goto out;
+
+	if (--padctl->enable > 0)
+		goto out;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_VCORE_DOWN;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN_EARLY;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_AUX_MUX_LP0_CLAMP_EN;
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+out:
+	mutex_unlock(&padctl->lock);
+	return 0;
+}
+
+static int tegra124_usb3_save_context(struct tegra_xusb_padctl *padctl,
+				      unsigned int index)
+{
+	struct tegra_xusb_usb3_port *port;
+	struct tegra_xusb_lane *lane;
+	u32 value, offset;
+
+	port = tegra_xusb_find_usb3_port(padctl, index);
+	if (!port)
+		return -ENODEV;
+
+	port->context_saved = true;
+	lane = port->base.lane;
+
+	if (lane->pad == padctl->pcie)
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL6(lane->index);
+	else
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL6;
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_TAP <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	value = padctl_readl(padctl, offset) >>
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SHIFT;
+	port->tap1 = value & XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_TAP_MASK;
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_AMP <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	value = padctl_readl(padctl, offset) >>
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SHIFT;
+	port->amp = value & XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_AMP_MASK;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_USB3_PADX_CTL4(index));
+	value &= ~((XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_SHIFT) |
+		   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_SHIFT));
+	value |= (port->tap1 <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_SHIFT) |
+		 (port->amp <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_USB3_PADX_CTL4(index));
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_LATCH_G_Z <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_G_Z <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	value = padctl_readl(padctl, offset) >>
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SHIFT;
+	port->ctle_g = value &
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_G_Z_MASK;
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_CTLE_Z <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SEL_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	value = padctl_readl(padctl, offset) >>
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_SHIFT;
+	port->ctle_z = value &
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL6_MISC_OUT_G_Z_MASK;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_USB3_PADX_CTL2(index));
+	value &= ~((XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_SHIFT) |
+		   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_SHIFT));
+	value |= (port->ctle_g <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_SHIFT) |
+		 (port->ctle_z <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_USB3_PADX_CTL2(index));
+
+	return 0;
+}
+
+static int tegra124_hsic_set_idle(struct tegra_xusb_padctl *padctl,
+				  unsigned int index, bool idle)
+{
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	if (idle)
+		value |= XUSB_PADCTL_HSIC_PAD_CTL1_RPD_DATA |
+			 XUSB_PADCTL_HSIC_PAD_CTL1_RPU_STROBE;
+	else
+		value &= ~(XUSB_PADCTL_HSIC_PAD_CTL1_RPD_DATA |
+			   XUSB_PADCTL_HSIC_PAD_CTL1_RPU_STROBE);
+
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	return 0;
+}
+
+#define TEGRA124_LANE(_name, _offset, _shift, _mask, _type)		\
+	{								\
+		.name = _name,						\
+		.offset = _offset,					\
+		.shift = _shift,					\
+		.mask = _mask,						\
+		.num_funcs = ARRAY_SIZE(tegra124_##_type##_functions),	\
+		.funcs = tegra124_##_type##_functions,			\
+	}
+
+static const char * const tegra124_usb2_functions[] = {
+	"snps",
+	"xusb",
+	"uart",
+};
+
+static const struct tegra_xusb_lane_soc tegra124_usb2_lanes[] = {
+	TEGRA124_LANE("usb2-0", 0x004,  0, 0x3, usb2),
+	TEGRA124_LANE("usb2-1", 0x004,  2, 0x3, usb2),
+	TEGRA124_LANE("usb2-2", 0x004,  4, 0x3, usb2),
+};
+
+static struct tegra_xusb_lane *
+tegra124_usb2_lane_probe(struct tegra_xusb_pad *pad, struct device_node *np,
+			 unsigned int index)
+{
+	struct tegra_xusb_usb2_lane *usb2;
+	int err;
+
+	usb2 = kzalloc(sizeof(*usb2), GFP_KERNEL);
+	if (!usb2)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&usb2->base.list);
+	usb2->base.soc = &pad->soc->lanes[index];
+	usb2->base.index = index;
+	usb2->base.pad = pad;
+	usb2->base.np = np;
+
+	err = tegra_xusb_lane_parse_dt(&usb2->base, np);
+	if (err < 0) {
+		kfree(usb2);
+		return ERR_PTR(err);
+	}
+
+	return &usb2->base;
+}
+
+static void tegra124_usb2_lane_remove(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_usb2_lane *usb2 = to_usb2_lane(lane);
+
+	kfree(usb2);
+}
+
+static const struct tegra_xusb_lane_ops tegra124_usb2_lane_ops = {
+	.probe = tegra124_usb2_lane_probe,
+	.remove = tegra124_usb2_lane_remove,
+};
+
+static int tegra124_usb2_phy_init(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_enable(lane->pad->padctl);
+}
+
+static int tegra124_usb2_phy_exit(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_disable(lane->pad->padctl);
+}
+
+static int tegra124_usb2_phy_power_on(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_usb2_lane *usb2 = to_usb2_lane(lane);
+	struct tegra_xusb_usb2_pad *pad = to_usb2_pad(lane->pad);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	struct tegra124_xusb_padctl *priv;
+	struct tegra_xusb_usb2_port *port;
+	unsigned int index = lane->index;
+	u32 value;
+	int err;
+
+	port = tegra_xusb_find_usb2_port(padctl, index);
+	if (!port) {
+		dev_err(&phy->dev, "no port found for USB2 lane %u\n", index);
+		return -ENODEV;
+	}
+
+	priv = to_tegra124_xusb_padctl(padctl);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+	value &= ~((XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_MASK <<
+		    XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_SHIFT) |
+		   (XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_MASK <<
+		    XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_SHIFT));
+	value |= (priv->fuse.hs_squelch_level <<
+		  XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_SHIFT) |
+		 (XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_VAL <<
+		  XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_DISCON_LEVEL_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_PORT_CAP);
+	value &= ~(XUSB_PADCTL_USB2_PORT_CAP_PORT_CAP_MASK <<
+		   XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_SHIFT(index));
+	value |= XUSB_PADCTL_USB2_PORT_CAP_HOST <<
+		XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_SHIFT(index);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_PORT_CAP);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL0(index));
+	value &= ~((XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_CURR_LEVEL_MASK <<
+		    XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_CURR_LEVEL_SHIFT) |
+		   (XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_MASK <<
+		    XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_SHIFT) |
+		   (XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_MASK <<
+		    XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_SHIFT) |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD2 |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL0_PD_ZI);
+	value |= (priv->fuse.hs_curr_level[index] +
+		  usb2->hs_curr_level_offset) <<
+		XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_CURR_LEVEL_SHIFT;
+	value |= XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_VAL <<
+		XUSB_PADCTL_USB2_OTG_PAD_CTL0_HS_SLEW_SHIFT;
+	value |= XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_VAL(index) <<
+		XUSB_PADCTL_USB2_OTG_PAD_CTL0_LS_RSLEW_SHIFT;
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_OTG_PADX_CTL0(index));
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index));
+	value &= ~((XUSB_PADCTL_USB2_OTG_PAD_CTL1_TERM_RANGE_ADJ_MASK <<
+		    XUSB_PADCTL_USB2_OTG_PAD_CTL1_TERM_RANGE_ADJ_SHIFT) |
+		   (XUSB_PADCTL_USB2_OTG_PAD_CTL1_HS_IREF_CAP_MASK <<
+		    XUSB_PADCTL_USB2_OTG_PAD_CTL1_HS_IREF_CAP_SHIFT) |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_DR |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_CHRP_FORCE_POWERUP |
+		   XUSB_PADCTL_USB2_OTG_PAD_CTL1_PD_DISC_FORCE_POWERUP);
+	value |= (priv->fuse.hs_term_range_adj <<
+		  XUSB_PADCTL_USB2_OTG_PAD_CTL1_TERM_RANGE_ADJ_SHIFT) |
+		 (priv->fuse.hs_iref_cap <<
+		  XUSB_PADCTL_USB2_OTG_PAD_CTL1_HS_IREF_CAP_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index));
+
+	err = regulator_enable(port->supply);
+	if (err)
+		return err;
+
+	mutex_lock(&pad->lock);
+
+	if (pad->enable++ > 0)
+		goto out;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+	value &= ~XUSB_PADCTL_USB2_BIAS_PAD_CTL0_PD;
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+
+out:
+	mutex_unlock(&pad->lock);
+	return 0;
+}
+
+static int tegra124_usb2_phy_power_off(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_usb2_pad *pad = to_usb2_pad(lane->pad);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	struct tegra_xusb_usb2_port *port;
+	u32 value;
+
+	port = tegra_xusb_find_usb2_port(padctl, lane->index);
+	if (!port) {
+		dev_err(&phy->dev, "no port found for USB2 lane %u\n",
+			lane->index);
+		return -ENODEV;
+	}
+
+	mutex_lock(&pad->lock);
+
+	if (WARN_ON(pad->enable == 0))
+		goto out;
+
+	if (--pad->enable > 0)
+		goto out;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+	value |= XUSB_PADCTL_USB2_BIAS_PAD_CTL0_PD;
+	padctl_writel(padctl, value, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
+
+out:
+	regulator_disable(port->supply);
+	mutex_unlock(&pad->lock);
+	return 0;
+}
+
+static const struct phy_ops tegra124_usb2_phy_ops = {
+	.init = tegra124_usb2_phy_init,
+	.exit = tegra124_usb2_phy_exit,
+	.power_on = tegra124_usb2_phy_power_on,
+	.power_off = tegra124_usb2_phy_power_off,
+	.owner = THIS_MODULE,
+};
+
+static struct tegra_xusb_pad *
+tegra124_usb2_pad_probe(struct tegra_xusb_padctl *padctl,
+			const struct tegra_xusb_pad_soc *soc,
+			struct device_node *np)
+{
+	struct tegra_xusb_usb2_pad *usb2;
+	struct tegra_xusb_pad *pad;
+	int err;
+
+	usb2 = kzalloc(sizeof(*usb2), GFP_KERNEL);
+	if (!usb2)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&usb2->lock);
+
+	pad = &usb2->base;
+	pad->ops = &tegra124_usb2_lane_ops;
+	pad->soc = soc;
+
+	err = tegra_xusb_pad_init(pad, padctl, np);
+	if (err < 0) {
+		kfree(usb2);
+		goto out;
+	}
+
+	err = tegra_xusb_pad_register(pad, &tegra124_usb2_phy_ops);
+	if (err < 0)
+		goto unregister;
+
+	dev_set_drvdata(&pad->dev, pad);
+
+	return pad;
+
+unregister:
+	device_unregister(&pad->dev);
+out:
+	return ERR_PTR(err);
+}
+
+static void tegra124_usb2_pad_remove(struct tegra_xusb_pad *pad)
+{
+	struct tegra_xusb_usb2_pad *usb2 = to_usb2_pad(pad);
+
+	kfree(usb2);
+}
+
+static const struct tegra_xusb_pad_ops tegra124_usb2_ops = {
+	.probe = tegra124_usb2_pad_probe,
+	.remove = tegra124_usb2_pad_remove,
+};
+
+static const struct tegra_xusb_pad_soc tegra124_usb2_pad = {
+	.name = "usb2",
+	.num_lanes = ARRAY_SIZE(tegra124_usb2_lanes),
+	.lanes = tegra124_usb2_lanes,
+	.ops = &tegra124_usb2_ops,
+};
+
+static const char * const tegra124_ulpi_functions[] = {
+	"snps",
+	"xusb",
+};
+
+static const struct tegra_xusb_lane_soc tegra124_ulpi_lanes[] = {
+	TEGRA124_LANE("ulpi-0", 0x004, 12, 0x1, ulpi),
+};
+
+static struct tegra_xusb_lane *
+tegra124_ulpi_lane_probe(struct tegra_xusb_pad *pad, struct device_node *np,
+			 unsigned int index)
+{
+	struct tegra_xusb_ulpi_lane *ulpi;
+	int err;
+
+	ulpi = kzalloc(sizeof(*ulpi), GFP_KERNEL);
+	if (!ulpi)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ulpi->base.list);
+	ulpi->base.soc = &pad->soc->lanes[index];
+	ulpi->base.index = index;
+	ulpi->base.pad = pad;
+	ulpi->base.np = np;
+
+	err = tegra_xusb_lane_parse_dt(&ulpi->base, np);
+	if (err < 0) {
+		kfree(ulpi);
+		return ERR_PTR(err);
+	}
+
+	return &ulpi->base;
+}
+
+static void tegra124_ulpi_lane_remove(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_ulpi_lane *ulpi = to_ulpi_lane(lane);
+
+	kfree(ulpi);
+}
+
+static const struct tegra_xusb_lane_ops tegra124_ulpi_lane_ops = {
+	.probe = tegra124_ulpi_lane_probe,
+	.remove = tegra124_ulpi_lane_remove,
+};
+
+static int tegra124_ulpi_phy_init(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_enable(lane->pad->padctl);
+}
+
+static int tegra124_ulpi_phy_exit(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_disable(lane->pad->padctl);
+}
+
+static int tegra124_ulpi_phy_power_on(struct phy *phy)
+{
+	return 0;
+}
+
+static int tegra124_ulpi_phy_power_off(struct phy *phy)
+{
+	return 0;
+}
+
+static const struct phy_ops tegra124_ulpi_phy_ops = {
+	.init = tegra124_ulpi_phy_init,
+	.exit = tegra124_ulpi_phy_exit,
+	.power_on = tegra124_ulpi_phy_power_on,
+	.power_off = tegra124_ulpi_phy_power_off,
+	.owner = THIS_MODULE,
+};
+
+static struct tegra_xusb_pad *
+tegra124_ulpi_pad_probe(struct tegra_xusb_padctl *padctl,
+			const struct tegra_xusb_pad_soc *soc,
+			struct device_node *np)
+{
+	struct tegra_xusb_ulpi_pad *ulpi;
+	struct tegra_xusb_pad *pad;
+	int err;
+
+	ulpi = kzalloc(sizeof(*ulpi), GFP_KERNEL);
+	if (!ulpi)
+		return ERR_PTR(-ENOMEM);
+
+	pad = &ulpi->base;
+	pad->ops = &tegra124_ulpi_lane_ops;
+	pad->soc = soc;
+
+	err = tegra_xusb_pad_init(pad, padctl, np);
+	if (err < 0) {
+		kfree(ulpi);
+		goto out;
+	}
+
+	err = tegra_xusb_pad_register(pad, &tegra124_ulpi_phy_ops);
+	if (err < 0)
+		goto unregister;
+
+	dev_set_drvdata(&pad->dev, pad);
+
+	return pad;
+
+unregister:
+	device_unregister(&pad->dev);
+out:
+	return ERR_PTR(err);
+}
+
+static void tegra124_ulpi_pad_remove(struct tegra_xusb_pad *pad)
+{
+	struct tegra_xusb_ulpi_pad *ulpi = to_ulpi_pad(pad);
+
+	kfree(ulpi);
+}
+
+static const struct tegra_xusb_pad_ops tegra124_ulpi_ops = {
+	.probe = tegra124_ulpi_pad_probe,
+	.remove = tegra124_ulpi_pad_remove,
+};
+
+static const struct tegra_xusb_pad_soc tegra124_ulpi_pad = {
+	.name = "ulpi",
+	.num_lanes = ARRAY_SIZE(tegra124_ulpi_lanes),
+	.lanes = tegra124_ulpi_lanes,
+	.ops = &tegra124_ulpi_ops,
+};
+
+static const char * const tegra124_hsic_functions[] = {
+	"snps",
+	"xusb",
+};
+
+static const struct tegra_xusb_lane_soc tegra124_hsic_lanes[] = {
+	TEGRA124_LANE("hsic-0", 0x004, 14, 0x1, hsic),
+	TEGRA124_LANE("hsic-1", 0x004, 15, 0x1, hsic),
+};
+
+static struct tegra_xusb_lane *
+tegra124_hsic_lane_probe(struct tegra_xusb_pad *pad, struct device_node *np,
+			 unsigned int index)
+{
+	struct tegra_xusb_hsic_lane *hsic;
+	int err;
+
+	hsic = kzalloc(sizeof(*hsic), GFP_KERNEL);
+	if (!hsic)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&hsic->base.list);
+	hsic->base.soc = &pad->soc->lanes[index];
+	hsic->base.index = index;
+	hsic->base.pad = pad;
+	hsic->base.np = np;
+
+	err = tegra_xusb_lane_parse_dt(&hsic->base, np);
+	if (err < 0) {
+		kfree(hsic);
+		return ERR_PTR(err);
+	}
+
+	return &hsic->base;
+}
+
+static void tegra124_hsic_lane_remove(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_hsic_lane *hsic = to_hsic_lane(lane);
+
+	kfree(hsic);
+}
+
+static const struct tegra_xusb_lane_ops tegra124_hsic_lane_ops = {
+	.probe = tegra124_hsic_lane_probe,
+	.remove = tegra124_hsic_lane_remove,
+};
+
+static int tegra124_hsic_phy_init(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_enable(lane->pad->padctl);
+}
+
+static int tegra124_hsic_phy_exit(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_disable(lane->pad->padctl);
+}
+
+static int tegra124_hsic_phy_power_on(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_hsic_lane *hsic = to_hsic_lane(lane);
+	struct tegra_xusb_hsic_pad *pad = to_hsic_pad(lane->pad);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	unsigned int index = lane->index;
+	u32 value;
+	int err;
+
+	err = regulator_enable(pad->supply);
+	if (err)
+		return err;
+
+	padctl_writel(padctl, hsic->strobe_trim,
+		      XUSB_PADCTL_HSIC_STRB_TRIM_CONTROL);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	if (hsic->auto_term)
+		value |= XUSB_PADCTL_HSIC_PAD_CTL1_AUTO_TERM_EN;
+	else
+		value &= ~XUSB_PADCTL_HSIC_PAD_CTL1_AUTO_TERM_EN;
+
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL0(index));
+	value &= ~((XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEN_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEN_SHIFT) |
+		   (XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEP_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEP_SHIFT) |
+		   (XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWN_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWN_SHIFT) |
+		   (XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWP_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWP_SHIFT));
+	value |= (hsic->tx_rtune_n <<
+		  XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEN_SHIFT) |
+		(hsic->tx_rtune_p <<
+		  XUSB_PADCTL_HSIC_PAD_CTL0_TX_RTUNEP_SHIFT) |
+		(hsic->tx_rslew_n <<
+		 XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWN_SHIFT) |
+		(hsic->tx_rslew_p <<
+		 XUSB_PADCTL_HSIC_PAD_CTL0_TX_RSLEWP_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL0(index));
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL2(index));
+	value &= ~((XUSB_PADCTL_HSIC_PAD_CTL2_RX_STROBE_TRIM_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL2_RX_STROBE_TRIM_SHIFT) |
+		   (XUSB_PADCTL_HSIC_PAD_CTL2_RX_DATA_TRIM_MASK <<
+		    XUSB_PADCTL_HSIC_PAD_CTL2_RX_DATA_TRIM_SHIFT));
+	value |= (hsic->rx_strobe_trim <<
+		  XUSB_PADCTL_HSIC_PAD_CTL2_RX_STROBE_TRIM_SHIFT) |
+		(hsic->rx_data_trim <<
+		 XUSB_PADCTL_HSIC_PAD_CTL2_RX_DATA_TRIM_SHIFT);
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL2(index));
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+	value &= ~(XUSB_PADCTL_HSIC_PAD_CTL1_RPD_STROBE |
+		   XUSB_PADCTL_HSIC_PAD_CTL1_RPU_DATA |
+		   XUSB_PADCTL_HSIC_PAD_CTL1_PD_RX |
+		   XUSB_PADCTL_HSIC_PAD_CTL1_PD_ZI |
+		   XUSB_PADCTL_HSIC_PAD_CTL1_PD_TRX |
+		   XUSB_PADCTL_HSIC_PAD_CTL1_PD_TX);
+	value |= XUSB_PADCTL_HSIC_PAD_CTL1_RPD_DATA |
+		 XUSB_PADCTL_HSIC_PAD_CTL1_RPU_STROBE;
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	return 0;
+}
+
+static int tegra124_hsic_phy_power_off(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_hsic_pad *pad = to_hsic_pad(lane->pad);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	unsigned int index = lane->index;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+	value |= XUSB_PADCTL_HSIC_PAD_CTL1_PD_RX |
+		 XUSB_PADCTL_HSIC_PAD_CTL1_PD_ZI |
+		 XUSB_PADCTL_HSIC_PAD_CTL1_PD_TRX |
+		 XUSB_PADCTL_HSIC_PAD_CTL1_PD_TX;
+	padctl_writel(padctl, value, XUSB_PADCTL_HSIC_PADX_CTL1(index));
+
+	regulator_disable(pad->supply);
+
+	return 0;
+}
+
+static const struct phy_ops tegra124_hsic_phy_ops = {
+	.init = tegra124_hsic_phy_init,
+	.exit = tegra124_hsic_phy_exit,
+	.power_on = tegra124_hsic_phy_power_on,
+	.power_off = tegra124_hsic_phy_power_off,
+	.owner = THIS_MODULE,
+};
+
+static struct tegra_xusb_pad *
+tegra124_hsic_pad_probe(struct tegra_xusb_padctl *padctl,
+			const struct tegra_xusb_pad_soc *soc,
+			struct device_node *np)
+{
+	struct tegra_xusb_hsic_pad *hsic;
+	struct tegra_xusb_pad *pad;
+	int err;
+
+	hsic = kzalloc(sizeof(*hsic), GFP_KERNEL);
+	if (!hsic)
+		return ERR_PTR(-ENOMEM);
+
+	pad = &hsic->base;
+	pad->ops = &tegra124_hsic_lane_ops;
+	pad->soc = soc;
+
+	err = tegra_xusb_pad_init(pad, padctl, np);
+	if (err < 0) {
+		kfree(hsic);
+		goto out;
+	}
+
+	err = tegra_xusb_pad_register(pad, &tegra124_hsic_phy_ops);
+	if (err < 0)
+		goto unregister;
+
+	dev_set_drvdata(&pad->dev, pad);
+
+	return pad;
+
+unregister:
+	device_unregister(&pad->dev);
+out:
+	return ERR_PTR(err);
+}
+
+static void tegra124_hsic_pad_remove(struct tegra_xusb_pad *pad)
+{
+	struct tegra_xusb_hsic_pad *hsic = to_hsic_pad(pad);
+
+	kfree(hsic);
+}
+
+static const struct tegra_xusb_pad_ops tegra124_hsic_ops = {
+	.probe = tegra124_hsic_pad_probe,
+	.remove = tegra124_hsic_pad_remove,
+};
+
+static const struct tegra_xusb_pad_soc tegra124_hsic_pad = {
+	.name = "hsic",
+	.num_lanes = ARRAY_SIZE(tegra124_hsic_lanes),
+	.lanes = tegra124_hsic_lanes,
+	.ops = &tegra124_hsic_ops,
+};
+
+static const char * const tegra124_pcie_functions[] = {
+	"pcie",
+	"usb3-ss",
+	"sata",
+};
+
+static const struct tegra_xusb_lane_soc tegra124_pcie_lanes[] = {
+	TEGRA124_LANE("pcie-0", 0x134, 16, 0x3, pcie),
+	TEGRA124_LANE("pcie-1", 0x134, 18, 0x3, pcie),
+	TEGRA124_LANE("pcie-2", 0x134, 20, 0x3, pcie),
+	TEGRA124_LANE("pcie-3", 0x134, 22, 0x3, pcie),
+	TEGRA124_LANE("pcie-4", 0x134, 24, 0x3, pcie),
+};
+
+static struct tegra_xusb_lane *
+tegra124_pcie_lane_probe(struct tegra_xusb_pad *pad, struct device_node *np,
+			 unsigned int index)
+{
+	struct tegra_xusb_pcie_lane *pcie;
+	int err;
+
+	pcie = kzalloc(sizeof(*pcie), GFP_KERNEL);
+	if (!pcie)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&pcie->base.list);
+	pcie->base.soc = &pad->soc->lanes[index];
+	pcie->base.index = index;
+	pcie->base.pad = pad;
+	pcie->base.np = np;
+
+	err = tegra_xusb_lane_parse_dt(&pcie->base, np);
+	if (err < 0) {
+		kfree(pcie);
+		return ERR_PTR(err);
+	}
+
+	return &pcie->base;
+}
+
+static void tegra124_pcie_lane_remove(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_pcie_lane *pcie = to_pcie_lane(lane);
+
+	kfree(pcie);
+}
+
+static const struct tegra_xusb_lane_ops tegra124_pcie_lane_ops = {
+	.probe = tegra124_pcie_lane_probe,
+	.remove = tegra124_pcie_lane_remove,
+};
+
+static int tegra124_pcie_phy_init(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_enable(lane->pad->padctl);
+}
+
+static int tegra124_pcie_phy_exit(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_disable(lane->pad->padctl);
+}
+
+static int tegra124_pcie_phy_power_on(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	unsigned long timeout;
+	int err = -ETIMEDOUT;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_PLL_P0_CTL1_REFCLK_SEL_MASK;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_P0_CTL2);
+	value |= XUSB_PADCTL_IOPHY_PLL_P0_CTL2_REFCLKBUF_EN |
+		 XUSB_PADCTL_IOPHY_PLL_P0_CTL2_TXCLKREF_EN |
+		 XUSB_PADCTL_IOPHY_PLL_P0_CTL2_TXCLKREF_SEL;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_P0_CTL2);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+	value |= XUSB_PADCTL_IOPHY_PLL_P0_CTL1_PLL_RST;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+
+	timeout = jiffies + msecs_to_jiffies(50);
+
+	while (time_before(jiffies, timeout)) {
+		value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+		if (value & XUSB_PADCTL_IOPHY_PLL_P0_CTL1_PLL0_LOCKDET) {
+			err = 0;
+			break;
+		}
+
+		usleep_range(100, 200);
+	}
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB3_PAD_MUX);
+	value |= XUSB_PADCTL_USB3_PAD_MUX_PCIE_IDDQ_DISABLE(lane->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB3_PAD_MUX);
+
+	return err;
+}
+
+static int tegra124_pcie_phy_power_off(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB3_PAD_MUX);
+	value &= ~XUSB_PADCTL_USB3_PAD_MUX_PCIE_IDDQ_DISABLE(lane->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB3_PAD_MUX);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_PLL_P0_CTL1_PLL_RST;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_P0_CTL1);
+
+	return 0;
+}
+
+static const struct phy_ops tegra124_pcie_phy_ops = {
+	.init = tegra124_pcie_phy_init,
+	.exit = tegra124_pcie_phy_exit,
+	.power_on = tegra124_pcie_phy_power_on,
+	.power_off = tegra124_pcie_phy_power_off,
+	.owner = THIS_MODULE,
+};
+
+static struct tegra_xusb_pad *
+tegra124_pcie_pad_probe(struct tegra_xusb_padctl *padctl,
+			const struct tegra_xusb_pad_soc *soc,
+			struct device_node *np)
+{
+	struct tegra_xusb_pcie_pad *pcie;
+	struct tegra_xusb_pad *pad;
+	int err;
+
+	pcie = kzalloc(sizeof(*pcie), GFP_KERNEL);
+	if (!pcie)
+		return ERR_PTR(-ENOMEM);
+
+	pad = &pcie->base;
+	pad->ops = &tegra124_pcie_lane_ops;
+	pad->soc = soc;
+
+	err = tegra_xusb_pad_init(pad, padctl, np);
+	if (err < 0) {
+		kfree(pcie);
+		goto out;
+	}
+
+	err = tegra_xusb_pad_register(pad, &tegra124_pcie_phy_ops);
+	if (err < 0)
+		goto unregister;
+
+	dev_set_drvdata(&pad->dev, pad);
+
+	return pad;
+
+unregister:
+	device_unregister(&pad->dev);
+out:
+	return ERR_PTR(err);
+}
+
+static void tegra124_pcie_pad_remove(struct tegra_xusb_pad *pad)
+{
+	struct tegra_xusb_pcie_pad *pcie = to_pcie_pad(pad);
+
+	kfree(pcie);
+}
+
+static const struct tegra_xusb_pad_ops tegra124_pcie_ops = {
+	.probe = tegra124_pcie_pad_probe,
+	.remove = tegra124_pcie_pad_remove,
+};
+
+static const struct tegra_xusb_pad_soc tegra124_pcie_pad = {
+	.name = "pcie",
+	.num_lanes = ARRAY_SIZE(tegra124_pcie_lanes),
+	.lanes = tegra124_pcie_lanes,
+	.ops = &tegra124_pcie_ops,
+};
+
+static const struct tegra_xusb_lane_soc tegra124_sata_lanes[] = {
+	TEGRA124_LANE("sata-0", 0x134, 26, 0x3, pcie),
+};
+
+static struct tegra_xusb_lane *
+tegra124_sata_lane_probe(struct tegra_xusb_pad *pad, struct device_node *np,
+			 unsigned int index)
+{
+	struct tegra_xusb_sata_lane *sata;
+	int err;
+
+	sata = kzalloc(sizeof(*sata), GFP_KERNEL);
+	if (!sata)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&sata->base.list);
+	sata->base.soc = &pad->soc->lanes[index];
+	sata->base.index = index;
+	sata->base.pad = pad;
+	sata->base.np = np;
+
+	err = tegra_xusb_lane_parse_dt(&sata->base, np);
+	if (err < 0) {
+		kfree(sata);
+		return ERR_PTR(err);
+	}
+
+	return &sata->base;
+}
+
+static void tegra124_sata_lane_remove(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_sata_lane *sata = to_sata_lane(lane);
+
+	kfree(sata);
+}
+
+static const struct tegra_xusb_lane_ops tegra124_sata_lane_ops = {
+	.probe = tegra124_sata_lane_probe,
+	.remove = tegra124_sata_lane_remove,
+};
+
+static int tegra124_sata_phy_init(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_enable(lane->pad->padctl);
+}
+
+static int tegra124_sata_phy_exit(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	return tegra124_xusb_padctl_disable(lane->pad->padctl);
+}
+
+static int tegra124_sata_phy_power_on(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	unsigned long timeout;
+	int err = -ETIMEDOUT;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ_OVRD;
+	value &= ~XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_PWR_OVRD;
+	value &= ~XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_IDDQ;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value |= XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL1_MODE;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value |= XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_RST;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	timeout = jiffies + msecs_to_jiffies(50);
+
+	while (time_before(jiffies, timeout)) {
+		value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+		if (value & XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL1_LOCKDET) {
+			err = 0;
+			break;
+		}
+
+		usleep_range(100, 200);
+	}
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB3_PAD_MUX);
+	value |= XUSB_PADCTL_USB3_PAD_MUX_SATA_IDDQ_DISABLE(lane->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB3_PAD_MUX);
+
+	return err;
+}
+
+static int tegra124_sata_phy_power_off(struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_USB3_PAD_MUX);
+	value &= ~XUSB_PADCTL_USB3_PAD_MUX_SATA_IDDQ_DISABLE(lane->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_USB3_PAD_MUX);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_RST;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value &= ~XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL1_MODE;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+	value |= XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_PWR_OVRD;
+	value |= XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL_IDDQ;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1);
+	value |= ~XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ_OVRD;
+	value |= ~XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1_IDDQ;
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL1);
+
+	return 0;
+}
+
+static const struct phy_ops tegra124_sata_phy_ops = {
+	.init = tegra124_sata_phy_init,
+	.exit = tegra124_sata_phy_exit,
+	.power_on = tegra124_sata_phy_power_on,
+	.power_off = tegra124_sata_phy_power_off,
+	.owner = THIS_MODULE,
+};
+
+static struct tegra_xusb_pad *
+tegra124_sata_pad_probe(struct tegra_xusb_padctl *padctl,
+			const struct tegra_xusb_pad_soc *soc,
+			struct device_node *np)
+{
+	struct tegra_xusb_sata_pad *sata;
+	struct tegra_xusb_pad *pad;
+	int err;
+
+	sata = kzalloc(sizeof(*sata), GFP_KERNEL);
+	if (!sata)
+		return ERR_PTR(-ENOMEM);
+
+	pad = &sata->base;
+	pad->ops = &tegra124_sata_lane_ops;
+	pad->soc = soc;
+
+	err = tegra_xusb_pad_init(pad, padctl, np);
+	if (err < 0) {
+		kfree(sata);
+		goto out;
+	}
+
+	err = tegra_xusb_pad_register(pad, &tegra124_sata_phy_ops);
+	if (err < 0)
+		goto unregister;
+
+	dev_set_drvdata(&pad->dev, pad);
+
+	return pad;
+
+unregister:
+	device_unregister(&pad->dev);
+out:
+	return ERR_PTR(err);
+}
+
+static void tegra124_sata_pad_remove(struct tegra_xusb_pad *pad)
+{
+	struct tegra_xusb_sata_pad *sata = to_sata_pad(pad);
+
+	kfree(sata);
+}
+
+static const struct tegra_xusb_pad_ops tegra124_sata_ops = {
+	.probe = tegra124_sata_pad_probe,
+	.remove = tegra124_sata_pad_remove,
+};
+
+static const struct tegra_xusb_pad_soc tegra124_sata_pad = {
+	.name = "sata",
+	.num_lanes = ARRAY_SIZE(tegra124_sata_lanes),
+	.lanes = tegra124_sata_lanes,
+	.ops = &tegra124_sata_ops,
+};
+
+static const struct tegra_xusb_pad_soc *tegra124_pads[] = {
+	&tegra124_usb2_pad,
+	&tegra124_ulpi_pad,
+	&tegra124_hsic_pad,
+	&tegra124_pcie_pad,
+	&tegra124_sata_pad,
+};
+
+static int tegra124_usb2_port_enable(struct tegra_xusb_port *port)
+{
+	return 0;
+}
+
+static void tegra124_usb2_port_disable(struct tegra_xusb_port *port)
+{
+}
+
+static struct tegra_xusb_lane *
+tegra124_usb2_port_map(struct tegra_xusb_port *port)
+{
+	return tegra_xusb_find_lane(port->padctl, "usb2", port->index);
+}
+
+static const struct tegra_xusb_port_ops tegra124_usb2_port_ops = {
+	.enable = tegra124_usb2_port_enable,
+	.disable = tegra124_usb2_port_disable,
+	.map = tegra124_usb2_port_map,
+};
+
+static int tegra124_ulpi_port_enable(struct tegra_xusb_port *port)
+{
+	return 0;
+}
+
+static void tegra124_ulpi_port_disable(struct tegra_xusb_port *port)
+{
+}
+
+static struct tegra_xusb_lane *
+tegra124_ulpi_port_map(struct tegra_xusb_port *port)
+{
+	return tegra_xusb_find_lane(port->padctl, "ulpi", port->index);
+}
+
+static const struct tegra_xusb_port_ops tegra124_ulpi_port_ops = {
+	.enable = tegra124_ulpi_port_enable,
+	.disable = tegra124_ulpi_port_disable,
+	.map = tegra124_ulpi_port_map,
+};
+
+static int tegra124_hsic_port_enable(struct tegra_xusb_port *port)
+{
+	return 0;
+}
+
+static void tegra124_hsic_port_disable(struct tegra_xusb_port *port)
+{
+}
+
+static struct tegra_xusb_lane *
+tegra124_hsic_port_map(struct tegra_xusb_port *port)
+{
+	return tegra_xusb_find_lane(port->padctl, "hsic", port->index);
+}
+
+static const struct tegra_xusb_port_ops tegra124_hsic_port_ops = {
+	.enable = tegra124_hsic_port_enable,
+	.disable = tegra124_hsic_port_disable,
+	.map = tegra124_hsic_port_map,
+};
+
+static int tegra124_usb3_port_enable(struct tegra_xusb_port *port)
+{
+	struct tegra_xusb_usb3_port *usb3 = to_usb3_port(port);
+	struct tegra_xusb_padctl *padctl = port->padctl;
+	struct tegra_xusb_lane *lane = usb3->base.lane;
+	unsigned int index = port->index, offset;
+	int ret = 0;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_SS_PORT_MAP);
+
+	if (!usb3->internal)
+		value &= ~XUSB_PADCTL_SS_PORT_MAP_PORTX_INTERNAL(index);
+	else
+		value |= XUSB_PADCTL_SS_PORT_MAP_PORTX_INTERNAL(index);
+
+	value &= ~XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_MASK(index);
+	value |= XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(index, usb3->port);
+	padctl_writel(padctl, value, XUSB_PADCTL_SS_PORT_MAP);
+
+	/*
+	 * TODO: move this code into the PCIe/SATA PHY ->power_on() callbacks
+	 * and conditionalize based on mux function? This seems to work, but
+	 * might not be the exact proper sequence.
+	 */
+	value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_USB3_PADX_CTL2(index));
+	value &= ~((XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_SHIFT) |
+		   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_SHIFT) |
+		   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_MASK <<
+		    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_SHIFT));
+	value |= (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_VAL <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_WANDER_SHIFT) |
+		 (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_VAL <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_CDR_CNTL_SHIFT) |
+		 (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_VAL <<
+		  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_SHIFT);
+
+	if (usb3->context_saved) {
+		value &= ~((XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_MASK <<
+			    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_SHIFT) |
+			   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_MASK <<
+			    XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_SHIFT));
+		value |= (usb3->ctle_g <<
+			  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_G_SHIFT) |
+			 (usb3->ctle_z <<
+			  XUSB_PADCTL_IOPHY_USB3_PAD_CTL2_RX_EQ_Z_SHIFT);
+	}
+
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_USB3_PADX_CTL2(index));
+
+	value = XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_VAL;
+
+	if (usb3->context_saved) {
+		value &= ~((XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_MASK <<
+			    XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_SHIFT) |
+			   (XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_MASK <<
+			    XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_SHIFT));
+		value |= (usb3->tap1 <<
+			  XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_TAP_SHIFT) |
+			 (usb3->amp <<
+			  XUSB_PADCTL_IOPHY_USB3_PAD_CTL4_DFE_CNTL_AMP_SHIFT);
+	}
+
+	padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_USB3_PADX_CTL4(index));
+
+	if (lane->pad == padctl->pcie)
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL2(lane->index);
+	else
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL2;
+
+	value = padctl_readl(padctl, offset);
+	value &= ~(XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_MASK <<
+		   XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_SHIFT);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_VAL <<
+		XUSB_PADCTL_IOPHY_MISC_PAD_CTL2_SPARE_IN_SHIFT;
+	padctl_writel(padctl, value, offset);
+
+	if (lane->pad == padctl->pcie)
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_PX_CTL5(lane->index);
+	else
+		offset = XUSB_PADCTL_IOPHY_MISC_PAD_S0_CTL5;
+
+	value = padctl_readl(padctl, offset);
+	value |= XUSB_PADCTL_IOPHY_MISC_PAD_CTL5_RX_QEYE_EN;
+	padctl_writel(padctl, value, offset);
+
+	/* Enable SATA PHY when SATA lane is used */
+	if (lane->pad == padctl->sata) {
+		value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+		value &= ~(XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL0_REFCLK_NDIV_MASK <<
+			   XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL0_REFCLK_NDIV_SHIFT);
+		value |= 0x2 <<
+			XUSB_PADCTL_IOPHY_PLL_S0_CTL1_PLL0_REFCLK_NDIV_SHIFT;
+		padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL1);
+
+		value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL2);
+		value &= ~((XUSB_PADCTL_IOPHY_PLL_S0_CTL2_XDIGCLK_SEL_MASK <<
+			    XUSB_PADCTL_IOPHY_PLL_S0_CTL2_XDIGCLK_SEL_SHIFT) |
+			   (XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL1_CP_CNTL_MASK <<
+			    XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL1_CP_CNTL_SHIFT) |
+			   (XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL0_CP_CNTL_MASK <<
+			    XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL0_CP_CNTL_SHIFT) |
+			   XUSB_PADCTL_IOPHY_PLL_S0_CTL2_TCLKOUT_EN);
+		value |= (0x7 <<
+			  XUSB_PADCTL_IOPHY_PLL_S0_CTL2_XDIGCLK_SEL_SHIFT) |
+			 (0x8 <<
+			  XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL1_CP_CNTL_SHIFT) |
+			 (0x8 <<
+			  XUSB_PADCTL_IOPHY_PLL_S0_CTL2_PLL0_CP_CNTL_SHIFT) |
+			 XUSB_PADCTL_IOPHY_PLL_S0_CTL2_TXCLKREF_SEL;
+		padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL2);
+
+		value = padctl_readl(padctl, XUSB_PADCTL_IOPHY_PLL_S0_CTL3);
+		value &= ~XUSB_PADCTL_IOPHY_PLL_S0_CTL3_RCAL_BYPASS;
+		padctl_writel(padctl, value, XUSB_PADCTL_IOPHY_PLL_S0_CTL3);
+	}
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_VCORE_DOWN(index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN_EARLY(index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value &= ~XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN(index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	return ret;
+}
+
+static void tegra124_usb3_port_disable(struct tegra_xusb_port *port)
+{
+	struct tegra_xusb_padctl *padctl = port->padctl;
+	u32 value;
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN_EARLY(port->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(100, 200);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_CLAMP_EN(port->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	usleep_range(250, 350);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM);
+	value |= XUSB_PADCTL_ELPG_PROGRAM_SSPX_ELPG_VCORE_DOWN(port->index);
+	padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM);
+
+	value = padctl_readl(padctl, XUSB_PADCTL_SS_PORT_MAP);
+	value &= ~XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_MASK(port->index);
+	value |= XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(port->index, 0x7);
+	padctl_writel(padctl, value, XUSB_PADCTL_SS_PORT_MAP);
+}
+
+static const struct tegra_xusb_lane_map tegra124_usb3_map[] = {
+	{ 0, "pcie", 0 },
+	{ 1, "pcie", 1 },
+	{ 1, "sata", 0 },
+	{ 0, NULL,   0 },
+};
+
+static struct tegra_xusb_lane *
+tegra124_usb3_port_map(struct tegra_xusb_port *port)
+{
+	return tegra_xusb_port_find_lane(port, tegra124_usb3_map, "usb3-ss");
+}
+
+static const struct tegra_xusb_port_ops tegra124_usb3_port_ops = {
+	.enable = tegra124_usb3_port_enable,
+	.disable = tegra124_usb3_port_disable,
+	.map = tegra124_usb3_port_map,
+};
+
+static int
+tegra124_xusb_read_fuse_calibration(struct tegra124_xusb_fuse_calibration *fuse)
+{
+	unsigned int i;
+	int err;
+	u32 value;
+
+	err = tegra_fuse_readl(TEGRA_FUSE_SKU_CALIB_0, &value);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(fuse->hs_curr_level); i++) {
+		fuse->hs_curr_level[i] =
+			(value >> FUSE_SKU_CALIB_HS_CURR_LEVEL_PADX_SHIFT(i)) &
+			FUSE_SKU_CALIB_HS_CURR_LEVEL_PAD_MASK;
+	}
+	fuse->hs_iref_cap =
+		(value >> FUSE_SKU_CALIB_HS_IREF_CAP_SHIFT) &
+		FUSE_SKU_CALIB_HS_IREF_CAP_MASK;
+	fuse->hs_term_range_adj =
+		(value >> FUSE_SKU_CALIB_HS_TERM_RANGE_ADJ_SHIFT) &
+		FUSE_SKU_CALIB_HS_TERM_RANGE_ADJ_MASK;
+	fuse->hs_squelch_level =
+		(value >> FUSE_SKU_CALIB_HS_SQUELCH_LEVEL_SHIFT) &
+		FUSE_SKU_CALIB_HS_SQUELCH_LEVEL_MASK;
+
+	return 0;
+}
+
+static struct tegra_xusb_padctl *
+tegra124_xusb_padctl_probe(struct device *dev,
+			   const struct tegra_xusb_padctl_soc *soc)
+{
+	struct tegra124_xusb_padctl *padctl;
+	int err;
+
+	padctl = devm_kzalloc(dev, sizeof(*padctl), GFP_KERNEL);
+	if (!padctl)
+		return ERR_PTR(-ENOMEM);
+
+	padctl->base.dev = dev;
+	padctl->base.soc = soc;
+
+	err = tegra124_xusb_read_fuse_calibration(&padctl->fuse);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return &padctl->base;
+}
+
+static void tegra124_xusb_padctl_remove(struct tegra_xusb_padctl *padctl)
+{
+}
+
+static const struct tegra_xusb_padctl_ops tegra124_xusb_padctl_ops = {
+	.probe = tegra124_xusb_padctl_probe,
+	.remove = tegra124_xusb_padctl_remove,
+	.usb3_save_context = tegra124_usb3_save_context,
+	.hsic_set_idle = tegra124_hsic_set_idle,
+};
+
+const struct tegra_xusb_padctl_soc tegra124_xusb_padctl_soc = {
+	.num_pads = ARRAY_SIZE(tegra124_pads),
+	.pads = tegra124_pads,
+	.ports = {
+		.usb2 = {
+			.ops = &tegra124_usb2_port_ops,
+			.count = 3,
+		},
+		.ulpi = {
+			.ops = &tegra124_ulpi_port_ops,
+			.count = 1,
+		},
+		.hsic = {
+			.ops = &tegra124_hsic_port_ops,
+			.count = 2,
+		},
+		.usb3 = {
+			.ops = &tegra124_usb3_port_ops,
+			.count = 2,
+		},
+	},
+	.ops = &tegra124_xusb_padctl_ops,
+};
+EXPORT_SYMBOL_GPL(tegra124_xusb_padctl_soc);
+
+MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra 124 XUSB Pad Controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
new file mode 100644
index 000000000000..ec83dfdbc206
--- /dev/null
+++ b/drivers/phy/tegra/xusb.c
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/mailbox_client.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/reset.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <soc/tegra/fuse.h>
+
+#include "xusb.h"
+
+static struct phy *tegra_xusb_pad_of_xlate(struct device *dev,
+					   struct of_phandle_args *args)
+{
+	struct tegra_xusb_pad *pad = dev_get_drvdata(dev);
+	struct phy *phy = NULL;
+	unsigned int i;
+
+	if (args->args_count != 0)
+		return ERR_PTR(-EINVAL);
+
+	for (i = 0; i < pad->soc->num_lanes; i++) {
+		if (!pad->lanes[i])
+			continue;
+
+		if (pad->lanes[i]->dev.of_node == args->np) {
+			phy = pad->lanes[i];
+			break;
+		}
+	}
+
+	if (phy == NULL)
+		phy = ERR_PTR(-ENODEV);
+
+	return phy;
+}
+
+static const struct of_device_id tegra_xusb_padctl_of_match[] = {
+#if defined(CONFIG_ARCH_TEGRA_124_SOC) || defined(CONFIG_ARCH_TEGRA_132_SOC)
+	{
+		.compatible = "nvidia,tegra124-xusb-padctl",
+		.data = &tegra124_xusb_padctl_soc,
+	},
+#endif
+#if defined(CONFIG_ARCH_TEGRA_210_SOC)
+	{
+		.compatible = "nvidia,tegra210-xusb-padctl",
+		.data = &tegra210_xusb_padctl_soc,
+	},
+#endif
+	{ }
+};
+MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
+
+static struct device_node *
+tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
+{
+	/*
+	 * of_find_node_by_name() drops a reference, so make sure to grab one.
+	 */
+	struct device_node *np = of_node_get(padctl->dev->of_node);
+
+	np = of_find_node_by_name(np, "pads");
+	if (np)
+		np = of_find_node_by_name(np, name);
+
+	return np;
+}
+
+static struct device_node *
+tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
+{
+	/*
+	 * of_find_node_by_name() drops a reference, so make sure to grab one.
+	 */
+	struct device_node *np = of_node_get(pad->dev.of_node);
+
+	np = of_find_node_by_name(np, "lanes");
+	if (!np)
+		return NULL;
+
+	return of_find_node_by_name(np, pad->soc->lanes[index].name);
+}
+
+int tegra_xusb_lane_lookup_function(struct tegra_xusb_lane *lane,
+				    const char *function)
+{
+	unsigned int i;
+
+	for (i = 0; i < lane->soc->num_funcs; i++)
+		if (strcmp(function, lane->soc->funcs[i]) == 0)
+			return i;
+
+	return -EINVAL;
+}
+
+int tegra_xusb_lane_parse_dt(struct tegra_xusb_lane *lane,
+			     struct device_node *np)
+{
+	struct device *dev = &lane->pad->dev;
+	const char *function;
+	int err;
+
+	err = of_property_read_string(np, "nvidia,function", &function);
+	if (err < 0)
+		return err;
+
+	err = tegra_xusb_lane_lookup_function(lane, function);
+	if (err < 0) {
+		dev_err(dev, "invalid function \"%s\" for lane \"%s\"\n",
+			function, np->name);
+		return err;
+	}
+
+	lane->function = err;
+
+	return 0;
+}
+
+static void tegra_xusb_lane_destroy(struct phy *phy)
+{
+	if (phy) {
+		struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+		lane->pad->ops->remove(lane);
+		phy_destroy(phy);
+	}
+}
+
+static void tegra_xusb_pad_release(struct device *dev)
+{
+	struct tegra_xusb_pad *pad = to_tegra_xusb_pad(dev);
+
+	pad->soc->ops->remove(pad);
+}
+
+static struct device_type tegra_xusb_pad_type = {
+	.release = tegra_xusb_pad_release,
+};
+
+int tegra_xusb_pad_init(struct tegra_xusb_pad *pad,
+			struct tegra_xusb_padctl *padctl,
+			struct device_node *np)
+{
+	int err;
+
+	device_initialize(&pad->dev);
+	INIT_LIST_HEAD(&pad->list);
+	pad->dev.parent = padctl->dev;
+	pad->dev.type = &tegra_xusb_pad_type;
+	pad->dev.of_node = np;
+	pad->padctl = padctl;
+
+	err = dev_set_name(&pad->dev, "%s", pad->soc->name);
+	if (err < 0)
+		goto unregister;
+
+	err = device_add(&pad->dev);
+	if (err < 0)
+		goto unregister;
+
+	return 0;
+
+unregister:
+	device_unregister(&pad->dev);
+	return err;
+}
+
+int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
+			    const struct phy_ops *ops)
+{
+	struct device_node *children;
+	struct phy *lane;
+	unsigned int i;
+	int err;
+
+	children = of_find_node_by_name(pad->dev.of_node, "lanes");
+	if (!children)
+		return -ENODEV;
+
+	pad->lanes = devm_kcalloc(&pad->dev, pad->soc->num_lanes, sizeof(lane),
+				  GFP_KERNEL);
+	if (!pad->lanes) {
+		of_node_put(children);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < pad->soc->num_lanes; i++) {
+		struct device_node *np = tegra_xusb_pad_find_phy_node(pad, i);
+		struct tegra_xusb_lane *lane;
+
+		/* skip disabled lanes */
+		if (!np || !of_device_is_available(np)) {
+			of_node_put(np);
+			continue;
+		}
+
+		pad->lanes[i] = phy_create(&pad->dev, np, ops);
+		if (IS_ERR(pad->lanes[i])) {
+			err = PTR_ERR(pad->lanes[i]);
+			of_node_put(np);
+			goto remove;
+		}
+
+		lane = pad->ops->probe(pad, np, i);
+		if (IS_ERR(lane)) {
+			phy_destroy(pad->lanes[i]);
+			err = PTR_ERR(lane);
+			goto remove;
+		}
+
+		list_add_tail(&lane->list, &pad->padctl->lanes);
+		phy_set_drvdata(pad->lanes[i], lane);
+	}
+
+	pad->provider = of_phy_provider_register_full(&pad->dev, children,
+						      tegra_xusb_pad_of_xlate);
+	if (IS_ERR(pad->provider)) {
+		err = PTR_ERR(pad->provider);
+		goto remove;
+	}
+
+	return 0;
+
+remove:
+	while (i--)
+		tegra_xusb_lane_destroy(pad->lanes[i]);
+
+	of_node_put(children);
+
+	return err;
+}
+
+void tegra_xusb_pad_unregister(struct tegra_xusb_pad *pad)
+{
+	unsigned int i = pad->soc->num_lanes;
+
+	of_phy_provider_unregister(pad->provider);
+
+	while (i--)
+		tegra_xusb_lane_destroy(pad->lanes[i]);
+
+	device_unregister(&pad->dev);
+}
+
+static struct tegra_xusb_pad *
+tegra_xusb_pad_create(struct tegra_xusb_padctl *padctl,
+		      const struct tegra_xusb_pad_soc *soc)
+{
+	struct tegra_xusb_pad *pad;
+	struct device_node *np;
+	int err;
+
+	np = tegra_xusb_find_pad_node(padctl, soc->name);
+	if (!np || !of_device_is_available(np))
+		return NULL;
+
+	pad = soc->ops->probe(padctl, soc, np);
+	if (IS_ERR(pad)) {
+		err = PTR_ERR(pad);
+		dev_err(padctl->dev, "failed to create pad %s: %d\n",
+			soc->name, err);
+		return ERR_PTR(err);
+	}
+
+	/* XXX move this into ->probe() to avoid string comparison */
+	if (strcmp(soc->name, "pcie") == 0)
+		padctl->pcie = pad;
+
+	if (strcmp(soc->name, "sata") == 0)
+		padctl->sata = pad;
+
+	if (strcmp(soc->name, "usb2") == 0)
+		padctl->usb2 = pad;
+
+	if (strcmp(soc->name, "ulpi") == 0)
+		padctl->ulpi = pad;
+
+	if (strcmp(soc->name, "hsic") == 0)
+		padctl->hsic = pad;
+
+	return pad;
+}
+
+static void __tegra_xusb_remove_pads(struct tegra_xusb_padctl *padctl)
+{
+	struct tegra_xusb_pad *pad, *tmp;
+
+	list_for_each_entry_safe_reverse(pad, tmp, &padctl->pads, list) {
+		list_del(&pad->list);
+		tegra_xusb_pad_unregister(pad);
+	}
+}
+
+static void tegra_xusb_remove_pads(struct tegra_xusb_padctl *padctl)
+{
+	mutex_lock(&padctl->lock);
+	__tegra_xusb_remove_pads(padctl);
+	mutex_unlock(&padctl->lock);
+}
+
+static void tegra_xusb_lane_program(struct tegra_xusb_lane *lane)
+{
+	struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+	const struct tegra_xusb_lane_soc *soc = lane->soc;
+	u32 value;
+
+	/* choose function */
+	value = padctl_readl(padctl, soc->offset);
+	value &= ~(soc->mask << soc->shift);
+	value |= lane->function << soc->shift;
+	padctl_writel(padctl, value, soc->offset);
+}
+
+static void tegra_xusb_pad_program(struct tegra_xusb_pad *pad)
+{
+	unsigned int i;
+
+	for (i = 0; i < pad->soc->num_lanes; i++) {
+		struct tegra_xusb_lane *lane;
+
+		if (pad->lanes[i]) {
+			lane = phy_get_drvdata(pad->lanes[i]);
+			tegra_xusb_lane_program(lane);
+		}
+	}
+}
+
+static int tegra_xusb_setup_pads(struct tegra_xusb_padctl *padctl)
+{
+	struct tegra_xusb_pad *pad;
+	unsigned int i;
+
+	mutex_lock(&padctl->lock);
+
+	for (i = 0; i < padctl->soc->num_pads; i++) {
+		const struct tegra_xusb_pad_soc *soc = padctl->soc->pads[i];
+		int err;
+
+		pad = tegra_xusb_pad_create(padctl, soc);
+		if (IS_ERR(pad)) {
+			err = PTR_ERR(pad);
+			dev_err(padctl->dev, "failed to create pad %s: %d\n",
+				soc->name, err);
+			__tegra_xusb_remove_pads(padctl);
+			mutex_unlock(&padctl->lock);
+			return err;
+		}
+
+		if (!pad)
+			continue;
+
+		list_add_tail(&pad->list, &padctl->pads);
+	}
+
+	list_for_each_entry(pad, &padctl->pads, list)
+		tegra_xusb_pad_program(pad);
+
+	mutex_unlock(&padctl->lock);
+	return 0;
+}
+
+static bool tegra_xusb_lane_check(struct tegra_xusb_lane *lane,
+				  const char *function)
+{
+	const char *func = lane->soc->funcs[lane->function];
+
+	return strcmp(function, func) == 0;
+}
+
+struct tegra_xusb_lane *tegra_xusb_find_lane(struct tegra_xusb_padctl *padctl,
+					     const char *type,
+					     unsigned int index)
+{
+	struct tegra_xusb_lane *lane, *hit = ERR_PTR(-ENODEV);
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+
+	list_for_each_entry(lane, &padctl->lanes, list) {
+		if (strcmp(lane->soc->name, name) == 0) {
+			hit = lane;
+			break;
+		}
+	}
+
+	kfree(name);
+	return hit;
+}
+
+struct tegra_xusb_lane *
+tegra_xusb_port_find_lane(struct tegra_xusb_port *port,
+			  const struct tegra_xusb_lane_map *map,
+			  const char *function)
+{
+	struct tegra_xusb_lane *lane, *match = ERR_PTR(-ENODEV);
+
+	for (map = map; map->type; map++) {
+		if (port->index != map->port)
+			continue;
+
+		lane = tegra_xusb_find_lane(port->padctl, map->type,
+					    map->index);
+		if (IS_ERR(lane))
+			continue;
+
+		if (!tegra_xusb_lane_check(lane, function))
+			continue;
+
+		if (!IS_ERR(match))
+			dev_err(&port->dev, "conflicting match: %s-%u / %s\n",
+				map->type, map->index, match->soc->name);
+		else
+			match = lane;
+	}
+
+	return match;
+}
+
+static struct device_node *
+tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
+			  unsigned int index)
+{
+	/*
+	 * of_find_node_by_name() drops a reference, so make sure to grab one.
+	 */
+	struct device_node *np = of_node_get(padctl->dev->of_node);
+
+	np = of_find_node_by_name(np, "ports");
+	if (np) {
+		char *name;
+
+		name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
+		np = of_find_node_by_name(np, name);
+		kfree(name);
+	}
+
+	return np;
+}
+
+struct tegra_xusb_port *
+tegra_xusb_find_port(struct tegra_xusb_padctl *padctl, const char *type,
+		     unsigned int index)
+{
+	struct tegra_xusb_port *port;
+	struct device_node *np;
+
+	np = tegra_xusb_find_port_node(padctl, type, index);
+	if (!np)
+		return NULL;
+
+	list_for_each_entry(port, &padctl->ports, list) {
+		if (np == port->dev.of_node) {
+			of_node_put(np);
+			return port;
+		}
+	}
+
+	of_node_put(np);
+
+	return NULL;
+}
+
+struct tegra_xusb_usb2_port *
+tegra_xusb_find_usb2_port(struct tegra_xusb_padctl *padctl, unsigned int index)
+{
+	struct tegra_xusb_port *port;
+
+	port = tegra_xusb_find_port(padctl, "usb2", index);
+	if (port)
+		return to_usb2_port(port);
+
+	return NULL;
+}
+
+struct tegra_xusb_usb3_port *
+tegra_xusb_find_usb3_port(struct tegra_xusb_padctl *padctl, unsigned int index)
+{
+	struct tegra_xusb_port *port;
+
+	port = tegra_xusb_find_port(padctl, "usb3", index);
+	if (port)
+		return to_usb3_port(port);
+
+	return NULL;
+}
+
+static void tegra_xusb_port_release(struct device *dev)
+{
+}
+
+static struct device_type tegra_xusb_port_type = {
+	.release = tegra_xusb_port_release,
+};
+
+static int tegra_xusb_port_init(struct tegra_xusb_port *port,
+				struct tegra_xusb_padctl *padctl,
+				struct device_node *np,
+				const char *name,
+				unsigned int index)
+{
+	int err;
+
+	INIT_LIST_HEAD(&port->list);
+	port->padctl = padctl;
+	port->index = index;
+
+	device_initialize(&port->dev);
+	port->dev.type = &tegra_xusb_port_type;
+	port->dev.of_node = of_node_get(np);
+	port->dev.parent = padctl->dev;
+
+	err = dev_set_name(&port->dev, "%s-%u", name, index);
+	if (err < 0)
+		goto unregister;
+
+	err = device_add(&port->dev);
+	if (err < 0)
+		goto unregister;
+
+	return 0;
+
+unregister:
+	device_unregister(&port->dev);
+	return err;
+}
+
+static void tegra_xusb_port_unregister(struct tegra_xusb_port *port)
+{
+	device_unregister(&port->dev);
+}
+
+static int tegra_xusb_usb2_port_parse_dt(struct tegra_xusb_usb2_port *usb2)
+{
+	struct tegra_xusb_port *port = &usb2->base;
+	struct device_node *np = port->dev.of_node;
+
+	usb2->internal = of_property_read_bool(np, "nvidia,internal");
+
+	usb2->supply = devm_regulator_get(&port->dev, "vbus");
+	if (IS_ERR(usb2->supply))
+		return PTR_ERR(usb2->supply);
+
+	return 0;
+}
+
+static int tegra_xusb_add_usb2_port(struct tegra_xusb_padctl *padctl,
+				    unsigned int index)
+{
+	struct tegra_xusb_usb2_port *usb2;
+	struct device_node *np;
+	int err = 0;
+
+	/*
+	 * USB2 ports don't require additional properties, but if the port is
+	 * marked as disabled there is no reason to register it.
+	 */
+	np = tegra_xusb_find_port_node(padctl, "usb2", index);
+	if (!np || !of_device_is_available(np))
+		goto out;
+
+	usb2 = devm_kzalloc(padctl->dev, sizeof(*usb2), GFP_KERNEL);
+	if (!usb2) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = tegra_xusb_port_init(&usb2->base, padctl, np, "usb2", index);
+	if (err < 0)
+		goto out;
+
+	usb2->base.ops = padctl->soc->ports.usb2.ops;
+
+	usb2->base.lane = usb2->base.ops->map(&usb2->base);
+	if (IS_ERR(usb2->base.lane)) {
+		err = PTR_ERR(usb2->base.lane);
+		goto out;
+	}
+
+	err = tegra_xusb_usb2_port_parse_dt(usb2);
+	if (err < 0) {
+		tegra_xusb_port_unregister(&usb2->base);
+		goto out;
+	}
+
+	list_add_tail(&usb2->base.list, &padctl->ports);
+
+out:
+	of_node_put(np);
+	return err;
+}
+
+static int tegra_xusb_ulpi_port_parse_dt(struct tegra_xusb_ulpi_port *ulpi)
+{
+	struct tegra_xusb_port *port = &ulpi->base;
+	struct device_node *np = port->dev.of_node;
+
+	ulpi->internal = of_property_read_bool(np, "nvidia,internal");
+
+	return 0;
+}
+
+static int tegra_xusb_add_ulpi_port(struct tegra_xusb_padctl *padctl,
+				    unsigned int index)
+{
+	struct tegra_xusb_ulpi_port *ulpi;
+	struct device_node *np;
+	int err = 0;
+
+	np = tegra_xusb_find_port_node(padctl, "ulpi", index);
+	if (!np || !of_device_is_available(np))
+		goto out;
+
+	ulpi = devm_kzalloc(padctl->dev, sizeof(*ulpi), GFP_KERNEL);
+	if (!ulpi) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = tegra_xusb_port_init(&ulpi->base, padctl, np, "ulpi", index);
+	if (err < 0)
+		goto out;
+
+	ulpi->base.ops = padctl->soc->ports.ulpi.ops;
+
+	ulpi->base.lane = ulpi->base.ops->map(&ulpi->base);
+	if (IS_ERR(ulpi->base.lane)) {
+		err = PTR_ERR(ulpi->base.lane);
+		goto out;
+	}
+
+	err = tegra_xusb_ulpi_port_parse_dt(ulpi);
+	if (err < 0) {
+		tegra_xusb_port_unregister(&ulpi->base);
+		goto out;
+	}
+
+	list_add_tail(&ulpi->base.list, &padctl->ports);
+
+out:
+	of_node_put(np);
+	return err;
+}
+
+static int tegra_xusb_hsic_port_parse_dt(struct tegra_xusb_hsic_port *hsic)
+{
+	/* XXX */
+	return 0;
+}
+
+static int tegra_xusb_add_hsic_port(struct tegra_xusb_padctl *padctl,
+				    unsigned int index)
+{
+	struct tegra_xusb_hsic_port *hsic;
+	struct device_node *np;
+	int err = 0;
+
+	np = tegra_xusb_find_port_node(padctl, "hsic", index);
+	if (!np || !of_device_is_available(np))
+		goto out;
+
+	hsic = devm_kzalloc(padctl->dev, sizeof(*hsic), GFP_KERNEL);
+	if (!hsic) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = tegra_xusb_port_init(&hsic->base, padctl, np, "hsic", index);
+	if (err < 0)
+		goto out;
+
+	hsic->base.ops = padctl->soc->ports.hsic.ops;
+
+	hsic->base.lane = hsic->base.ops->map(&hsic->base);
+	if (IS_ERR(hsic->base.lane)) {
+		err = PTR_ERR(hsic->base.lane);
+		goto out;
+	}
+
+	err = tegra_xusb_hsic_port_parse_dt(hsic);
+	if (err < 0) {
+		tegra_xusb_port_unregister(&hsic->base);
+		goto out;
+	}
+
+	list_add_tail(&hsic->base.list, &padctl->ports);
+
+out:
+	of_node_put(np);
+	return err;
+}
+
+static int tegra_xusb_usb3_port_parse_dt(struct tegra_xusb_usb3_port *usb3)
+{
+	struct tegra_xusb_port *port = &usb3->base;
+	struct device_node *np = port->dev.of_node;
+	u32 value;
+	int err;
+
+	err = of_property_read_u32(np, "nvidia,usb2-companion", &value);
+	if (err < 0) {
+		dev_err(&port->dev, "failed to read port: %d\n", err);
+		return err;
+	}
+
+	usb3->port = value;
+
+	usb3->internal = of_property_read_bool(np, "nvidia,internal");
+
+	usb3->supply = devm_regulator_get(&port->dev, "vbus");
+	if (IS_ERR(usb3->supply))
+		return PTR_ERR(usb3->supply);
+
+	return 0;
+}
+
+static int tegra_xusb_add_usb3_port(struct tegra_xusb_padctl *padctl,
+				    unsigned int index)
+{
+	struct tegra_xusb_usb3_port *usb3;
+	struct device_node *np;
+	int err = 0;
+
+	/*
+	 * If there is no supplemental configuration in the device tree the
+	 * port is unusable. But it is valid to configure only a single port,
+	 * hence return 0 instead of an error to allow ports to be optional.
+	 */
+	np = tegra_xusb_find_port_node(padctl, "usb3", index);
+	if (!np || !of_device_is_available(np))
+		goto out;
+
+	usb3 = devm_kzalloc(padctl->dev, sizeof(*usb3), GFP_KERNEL);
+	if (!usb3) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = tegra_xusb_port_init(&usb3->base, padctl, np, "usb3", index);
+	if (err < 0)
+		goto out;
+
+	usb3->base.ops = padctl->soc->ports.usb3.ops;
+
+	usb3->base.lane = usb3->base.ops->map(&usb3->base);
+	if (IS_ERR(usb3->base.lane)) {
+		err = PTR_ERR(usb3->base.lane);
+		goto out;
+	}
+
+	err = tegra_xusb_usb3_port_parse_dt(usb3);
+	if (err < 0) {
+		tegra_xusb_port_unregister(&usb3->base);
+		goto out;
+	}
+
+	list_add_tail(&usb3->base.list, &padctl->ports);
+
+out:
+	of_node_put(np);
+	return err;
+}
+
+static void __tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
+{
+	struct tegra_xusb_port *port, *tmp;
+
+	list_for_each_entry_safe_reverse(port, tmp, &padctl->ports, list) {
+		list_del(&port->list);
+		tegra_xusb_port_unregister(port);
+	}
+}
+
+static int tegra_xusb_setup_ports(struct tegra_xusb_padctl *padctl)
+{
+	struct tegra_xusb_port *port;
+	unsigned int i;
+	int err = 0;
+
+	mutex_lock(&padctl->lock);
+
+	for (i = 0; i < padctl->soc->ports.usb2.count; i++) {
+		err = tegra_xusb_add_usb2_port(padctl, i);
+		if (err < 0)
+			goto remove_ports;
+	}
+
+	for (i = 0; i < padctl->soc->ports.ulpi.count; i++) {
+		err = tegra_xusb_add_ulpi_port(padctl, i);
+		if (err < 0)
+			goto remove_ports;
+	}
+
+	for (i = 0; i < padctl->soc->ports.hsic.count; i++) {
+		err = tegra_xusb_add_hsic_port(padctl, i);
+		if (err < 0)
+			goto remove_ports;
+	}
+
+	for (i = 0; i < padctl->soc->ports.usb3.count; i++) {
+		err = tegra_xusb_add_usb3_port(padctl, i);
+		if (err < 0)
+			goto remove_ports;
+	}
+
+	list_for_each_entry(port, &padctl->ports, list) {
+		err = port->ops->enable(port);
+		if (err < 0)
+			dev_err(padctl->dev, "failed to enable port %s: %d\n",
+				dev_name(&port->dev), err);
+	}
+
+	goto unlock;
+
+remove_ports:
+	__tegra_xusb_remove_ports(padctl);
+unlock:
+	mutex_unlock(&padctl->lock);
+	return err;
+}
+
+static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
+{
+	mutex_lock(&padctl->lock);
+	__tegra_xusb_remove_ports(padctl);
+	mutex_unlock(&padctl->lock);
+}
+
+static int tegra_xusb_padctl_probe(struct platform_device *pdev)
+{
+	struct device_node *np = of_node_get(pdev->dev.of_node);
+	const struct tegra_xusb_padctl_soc *soc;
+	struct tegra_xusb_padctl *padctl;
+	const struct of_device_id *match;
+	struct resource *res;
+	int err;
+
+	/* for backwards compatibility with old device trees */
+	np = of_find_node_by_name(np, "pads");
+	if (!np) {
+		dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
+		return tegra_xusb_padctl_legacy_probe(pdev);
+	}
+
+	of_node_put(np);
+
+	match = of_match_node(tegra_xusb_padctl_of_match, pdev->dev.of_node);
+	soc = match->data;
+
+	padctl = soc->ops->probe(&pdev->dev, soc);
+	if (IS_ERR(padctl))
+		return PTR_ERR(padctl);
+
+	platform_set_drvdata(pdev, padctl);
+	INIT_LIST_HEAD(&padctl->ports);
+	INIT_LIST_HEAD(&padctl->lanes);
+	INIT_LIST_HEAD(&padctl->pads);
+	mutex_init(&padctl->lock);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	padctl->regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(padctl->regs)) {
+		err = PTR_ERR(padctl->regs);
+		goto remove;
+	}
+
+	padctl->rst = devm_reset_control_get(&pdev->dev, NULL);
+	if (IS_ERR(padctl->rst)) {
+		err = PTR_ERR(padctl->rst);
+		goto remove;
+	}
+
+	err = reset_control_deassert(padctl->rst);
+	if (err < 0)
+		goto remove;
+
+	err = tegra_xusb_setup_pads(padctl);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to setup pads: %d\n", err);
+		goto reset;
+	}
+
+	err = tegra_xusb_setup_ports(padctl);
+	if (err) {
+		dev_err(&pdev->dev, "failed to setup XUSB ports: %d\n", err);
+		goto remove_pads;
+	}
+
+	return 0;
+
+remove_pads:
+	tegra_xusb_remove_pads(padctl);
+reset:
+	reset_control_assert(padctl->rst);
+remove:
+	soc->ops->remove(padctl);
+	return err;
+}
+
+static int tegra_xusb_padctl_remove(struct platform_device *pdev)
+{
+	struct tegra_xusb_padctl *padctl = platform_get_drvdata(pdev);
+	int err;
+
+	tegra_xusb_remove_ports(padctl);
+	tegra_xusb_remove_pads(padctl);
+
+	err = reset_control_assert(padctl->rst);
+	if (err < 0)
+		dev_err(&pdev->dev, "failed to assert reset: %d\n", err);
+
+	padctl->soc->ops->remove(padctl);
+
+	return err;
+}
+
+static struct platform_driver tegra_xusb_padctl_driver = {
+	.driver = {
+		.name = "tegra-xusb-padctl",
+		.of_match_table = tegra_xusb_padctl_of_match,
+	},
+	.probe = tegra_xusb_padctl_probe,
+	.remove = tegra_xusb_padctl_remove,
+};
+module_platform_driver(tegra_xusb_padctl_driver);
+
+struct tegra_xusb_padctl *tegra_xusb_padctl_get(struct device *dev)
+{
+	struct tegra_xusb_padctl *padctl;
+	struct platform_device *pdev;
+	struct device_node *np;
+
+	np = of_parse_phandle(dev->of_node, "nvidia,xusb-padctl", 0);
+	if (!np)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * This is slightly ugly. A better implementation would be to keep a
+	 * registry of pad controllers, but since there will almost certainly
+	 * only ever be one per SoC that would be a little overkill.
+	 */
+	pdev = of_find_device_by_node(np);
+	if (!pdev) {
+		of_node_put(np);
+		return ERR_PTR(-ENODEV);
+	}
+
+	of_node_put(np);
+
+	padctl = platform_get_drvdata(pdev);
+	if (!padctl) {
+		put_device(&pdev->dev);
+		return ERR_PTR(-EPROBE_DEFER);
+	}
+
+	return padctl;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_get);
+
+void tegra_xusb_padctl_put(struct tegra_xusb_padctl *padctl)
+{
+	if (padctl)
+		put_device(padctl->dev);
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_put);
+
+int tegra_xusb_padctl_usb3_save_context(struct tegra_xusb_padctl *padctl,
+					unsigned int port)
+{
+	if (padctl->soc->ops->usb3_save_context)
+		return padctl->soc->ops->usb3_save_context(padctl, port);
+
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_usb3_save_context);
+
+int tegra_xusb_padctl_hsic_set_idle(struct tegra_xusb_padctl *padctl,
+				    unsigned int port, bool idle)
+{
+	if (padctl->soc->ops->hsic_set_idle)
+		return padctl->soc->ops->hsic_set_idle(padctl, port, idle);
+
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_hsic_set_idle);
+
+int tegra_xusb_padctl_usb3_set_lfps_detect(struct tegra_xusb_padctl *padctl,
+					   unsigned int port, bool enable)
+{
+	if (padctl->soc->ops->usb3_set_lfps_detect)
+		return padctl->soc->ops->usb3_set_lfps_detect(padctl, port,
+							      enable);
+
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_usb3_set_lfps_detect);
+
+MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
+MODULE_DESCRIPTION("Tegra XUSB Pad Controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/phy/tegra/xusb.h b/drivers/phy/tegra/xusb.h
new file mode 100644
index 000000000000..b49dbc36efa3
--- /dev/null
+++ b/drivers/phy/tegra/xusb.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2015, Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __PHY_TEGRA_XUSB_H
+#define __PHY_TEGRA_XUSB_H
+
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+/* legacy entry points for backwards-compatibility */
+int tegra_xusb_padctl_legacy_probe(struct platform_device *pdev);
+int tegra_xusb_padctl_legacy_remove(struct platform_device *pdev);
+
+struct phy;
+struct phy_provider;
+struct platform_device;
+struct regulator;
+
+/*
+ * lanes
+ */
+struct tegra_xusb_lane_soc {
+	const char *name;
+
+	unsigned int offset;
+	unsigned int shift;
+	unsigned int mask;
+
+	const char * const *funcs;
+	unsigned int num_funcs;
+};
+
+struct tegra_xusb_lane {
+	const struct tegra_xusb_lane_soc *soc;
+	struct tegra_xusb_pad *pad;
+	struct device_node *np;
+	struct list_head list;
+	unsigned int function;
+	unsigned int index;
+};
+
+int tegra_xusb_lane_parse_dt(struct tegra_xusb_lane *lane,
+			     struct device_node *np);
+
+struct tegra_xusb_usb2_lane {
+	struct tegra_xusb_lane base;
+
+	u32 hs_curr_level_offset;
+};
+
+static inline struct tegra_xusb_usb2_lane *
+to_usb2_lane(struct tegra_xusb_lane *lane)
+{
+	return container_of(lane, struct tegra_xusb_usb2_lane, base);
+}
+
+struct tegra_xusb_ulpi_lane {
+	struct tegra_xusb_lane base;
+};
+
+static inline struct tegra_xusb_ulpi_lane *
+to_ulpi_lane(struct tegra_xusb_lane *lane)
+{
+	return container_of(lane, struct tegra_xusb_ulpi_lane, base);
+}
+
+struct tegra_xusb_hsic_lane {
+	struct tegra_xusb_lane base;
+
+	u32 strobe_trim;
+	u32 rx_strobe_trim;
+	u32 rx_data_trim;
+	u32 tx_rtune_n;
+	u32 tx_rtune_p;
+	u32 tx_rslew_n;
+	u32 tx_rslew_p;
+	bool auto_term;
+};
+
+static inline struct tegra_xusb_hsic_lane *
+to_hsic_lane(struct tegra_xusb_lane *lane)
+{
+	return container_of(lane, struct tegra_xusb_hsic_lane, base);
+}
+
+struct tegra_xusb_pcie_lane {
+	struct tegra_xusb_lane base;
+};
+
+static inline struct tegra_xusb_pcie_lane *
+to_pcie_lane(struct tegra_xusb_lane *lane)
+{
+	return container_of(lane, struct tegra_xusb_pcie_lane, base);
+}
+
+struct tegra_xusb_sata_lane {
+	struct tegra_xusb_lane base;
+};
+
+static inline struct tegra_xusb_sata_lane *
+to_sata_lane(struct tegra_xusb_lane *lane)
+{
+	return container_of(lane, struct tegra_xusb_sata_lane, base);
+}
+
+struct tegra_xusb_lane_ops {
+	struct tegra_xusb_lane *(*probe)(struct tegra_xusb_pad *pad,
+					 struct device_node *np,
+					 unsigned int index);
+	void (*remove)(struct tegra_xusb_lane *lane);
+};
+
+/*
+ * pads
+ */
+struct tegra_xusb_pad_soc;
+struct tegra_xusb_padctl;
+
+struct tegra_xusb_pad_ops {
+	struct tegra_xusb_pad *(*probe)(struct tegra_xusb_padctl *padctl,
+					const struct tegra_xusb_pad_soc *soc,
+					struct device_node *np);
+	void (*remove)(struct tegra_xusb_pad *pad);
+};
+
+struct tegra_xusb_pad_soc {
+	const char *name;
+
+	const struct tegra_xusb_lane_soc *lanes;
+	unsigned int num_lanes;
+
+	const struct tegra_xusb_pad_ops *ops;
+};
+
+struct tegra_xusb_pad {
+	const struct tegra_xusb_pad_soc *soc;
+	struct tegra_xusb_padctl *padctl;
+	struct phy_provider *provider;
+	struct phy **lanes;
+	struct device dev;
+
+	const struct tegra_xusb_lane_ops *ops;
+
+	struct list_head list;
+};
+
+static inline struct tegra_xusb_pad *to_tegra_xusb_pad(struct device *dev)
+{
+	return container_of(dev, struct tegra_xusb_pad, dev);
+}
+
+int tegra_xusb_pad_init(struct tegra_xusb_pad *pad,
+			struct tegra_xusb_padctl *padctl,
+			struct device_node *np);
+int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
+			    const struct phy_ops *ops);
+void tegra_xusb_pad_unregister(struct tegra_xusb_pad *pad);
+
+struct tegra_xusb_usb2_pad {
+	struct tegra_xusb_pad base;
+
+	struct clk *clk;
+	unsigned int enable;
+	struct mutex lock;
+};
+
+static inline struct tegra_xusb_usb2_pad *
+to_usb2_pad(struct tegra_xusb_pad *pad)
+{
+	return container_of(pad, struct tegra_xusb_usb2_pad, base);
+}
+
+struct tegra_xusb_ulpi_pad {
+	struct tegra_xusb_pad base;
+};
+
+static inline struct tegra_xusb_ulpi_pad *
+to_ulpi_pad(struct tegra_xusb_pad *pad)
+{
+	return container_of(pad, struct tegra_xusb_ulpi_pad, base);
+}
+
+struct tegra_xusb_hsic_pad {
+	struct tegra_xusb_pad base;
+
+	struct regulator *supply;
+	struct clk *clk;
+};
+
+static inline struct tegra_xusb_hsic_pad *
+to_hsic_pad(struct tegra_xusb_pad *pad)
+{
+	return container_of(pad, struct tegra_xusb_hsic_pad, base);
+}
+
+struct tegra_xusb_pcie_pad {
+	struct tegra_xusb_pad base;
+
+	struct reset_control *rst;
+	struct clk *pll;
+
+	unsigned int enable;
+};
+
+static inline struct tegra_xusb_pcie_pad *
+to_pcie_pad(struct tegra_xusb_pad *pad)
+{
+	return container_of(pad, struct tegra_xusb_pcie_pad, base);
+}
+
+struct tegra_xusb_sata_pad {
+	struct tegra_xusb_pad base;
+
+	struct reset_control *rst;
+	struct clk *pll;
+
+	unsigned int enable;
+};
+
+static inline struct tegra_xusb_sata_pad *
+to_sata_pad(struct tegra_xusb_pad *pad)
+{
+	return container_of(pad, struct tegra_xusb_sata_pad, base);
+}
+
+/*
+ * ports
+ */
+struct tegra_xusb_port_ops;
+
+struct tegra_xusb_port {
+	struct tegra_xusb_padctl *padctl;
+	struct tegra_xusb_lane *lane;
+	unsigned int index;
+
+	struct list_head list;
+	struct device dev;
+
+	const struct tegra_xusb_port_ops *ops;
+};
+
+struct tegra_xusb_lane_map {
+	unsigned int port;
+	const char *type;
+	unsigned int index;
+	const char *func;
+};
+
+struct tegra_xusb_lane *
+tegra_xusb_port_find_lane(struct tegra_xusb_port *port,
+			  const struct tegra_xusb_lane_map *map,
+			  const char *function);
+
+struct tegra_xusb_port *
+tegra_xusb_find_port(struct tegra_xusb_padctl *padctl, const char *type,
+		     unsigned int index);
+
+struct tegra_xusb_usb2_port {
+	struct tegra_xusb_port base;
+
+	struct regulator *supply;
+	bool internal;
+};
+
+static inline struct tegra_xusb_usb2_port *
+to_usb2_port(struct tegra_xusb_port *port)
+{
+	return container_of(port, struct tegra_xusb_usb2_port, base);
+}
+
+struct tegra_xusb_usb2_port *
+tegra_xusb_find_usb2_port(struct tegra_xusb_padctl *padctl,
+			  unsigned int index);
+
+struct tegra_xusb_ulpi_port {
+	struct tegra_xusb_port base;
+
+	struct regulator *supply;
+	bool internal;
+};
+
+static inline struct tegra_xusb_ulpi_port *
+to_ulpi_port(struct tegra_xusb_port *port)
+{
+	return container_of(port, struct tegra_xusb_ulpi_port, base);
+}
+
+struct tegra_xusb_hsic_port {
+	struct tegra_xusb_port base;
+};
+
+static inline struct tegra_xusb_hsic_port *
+to_hsic_port(struct tegra_xusb_port *port)
+{
+	return container_of(port, struct tegra_xusb_hsic_port, base);
+}
+
+struct tegra_xusb_usb3_port {
+	struct tegra_xusb_port base;
+	struct regulator *supply;
+	bool context_saved;
+	unsigned int port;
+	bool internal;
+
+	u32 tap1;
+	u32 amp;
+	u32 ctle_z;
+	u32 ctle_g;
+};
+
+static inline struct tegra_xusb_usb3_port *
+to_usb3_port(struct tegra_xusb_port *port)
+{
+	return container_of(port, struct tegra_xusb_usb3_port, base);
+}
+
+struct tegra_xusb_usb3_port *
+tegra_xusb_find_usb3_port(struct tegra_xusb_padctl *padctl,
+			  unsigned int index);
+
+struct tegra_xusb_port_ops {
+	int (*enable)(struct tegra_xusb_port *port);
+	void (*disable)(struct tegra_xusb_port *port);
+	struct tegra_xusb_lane *(*map)(struct tegra_xusb_port *port);
+};
+
+/*
+ * pad controller
+ */
+struct tegra_xusb_padctl_soc;
+
+struct tegra_xusb_padctl_ops {
+	struct tegra_xusb_padctl *
+		(*probe)(struct device *dev,
+			 const struct tegra_xusb_padctl_soc *soc);
+	void (*remove)(struct tegra_xusb_padctl *padctl);
+
+	int (*usb3_save_context)(struct tegra_xusb_padctl *padctl,
+				 unsigned int index);
+	int (*hsic_set_idle)(struct tegra_xusb_padctl *padctl,
+			     unsigned int index, bool idle);
+	int (*usb3_set_lfps_detect)(struct tegra_xusb_padctl *padctl,
+				    unsigned int index, bool enable);
+};
+
+struct tegra_xusb_padctl_soc {
+	const struct tegra_xusb_pad_soc * const *pads;
+	unsigned int num_pads;
+
+	struct {
+		struct {
+			const struct tegra_xusb_port_ops *ops;
+			unsigned int count;
+		} usb2, ulpi, hsic, usb3;
+	} ports;
+
+	const struct tegra_xusb_padctl_ops *ops;
+};
+
+struct tegra_xusb_padctl {
+	struct device *dev;
+	void __iomem *regs;
+	struct mutex lock;
+	struct reset_control *rst;
+
+	const struct tegra_xusb_padctl_soc *soc;
+
+	struct tegra_xusb_pad *pcie;
+	struct tegra_xusb_pad *sata;
+	struct tegra_xusb_pad *ulpi;
+	struct tegra_xusb_pad *usb2;
+	struct tegra_xusb_pad *hsic;
+
+	struct list_head ports;
+	struct list_head lanes;
+	struct list_head pads;
+
+	unsigned int enable;
+
+	struct clk *clk;
+};
+
+static inline void padctl_writel(struct tegra_xusb_padctl *padctl, u32 value,
+				 unsigned long offset)
+{
+	dev_dbg(padctl->dev, "%08lx < %08x\n", offset, value);
+	writel(value, padctl->regs + offset);
+}
+
+static inline u32 padctl_readl(struct tegra_xusb_padctl *padctl,
+			       unsigned long offset)
+{
+	u32 value = readl(padctl->regs + offset);
+	dev_dbg(padctl->dev, "%08lx > %08x\n", offset, value);
+	return value;
+}
+
+struct tegra_xusb_lane *tegra_xusb_find_lane(struct tegra_xusb_padctl *padctl,
+					     const char *name,
+					     unsigned int index);
+
+#if defined(CONFIG_ARCH_TEGRA_124_SOC) || defined(CONFIG_ARCH_TEGRA_132_SOC)
+extern const struct tegra_xusb_padctl_soc tegra124_xusb_padctl_soc;
+#endif
+#if defined(CONFIG_ARCH_TEGRA_210_SOC)
+extern const struct tegra_xusb_padctl_soc tegra210_xusb_padctl_soc;
+#endif
+
+#endif /* __PHY_TEGRA_XUSB_H */
diff --git a/drivers/pinctrl/tegra/pinctrl-tegra-xusb.c b/drivers/pinctrl/tegra/pinctrl-tegra-xusb.c
index 2f06029c9405..946cda3fee35 100644
--- a/drivers/pinctrl/tegra/pinctrl-tegra-xusb.c
+++ b/drivers/pinctrl/tegra/pinctrl-tegra-xusb.c
@@ -873,7 +873,7 @@ static const struct of_device_id tegra_xusb_padctl_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
 
-static int tegra_xusb_padctl_probe(struct platform_device *pdev)
+int tegra_xusb_padctl_legacy_probe(struct platform_device *pdev)
 {
 	struct tegra_xusb_padctl *padctl;
 	const struct of_device_id *match;
@@ -955,8 +955,9 @@ reset:
 	reset_control_assert(padctl->rst);
 	return err;
 }
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_legacy_probe);
 
-static int tegra_xusb_padctl_remove(struct platform_device *pdev)
+int tegra_xusb_padctl_legacy_remove(struct platform_device *pdev)
 {
 	struct tegra_xusb_padctl *padctl = platform_get_drvdata(pdev);
 	int err;
@@ -969,17 +970,4 @@ static int tegra_xusb_padctl_remove(struct platform_device *pdev)
 
 	return err;
 }
-
-static struct platform_driver tegra_xusb_padctl_driver = {
-	.driver = {
-		.name = "tegra-xusb-padctl",
-		.of_match_table = tegra_xusb_padctl_of_match,
-	},
-	.probe = tegra_xusb_padctl_probe,
-	.remove = tegra_xusb_padctl_remove,
-};
-module_platform_driver(tegra_xusb_padctl_driver);
-
-MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
-MODULE_DESCRIPTION("Tegra 124 XUSB Pad Control driver");
-MODULE_LICENSE("GPL v2");
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_legacy_remove);
diff --git a/include/linux/phy/tegra/xusb.h b/include/linux/phy/tegra/xusb.h
new file mode 100644
index 000000000000..8e1a57a78d9f
--- /dev/null
+++ b/include/linux/phy/tegra/xusb.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef PHY_TEGRA_XUSB_H
+#define PHY_TEGRA_XUSB_H
+
+struct tegra_xusb_padctl;
+struct device;
+
+struct tegra_xusb_padctl *tegra_xusb_padctl_get(struct device *dev);
+void tegra_xusb_padctl_put(struct tegra_xusb_padctl *padctl);
+
+int tegra_xusb_padctl_usb3_save_context(struct tegra_xusb_padctl *padctl,
+					unsigned int port);
+int tegra_xusb_padctl_hsic_set_idle(struct tegra_xusb_padctl *padctl,
+				    unsigned int port, bool idle);
+int tegra_xusb_padctl_usb3_set_lfps_detect(struct tegra_xusb_padctl *padctl,
+					   unsigned int port, bool enable);
+
+#endif /* PHY_TEGRA_XUSB_H */
-- 
cgit v1.2.3


From f4b05d27ec6b032ca504591e2a157b058b6f172f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 28 Apr 2016 17:59:28 +0200
Subject: net: constify is_skb_forwardable's arguments

is_skb_forwardable is not supposed to change anything so constify its
arguments

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 net/core/dev.c            | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 934ca866562d..52914a854386 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3264,7 +3264,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb);
+bool is_skb_forwardable(const struct net_device *dev,
+			const struct sk_buff *skb);
 
 extern int		netdev_budget;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c2f3d5dbde56..d91dfbec0fc6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
 			__net_timestamp(SKB);		\
 	}						\
 
-bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 {
 	unsigned int len;
 
-- 
cgit v1.2.3


From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 Apr 2016 12:04:13 -0400
Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header

The functions trace_buffer_unlock_commit() and the _regs() version are only
used within the kernel/trace directory. Move them to the local header and
remove the export as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  9 ---------
 kernel/trace/trace.c         |  2 --
 kernel/trace/trace.h         | 10 ++++++++++
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index bb383af35cc7..48cc5e19c5f5 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -158,15 +158,6 @@ struct ring_buffer_event *
 trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				  int type, unsigned long len,
 				  unsigned long flags, int pc);
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc);
-void trace_buffer_unlock_commit_regs(struct trace_array *tr,
-				     struct ring_buffer *buffer,
-				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
-				     struct pt_regs *regs);
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ba54e241c8d..94e7e4d11b79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
 
 static struct ring_buffer *temp_buffer;
 
@@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
 
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0862e7559548..bd5ae56dec7a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1068,6 +1068,16 @@ struct trace_subsystem_dir {
 extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event,
+				     unsigned long flags, int pc,
+				     struct pt_regs *regs);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 16:12:39 -0400
Subject: tracing: Remove unused function trace_current_buffer_discard_commit()

The function trace_current_buffer_discard_commit() has no callers, remove
it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 2 --
 kernel/trace/trace.c         | 8 --------
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 48cc5e19c5f5..356c39b3abbb 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -158,8 +158,6 @@ struct ring_buffer_event *
 trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				  int type, unsigned long len,
 				  unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event);
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 94e7e4d11b79..e5bdb9accf52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
 	return trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
@@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event)
-{
-	ring_buffer_discard_commit(buffer, event);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
-
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
-- 
cgit v1.2.3


From d745098cedb3f5c6a554796d4a3a505abd4ebaa6 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:33 +0300
Subject: net/mlx5: Introduce modify flow rule destination

This API is used for modifying the flow rule destination.
This is needed for modifying the pointed flow table by the
traffic type classifier rules to point on the aRFS tables.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++--
 include/linux/mlx5/fs.h                           | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 89cce97d46c6..bb2c1cd35fd7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -615,8 +615,8 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
 	return err;
 }
 
-static int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-					struct mlx5_flow_destination *dest)
+int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+				 struct mlx5_flow_destination *dest)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 8dec5508d93d..28a5b662ab6a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -113,4 +113,7 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		   struct mlx5_flow_destination *dest);
 void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
 
+int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+				 struct mlx5_flow_destination *dest);
+
 #endif
-- 
cgit v1.2.3


From d63cd28608bb563d52e62990fa01c016e8dbdb75 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:35 +0300
Subject: net/mlx5: Add user chosen levels when allocating flow tables

Currently, consumers of the flow steering infrastructure can't
choose their own flow table levels and are limited to one
flow table per level. This just waste levels.
Instead, we introduce here the possibility to use multiple
flow tables in a level. The user is free to connect these
flow tables, while following the rule (FTEs in FT of level x
could only point to FTs of level y where y > x).

In addition this patch switch the order of the create/destroy
flow tables of the NIC(vlan and main).

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                 |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c   | 30 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 66 +++++++++++++++--------
 include/linux/mlx5/fs.h                           |  6 ++-
 6 files changed, 70 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 99eb1c1a3b7b..3ff663c35bac 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1438,7 +1438,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 	if (!ft) {
 		ft = mlx5_create_auto_grouped_flow_table(ns, priority,
 							 num_entries,
-							 num_groups);
+							 num_groups,
+							 0);
 
 		if (!IS_ERR(ft)) {
 			prio->refcount = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 4df49e660587..d61171ae0168 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -37,6 +37,11 @@
 #include <linux/mlx5/fs.h>
 #include "en.h"
 
+enum {
+	MLX5E_VLAN_FT_LEVEL = 0,
+	MLX5E_MAIN_FT_LEVEL
+};
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 enum {
@@ -1041,7 +1046,8 @@ static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv)
 	int err;
 
 	ft->num_groups = 0;
-	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_MAIN_TABLE_SIZE);
+	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_MAIN_TABLE_SIZE,
+				       MLX5E_MAIN_FT_LEVEL);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
@@ -1150,7 +1156,8 @@ static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv)
 	int err;
 
 	ft->num_groups = 0;
-	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_VLAN_TABLE_SIZE);
+	ft->t = mlx5_create_flow_table(priv->fts.ns, 1, MLX5E_VLAN_TABLE_SIZE,
+				       MLX5E_VLAN_FT_LEVEL);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
@@ -1167,11 +1174,16 @@ static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv)
 	if (err)
 		goto err_free_g;
 
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+	if (err)
+		goto err_destroy_vlan_flow_groups;
+
 	return 0;
 
+err_destroy_vlan_flow_groups:
+	mlx5e_destroy_groups(ft);
 err_free_g:
 	kfree(ft->g);
-
 err_destroy_vlan_flow_table:
 	mlx5_destroy_flow_table(ft->t);
 	ft->t = NULL;
@@ -1194,15 +1206,11 @@ int mlx5e_create_flow_tables(struct mlx5e_priv *priv)
 	if (!priv->fts.ns)
 		return -EINVAL;
 
-	err = mlx5e_create_vlan_flow_table(priv);
-	if (err)
-		return err;
-
 	err = mlx5e_create_main_flow_table(priv);
 	if (err)
-		goto err_destroy_vlan_flow_table;
+		return err;
 
-	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+	err = mlx5e_create_vlan_flow_table(priv);
 	if (err)
 		goto err_destroy_main_flow_table;
 
@@ -1210,8 +1218,6 @@ int mlx5e_create_flow_tables(struct mlx5e_priv *priv)
 
 err_destroy_main_flow_table:
 	mlx5e_destroy_main_flow_table(priv);
-err_destroy_vlan_flow_table:
-	mlx5e_destroy_vlan_flow_table(priv);
 
 	return err;
 }
@@ -1219,6 +1225,6 @@ err_destroy_vlan_flow_table:
 void mlx5e_destroy_flow_tables(struct mlx5e_priv *priv)
 {
 	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
-	mlx5e_destroy_main_flow_table(priv);
 	mlx5e_destroy_vlan_flow_table(priv);
+	mlx5e_destroy_main_flow_table(priv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b3de09f13425..2137387c043d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -64,7 +64,8 @@ static struct mlx5_flow_rule *mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 		priv->fts.tc.t =
 			mlx5_create_auto_grouped_flow_table(priv->fts.ns, 0,
 							    MLX5E_TC_FLOW_TABLE_NUM_ENTRIES,
-							    MLX5E_TC_FLOW_TABLE_NUM_GROUPS);
+							    MLX5E_TC_FLOW_TABLE_NUM_GROUPS,
+							    0);
 		if (IS_ERR(priv->fts.tc.t)) {
 			netdev_err(priv->netdev,
 				   "Failed to create tc offload table\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index bc3d9f8a75c1..ff91bb5e1c43 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -401,7 +401,7 @@ static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
 	memset(flow_group_in, 0, inlen);
 
 	table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
-	fdb = mlx5_create_flow_table(root_ns, 0, table_size);
+	fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0);
 	if (IS_ERR_OR_NULL(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create FDB Table err %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cfb35c334cd1..ca55d7e30e5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -225,19 +225,6 @@ static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns,
 	return NULL;
 }
 
-static unsigned int find_next_free_level(struct fs_prio *prio)
-{
-	if (!list_empty(&prio->node.children)) {
-		struct mlx5_flow_table *ft;
-
-		ft = list_last_entry(&prio->node.children,
-				     struct mlx5_flow_table,
-				     node.list);
-		return ft->level + 1;
-	}
-	return prio->start_level;
-}
-
 static bool masked_memcmp(void *mask, void *val1, void *val2, size_t size)
 {
 	unsigned int i;
@@ -696,9 +683,23 @@ static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table
 	return err;
 }
 
+static void list_add_flow_table(struct mlx5_flow_table *ft,
+				struct fs_prio *prio)
+{
+	struct list_head *prev = &prio->node.children;
+	struct mlx5_flow_table *iter;
+
+	fs_for_each_ft(iter, prio) {
+		if (iter->level > ft->level)
+			break;
+		prev = &iter->node.list;
+	}
+	list_add(&ft->node.list, prev);
+}
+
 struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
-					       int prio,
-					       int max_fte)
+					       int prio, int max_fte,
+					       u32 level)
 {
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_table *ft;
@@ -719,12 +720,15 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		err = -EINVAL;
 		goto unlock_root;
 	}
-	if (fs_prio->num_ft == fs_prio->num_levels) {
+	if (level >= fs_prio->num_levels) {
 		err = -ENOSPC;
 		goto unlock_root;
 	}
-
-	ft = alloc_flow_table(find_next_free_level(fs_prio),
+	/* The level is related to the
+	 * priority level range.
+	 */
+	level += fs_prio->start_level;
+	ft = alloc_flow_table(level,
 			      roundup_pow_of_two(max_fte),
 			      root->table_type);
 	if (!ft) {
@@ -745,7 +749,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		goto destroy_ft;
 	lock_ref_node(&fs_prio->node);
 	tree_add_node(&ft->node, &fs_prio->node);
-	list_add_tail(&ft->node.list, &fs_prio->node.children);
+	list_add_flow_table(ft, fs_prio);
 	fs_prio->num_ft++;
 	unlock_ref_node(&fs_prio->node);
 	mutex_unlock(&root->chain_lock);
@@ -762,14 +766,15 @@ unlock_root:
 struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 							    int prio,
 							    int num_flow_table_entries,
-							    int max_num_groups)
+							    int max_num_groups,
+							    u32 level)
 {
 	struct mlx5_flow_table *ft;
 
 	if (max_num_groups > num_flow_table_entries)
 		return ERR_PTR(-EINVAL);
 
-	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries);
+	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries, level);
 	if (IS_ERR(ft))
 		return ft;
 
@@ -1068,6 +1073,20 @@ unlock_fg:
 	return rule;
 }
 
+static bool dest_is_valid(struct mlx5_flow_destination *dest,
+			  u32 action,
+			  struct mlx5_flow_table *ft)
+{
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
+		return true;
+
+	if (!dest || ((dest->type ==
+	    MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) &&
+	    (dest->ft->level <= ft->level)))
+		return false;
+	return true;
+}
+
 static struct mlx5_flow_rule *
 _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		    u8 match_criteria_enable,
@@ -1080,7 +1099,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_rule *rule;
 
-	if ((action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) && !dest)
+	if (!dest_is_valid(dest, action, ft))
 		return ERR_PTR(-EINVAL);
 
 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
@@ -1517,6 +1536,7 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns)
 
 #define ANCHOR_PRIO 0
 #define ANCHOR_SIZE 1
+#define ANCHOR_LEVEL 0
 static int create_anchor_flow_table(struct mlx5_core_dev
 							*dev)
 {
@@ -1526,7 +1546,7 @@ static int create_anchor_flow_table(struct mlx5_core_dev
 	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ANCHOR);
 	if (!ns)
 		return -EINVAL;
-	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE);
+	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL);
 	if (IS_ERR(ft)) {
 		mlx5_core_err(dev, "Failed to create last anchor flow table");
 		return PTR_ERR(ft);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 28a5b662ab6a..165ff4f9cc6a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -82,12 +82,14 @@ struct mlx5_flow_table *
 mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 				    int prio,
 				    int num_flow_table_entries,
-				    int max_num_groups);
+				    int max_num_groups,
+				    u32 level);
 
 struct mlx5_flow_table *
 mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
-		       int num_flow_table_entries);
+		       int num_flow_table_entries,
+		       u32 level);
 int mlx5_destroy_flow_table(struct mlx5_flow_table *ft);
 
 /* inbox should be set with the following values:
-- 
cgit v1.2.3


From 5a7b27eb9cf3986f487469b57a3a41286d2e7100 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Fri, 29 Apr 2016 01:36:39 +0300
Subject: net/mlx5: Initializing CPU reverse mapping

Allocating CPU rmap and add entry for each IRQ.
CPU rmap is used in aRFS to get the RX queue number
of the RX completion interrupts.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 +++
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 18 ++++++++++++++++++
 include/linux/mlx5/driver.h                       |  3 +++
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 518192e59779..8fee224fb98c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1691,6 +1691,9 @@ int mlx5e_open_locked(struct net_device *netdev)
 	mlx5e_redirect_rqts(priv);
 	mlx5e_update_carrier(priv);
 	mlx5e_timestamp_init(priv);
+#ifdef CONFIG_RFS_ACCEL
+	priv->netdev->rx_cpu_rmap = priv->mdev->rmap;
+#endif
 
 	schedule_delayed_work(&priv->update_stats_work, 0);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6892746fd10d..6feef7fb9d6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -48,6 +48,9 @@
 #include <linux/kmod.h>
 #include <linux/delay.h>
 #include <linux/mlx5/mlx5_ifc.h>
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif
 #include "mlx5_core.h"
 #include "fs_core.h"
 #ifdef CONFIG_MLX5_CORE_EN
@@ -665,6 +668,12 @@ static void free_comp_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	struct mlx5_eq *eq, *n;
 
+#ifdef CONFIG_RFS_ACCEL
+	if (dev->rmap) {
+		free_irq_cpu_rmap(dev->rmap);
+		dev->rmap = NULL;
+	}
+#endif
 	spin_lock(&table->lock);
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
 		list_del(&eq->list);
@@ -691,6 +700,11 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 	INIT_LIST_HEAD(&table->comp_eqs_list);
 	ncomp_vec = table->num_comp_vectors;
 	nent = MLX5_COMP_EQ_SIZE;
+#ifdef CONFIG_RFS_ACCEL
+	dev->rmap = alloc_irq_cpu_rmap(ncomp_vec);
+	if (!dev->rmap)
+		return -ENOMEM;
+#endif
 	for (i = 0; i < ncomp_vec; i++) {
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
 		if (!eq) {
@@ -698,6 +712,10 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 			goto clean;
 		}
 
+#ifdef CONFIG_RFS_ACCEL
+		irq_cpu_rmap_add(dev->rmap,
+				 dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector);
+#endif
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 96a428dcac9f..d5529449ef47 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -560,6 +560,9 @@ struct mlx5_core_dev {
 	struct mlx5_profile	*profile;
 	atomic_t		num_qps;
 	u32			issi;
+#ifdef CONFIG_RFS_ACCEL
+	struct cpu_rmap         *rmap;
+#endif
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From f09d90864eb7cc00cadbbfd083b4eff84c167981 Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Thu, 28 Apr 2016 16:34:08 +0200
Subject: livepatch: make object/func-walking helpers more robust

Current object-walking helper checks the presence of obj->funcs to
determine the end of objs array in klp_object structure. This is
somewhat fragile because one can easily forget about funcs definition
during livepatch creation. In such a case the livepatch module is
successfully loaded and all objects after the incorrect one are omitted.
This is very confusing. Let's make the helper more robust and check also
for the other external member, name. Thus the helper correctly stops on
an empty item of the array. We need to have a check for obj->funcs in
klp_init_object() to make it work.

The same applies to a func-walking helper.

As a benefit we'll check for new_func member definition during the
livepatch initialization. There is no such check anywhere in the code
now.

[jkosina@suse.cz: fix shortlog]
Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Jessica Yu <jeyu@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/livepatch.h | 6 ++++--
 kernel/livepatch/core.c   | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index bd830d590465..90843ccb7852 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -124,10 +124,12 @@ struct klp_patch {
 };
 
 #define klp_for_each_object(patch, obj) \
-	for (obj = patch->objs; obj->funcs; obj++)
+	for (obj = patch->objs; obj->funcs || obj->name; obj++)
 
 #define klp_for_each_func(obj, func) \
-	for (func = obj->funcs; func->old_name; func++)
+	for (func = obj->funcs; \
+	     func->old_name || func->new_func || func->old_sympos; \
+	     func++)
 
 int klp_register_patch(struct klp_patch *);
 int klp_unregister_patch(struct klp_patch *);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index d68fbf63b083..ddef649a2914 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -683,6 +683,9 @@ static void klp_free_patch(struct klp_patch *patch)
 
 static int klp_init_func(struct klp_object *obj, struct klp_func *func)
 {
+	if (!func->old_name || !func->new_func)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&func->stack_node);
 	func->state = KLP_DISABLED;
 
-- 
cgit v1.2.3


From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 18:11:54 -0400
Subject: tracing: Remove unused function trace_current_buffer_lock_reserve()

trace_current_buffer_lock_reserve() has no more users. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  4 ----
 kernel/trace/trace.c         | 10 ----------
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 356c39b3abbb..3111a1efdad6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -154,10 +154,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				struct trace_event_file *trace_file,
 				int type, unsigned long len,
 				unsigned long flags, int pc);
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc);
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 41bf14412666..c09e8ffadc73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc)
-{
-	*current_rb = global_trace.trace_buffer.buffer;
-	return trace_buffer_lock_reserve(*current_rb,
-					 type, len, flags, pc);
-}
-
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
-- 
cgit v1.2.3


From 3ed605bc8a0a688d8750a1e2eff39c854418c5cf Mon Sep 17 00:00:00 2001
From: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Date: Tue, 26 Apr 2016 12:32:27 -0300
Subject: kernel.h: add u64_to_user_ptr()

This function had copies in 3 different files. Unify them in kernel.h.

Cc: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Rob Clark <robdclark@gmail.com>
Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Acked-by: Daniel Vetter <daniel.vetter@intel.com>	[drm/i915/]
Acked-by: Rob Clark <robdclark@gmail.com>		[drm/msm/]
Acked-by: Lucas Stach <l.stach@pengutronix.de>		[drm/etinav/]
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 11 +++--------
 drivers/gpu/drm/i915/i915_drv.h              |  5 -----
 drivers/gpu/drm/i915/i915_gem.c              | 14 +++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c   | 14 +++++++-------
 drivers/gpu/drm/msm/msm_gem_submit.c         | 11 +++--------
 include/linux/kernel.h                       |  7 +++++++
 6 files changed, 27 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 236ada93df53..afdd55ddf821 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -28,11 +28,6 @@
 #define BO_LOCKED   0x4000
 #define BO_PINNED   0x2000
 
-static inline void __user *to_user_ptr(u64 address)
-{
-	return (void __user *)(uintptr_t)address;
-}
-
 static struct etnaviv_gem_submit *submit_create(struct drm_device *dev,
 		struct etnaviv_gpu *gpu, size_t nr)
 {
@@ -347,21 +342,21 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 	cmdbuf->exec_state = args->exec_state;
 	cmdbuf->ctx = file->driver_priv;
 
-	ret = copy_from_user(bos, to_user_ptr(args->bos),
+	ret = copy_from_user(bos, u64_to_user_ptr(args->bos),
 			     args->nr_bos * sizeof(*bos));
 	if (ret) {
 		ret = -EFAULT;
 		goto err_submit_cmds;
 	}
 
-	ret = copy_from_user(relocs, to_user_ptr(args->relocs),
+	ret = copy_from_user(relocs, u64_to_user_ptr(args->relocs),
 			     args->nr_relocs * sizeof(*relocs));
 	if (ret) {
 		ret = -EFAULT;
 		goto err_submit_cmds;
 	}
 
-	ret = copy_from_user(stream, to_user_ptr(args->stream),
+	ret = copy_from_user(stream, u64_to_user_ptr(args->stream),
 			     args->stream_size);
 	if (ret) {
 		ret = -EFAULT;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 10480939159c..bb624ccabcb4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3576,11 +3576,6 @@ static inline i915_reg_t i915_vgacntrl_reg(struct drm_device *dev)
 		return VGACNTRL;
 }
 
-static inline void __user *to_user_ptr(u64 address)
-{
-	return (void __user *)(uintptr_t)address;
-}
-
 static inline unsigned long msecs_to_jiffies_timeout(const unsigned int m)
 {
 	unsigned long j = msecs_to_jiffies(m);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index dabc08987b5e..288971670759 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -324,7 +324,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 {
 	struct drm_device *dev = obj->base.dev;
 	void *vaddr = obj->phys_handle->vaddr + args->offset;
-	char __user *user_data = to_user_ptr(args->data_ptr);
+	char __user *user_data = u64_to_user_ptr(args->data_ptr);
 	int ret = 0;
 
 	/* We manually control the domain here and pretend that it
@@ -605,7 +605,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 	int needs_clflush = 0;
 	struct sg_page_iter sg_iter;
 
-	user_data = to_user_ptr(args->data_ptr);
+	user_data = u64_to_user_ptr(args->data_ptr);
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
@@ -692,7 +692,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 		return 0;
 
 	if (!access_ok(VERIFY_WRITE,
-		       to_user_ptr(args->data_ptr),
+		       u64_to_user_ptr(args->data_ptr),
 		       args->size))
 		return -EFAULT;
 
@@ -783,7 +783,7 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev,
 	if (ret)
 		goto out_unpin;
 
-	user_data = to_user_ptr(args->data_ptr);
+	user_data = u64_to_user_ptr(args->data_ptr);
 	remain = args->size;
 
 	offset = i915_gem_obj_ggtt_offset(obj) + args->offset;
@@ -907,7 +907,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 	int needs_clflush_before = 0;
 	struct sg_page_iter sg_iter;
 
-	user_data = to_user_ptr(args->data_ptr);
+	user_data = u64_to_user_ptr(args->data_ptr);
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
@@ -1036,12 +1036,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 		return 0;
 
 	if (!access_ok(VERIFY_READ,
-		       to_user_ptr(args->data_ptr),
+		       u64_to_user_ptr(args->data_ptr),
 		       args->size))
 		return -EFAULT;
 
 	if (likely(!i915.prefault_disable)) {
-		ret = fault_in_multipages_readable(to_user_ptr(args->data_ptr),
+		ret = fault_in_multipages_readable(u64_to_user_ptr(args->data_ptr),
 						   args->size);
 		if (ret)
 			return -EFAULT;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1328bc5021b4..e60b4e72a5e4 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -514,7 +514,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	int remain, ret;
 
-	user_relocs = to_user_ptr(entry->relocs_ptr);
+	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
 
 	remain = entry->relocation_count;
 	while (remain) {
@@ -865,7 +865,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		u64 invalid_offset = (u64)-1;
 		int j;
 
-		user_relocs = to_user_ptr(exec[i].relocs_ptr);
+		user_relocs = u64_to_user_ptr(exec[i].relocs_ptr);
 
 		if (copy_from_user(reloc+total, user_relocs,
 				   exec[i].relocation_count * sizeof(*reloc))) {
@@ -1009,7 +1009,7 @@ validate_exec_list(struct drm_device *dev,
 		invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
 
 	for (i = 0; i < count; i++) {
-		char __user *ptr = to_user_ptr(exec[i].relocs_ptr);
+		char __user *ptr = u64_to_user_ptr(exec[i].relocs_ptr);
 		int length; /* limited by fault_in_pages_readable() */
 
 		if (exec[i].flags & invalid_flags)
@@ -1696,7 +1696,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		return -ENOMEM;
 	}
 	ret = copy_from_user(exec_list,
-			     to_user_ptr(args->buffers_ptr),
+			     u64_to_user_ptr(args->buffers_ptr),
 			     sizeof(*exec_list) * args->buffer_count);
 	if (ret != 0) {
 		DRM_DEBUG("copy %d exec entries failed %d\n",
@@ -1732,7 +1732,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 	ret = i915_gem_do_execbuffer(dev, data, file, &exec2, exec2_list);
 	if (!ret) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
-			to_user_ptr(args->buffers_ptr);
+			u64_to_user_ptr(args->buffers_ptr);
 
 		/* Copy the new buffer offsets back to the user's exec list. */
 		for (i = 0; i < args->buffer_count; i++) {
@@ -1786,7 +1786,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -ENOMEM;
 	}
 	ret = copy_from_user(exec2_list,
-			     to_user_ptr(args->buffers_ptr),
+			     u64_to_user_ptr(args->buffers_ptr),
 			     sizeof(*exec2_list) * args->buffer_count);
 	if (ret != 0) {
 		DRM_DEBUG("copy %d exec entries failed %d\n",
@@ -1799,7 +1799,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 	if (!ret) {
 		/* Copy the new buffer offsets back to the user's exec list. */
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-				   to_user_ptr(args->buffers_ptr);
+				   u64_to_user_ptr(args->buffers_ptr);
 		int i;
 
 		for (i = 0; i < args->buffer_count; i++) {
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 43d2181231c0..23d25283616c 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -28,11 +28,6 @@
 #define BO_LOCKED   0x4000
 #define BO_PINNED   0x2000
 
-static inline void __user *to_user_ptr(u64 address)
-{
-	return (void __user *)(uintptr_t)address;
-}
-
 static struct msm_gem_submit *submit_create(struct drm_device *dev,
 		struct msm_gpu *gpu, int nr)
 {
@@ -68,7 +63,7 @@ static int submit_lookup_objects(struct msm_gem_submit *submit,
 		struct drm_gem_object *obj;
 		struct msm_gem_object *msm_obj;
 		void __user *userptr =
-			to_user_ptr(args->bos + (i * sizeof(submit_bo)));
+			u64_to_user_ptr(args->bos + (i * sizeof(submit_bo)));
 
 		ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo));
 		if (ret) {
@@ -257,7 +252,7 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob
 	for (i = 0; i < nr_relocs; i++) {
 		struct drm_msm_gem_submit_reloc submit_reloc;
 		void __user *userptr =
-			to_user_ptr(relocs + (i * sizeof(submit_reloc)));
+			u64_to_user_ptr(relocs + (i * sizeof(submit_reloc)));
 		uint32_t iova, off;
 		bool valid;
 
@@ -356,7 +351,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	for (i = 0; i < args->nr_cmds; i++) {
 		struct drm_msm_gem_submit_cmd submit_cmd;
 		void __user *userptr =
-			to_user_ptr(args->cmds + (i * sizeof(submit_cmd)));
+			u64_to_user_ptr(args->cmds + (i * sizeof(submit_cmd)));
 		struct msm_gem_object *msm_obj;
 		uint32_t iova;
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2f7775e229b0..f3e45cb0b6a2 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -53,6 +53,13 @@
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
+#define u64_to_user_ptr(x) (		\
+{					\
+	typecheck(u64, x);		\
+	(void __user *)(uintptr_t)x;	\
+}					\
+)
+
 /*
  * This looks more complex than it should be. But we need to
  * get the type for the ~ right in round_down (it needs to be
-- 
cgit v1.2.3


From 460bfc41fd52959311ed0328163f785e023857af Mon Sep 17 00:00:00 2001
From: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Date: Thu, 28 Apr 2016 10:46:57 -0300
Subject: dma-buf/sync_file: de-stage sync_file headers

Move sync_file headers file to include/ dir.

Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/sync.h           |   4 +-
 drivers/staging/android/sync_debug.c     |   2 +-
 drivers/staging/android/sync_file.c      |   4 +-
 drivers/staging/android/sync_file.h      |  57 ------------------
 drivers/staging/android/uapi/sync_file.h | 100 -------------------------------
 include/linux/sync_file.h                |  57 ++++++++++++++++++
 include/uapi/linux/sync_file.h           | 100 +++++++++++++++++++++++++++++++
 7 files changed, 162 insertions(+), 162 deletions(-)
 delete mode 100644 drivers/staging/android/sync_file.h
 delete mode 100644 drivers/staging/android/uapi/sync_file.h
 create mode 100644 include/linux/sync_file.h
 create mode 100644 include/uapi/linux/sync_file.h

(limited to 'include/linux')

diff --git a/drivers/staging/android/sync.h b/drivers/staging/android/sync.h
index df44abb95963..b56885c14839 100644
--- a/drivers/staging/android/sync.h
+++ b/drivers/staging/android/sync.h
@@ -20,8 +20,8 @@
 #include <linux/spinlock.h>
 #include <linux/fence.h>
 
-#include "sync_file.h"
-#include "uapi/sync_file.h"
+#include <linux/sync_file.h>
+#include <uapi/linux/sync_file.h>
 
 struct sync_timeline;
 
diff --git a/drivers/staging/android/sync_debug.c b/drivers/staging/android/sync_debug.c
index 8b55218f5535..5f57499c98bf 100644
--- a/drivers/staging/android/sync_debug.c
+++ b/drivers/staging/android/sync_debug.c
@@ -26,7 +26,7 @@
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
 #include <linux/time64.h>
-#include "sync_file.h"
+#include <linux/sync_file.h>
 #include "sw_sync.h"
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/staging/android/sync_file.c b/drivers/staging/android/sync_file.c
index eabf90dd63b3..f08cf2d8309e 100644
--- a/drivers/staging/android/sync_file.c
+++ b/drivers/staging/android/sync_file.c
@@ -23,8 +23,8 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
-#include "sync_file.h"
-#include "uapi/sync_file.h"
+#include <linux/sync_file.h>
+#include <uapi/linux/sync_file.h>
 
 static const struct file_operations sync_file_fops;
 
diff --git a/drivers/staging/android/sync_file.h b/drivers/staging/android/sync_file.h
deleted file mode 100644
index c6ffe8b0725c..000000000000
--- a/drivers/staging/android/sync_file.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * include/linux/sync_file.h
- *
- * Copyright (C) 2012 Google, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _LINUX_SYNC_FILE_H
-#define _LINUX_SYNC_FILE_H
-
-#include <linux/types.h>
-#include <linux/kref.h>
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/fence.h>
-
-struct sync_file_cb {
-	struct fence_cb cb;
-	struct fence *fence;
-	struct sync_file *sync_file;
-};
-
-/**
- * struct sync_file - sync file to export to the userspace
- * @file:		file representing this fence
- * @kref:		reference count on fence.
- * @name:		name of sync_file.  Useful for debugging
- * @sync_file_list:	membership in global file list
- * @num_fences:		number of sync_pts in the fence
- * @wq:			wait queue for fence signaling
- * @status:		0: signaled, >0:active, <0: error
- * @cbs:		sync_pts callback information
- */
-struct sync_file {
-	struct file		*file;
-	struct kref		kref;
-	char			name[32];
-#ifdef CONFIG_DEBUG_FS
-	struct list_head	sync_file_list;
-#endif
-	int num_fences;
-
-	wait_queue_head_t	wq;
-	atomic_t		status;
-
-	struct sync_file_cb	cbs[];
-};
-
-struct sync_file *sync_file_create(struct fence *fence);
-
-#endif /* _LINUX_SYNC_H */
diff --git a/drivers/staging/android/uapi/sync_file.h b/drivers/staging/android/uapi/sync_file.h
deleted file mode 100644
index 413303d37b56..000000000000
--- a/drivers/staging/android/uapi/sync_file.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2012 Google, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _UAPI_LINUX_SYNC_H
-#define _UAPI_LINUX_SYNC_H
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-/**
- * struct sync_merge_data - data passed to merge ioctl
- * @name:	name of new fence
- * @fd2:	file descriptor of second fence
- * @fence:	returns the fd of the new fence to userspace
- * @flags:	merge_data flags
- * @pad:	padding for 64-bit alignment, should always be zero
- */
-struct sync_merge_data {
-	char	name[32];
-	__s32	fd2;
-	__s32	fence;
-	__u32	flags;
-	__u32	pad;
-};
-
-/**
- * struct sync_fence_info - detailed fence information
- * @obj_name:		name of parent sync_timeline
-* @driver_name:	name of driver implementing the parent
-* @status:		status of the fence 0:active 1:signaled <0:error
- * @flags:		fence_info flags
- * @timestamp_ns:	timestamp of status change in nanoseconds
- */
-struct sync_fence_info {
-	char	obj_name[32];
-	char	driver_name[32];
-	__s32	status;
-	__u32	flags;
-	__u64	timestamp_ns;
-};
-
-/**
- * struct sync_file_info - data returned from fence info ioctl
- * @name:	name of fence
- * @status:	status of fence. 1: signaled 0:active <0:error
- * @flags:	sync_file_info flags
- * @num_fences	number of fences in the sync_file
- * @pad:	padding for 64-bit alignment, should always be zero
- * @sync_fence_info: pointer to array of structs sync_fence_info with all
- *		 fences in the sync_file
- */
-struct sync_file_info {
-	char	name[32];
-	__s32	status;
-	__u32	flags;
-	__u32	num_fences;
-	__u32	pad;
-
-	__u64	sync_fence_info;
-};
-
-#define SYNC_IOC_MAGIC		'>'
-
-/**
- * Opcodes  0, 1 and 2 were burned during a API change to avoid users of the
- * old API to get weird errors when trying to handling sync_files. The API
- * change happened during the de-stage of the Sync Framework when there was
- * no upstream users available.
- */
-
-/**
- * DOC: SYNC_IOC_MERGE - merge two fences
- *
- * Takes a struct sync_merge_data.  Creates a new fence containing copies of
- * the sync_pts in both the calling fd and sync_merge_data.fd2.  Returns the
- * new fence's fd in sync_merge_data.fence
- */
-#define SYNC_IOC_MERGE		_IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
-
-/**
- * DOC: SYNC_IOC_FENCE_INFO - get detailed information on a fence
- *
- * Takes a struct sync_file_info_data with extra space allocated for pt_info.
- * Caller should write the size of the buffer into len.  On return, len is
- * updated to reflect the total size of the sync_file_info_data including
- * pt_info.
- *
- * pt_info is a buffer containing sync_pt_infos for every sync_pt in the fence.
- * To iterate over the sync_pt_infos, use the sync_pt_info.len field.
- */
-#define SYNC_IOC_FILE_INFO	_IOWR(SYNC_IOC_MAGIC, 4, struct sync_file_info)
-
-#endif /* _UAPI_LINUX_SYNC_H */
diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h
new file mode 100644
index 000000000000..c6ffe8b0725c
--- /dev/null
+++ b/include/linux/sync_file.h
@@ -0,0 +1,57 @@
+/*
+ * include/linux/sync_file.h
+ *
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_SYNC_FILE_H
+#define _LINUX_SYNC_FILE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/fence.h>
+
+struct sync_file_cb {
+	struct fence_cb cb;
+	struct fence *fence;
+	struct sync_file *sync_file;
+};
+
+/**
+ * struct sync_file - sync file to export to the userspace
+ * @file:		file representing this fence
+ * @kref:		reference count on fence.
+ * @name:		name of sync_file.  Useful for debugging
+ * @sync_file_list:	membership in global file list
+ * @num_fences:		number of sync_pts in the fence
+ * @wq:			wait queue for fence signaling
+ * @status:		0: signaled, >0:active, <0: error
+ * @cbs:		sync_pts callback information
+ */
+struct sync_file {
+	struct file		*file;
+	struct kref		kref;
+	char			name[32];
+#ifdef CONFIG_DEBUG_FS
+	struct list_head	sync_file_list;
+#endif
+	int num_fences;
+
+	wait_queue_head_t	wq;
+	atomic_t		status;
+
+	struct sync_file_cb	cbs[];
+};
+
+struct sync_file *sync_file_create(struct fence *fence);
+
+#endif /* _LINUX_SYNC_H */
diff --git a/include/uapi/linux/sync_file.h b/include/uapi/linux/sync_file.h
new file mode 100644
index 000000000000..413303d37b56
--- /dev/null
+++ b/include/uapi/linux/sync_file.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_SYNC_H
+#define _UAPI_LINUX_SYNC_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * struct sync_merge_data - data passed to merge ioctl
+ * @name:	name of new fence
+ * @fd2:	file descriptor of second fence
+ * @fence:	returns the fd of the new fence to userspace
+ * @flags:	merge_data flags
+ * @pad:	padding for 64-bit alignment, should always be zero
+ */
+struct sync_merge_data {
+	char	name[32];
+	__s32	fd2;
+	__s32	fence;
+	__u32	flags;
+	__u32	pad;
+};
+
+/**
+ * struct sync_fence_info - detailed fence information
+ * @obj_name:		name of parent sync_timeline
+* @driver_name:	name of driver implementing the parent
+* @status:		status of the fence 0:active 1:signaled <0:error
+ * @flags:		fence_info flags
+ * @timestamp_ns:	timestamp of status change in nanoseconds
+ */
+struct sync_fence_info {
+	char	obj_name[32];
+	char	driver_name[32];
+	__s32	status;
+	__u32	flags;
+	__u64	timestamp_ns;
+};
+
+/**
+ * struct sync_file_info - data returned from fence info ioctl
+ * @name:	name of fence
+ * @status:	status of fence. 1: signaled 0:active <0:error
+ * @flags:	sync_file_info flags
+ * @num_fences	number of fences in the sync_file
+ * @pad:	padding for 64-bit alignment, should always be zero
+ * @sync_fence_info: pointer to array of structs sync_fence_info with all
+ *		 fences in the sync_file
+ */
+struct sync_file_info {
+	char	name[32];
+	__s32	status;
+	__u32	flags;
+	__u32	num_fences;
+	__u32	pad;
+
+	__u64	sync_fence_info;
+};
+
+#define SYNC_IOC_MAGIC		'>'
+
+/**
+ * Opcodes  0, 1 and 2 were burned during a API change to avoid users of the
+ * old API to get weird errors when trying to handling sync_files. The API
+ * change happened during the de-stage of the Sync Framework when there was
+ * no upstream users available.
+ */
+
+/**
+ * DOC: SYNC_IOC_MERGE - merge two fences
+ *
+ * Takes a struct sync_merge_data.  Creates a new fence containing copies of
+ * the sync_pts in both the calling fd and sync_merge_data.fd2.  Returns the
+ * new fence's fd in sync_merge_data.fence
+ */
+#define SYNC_IOC_MERGE		_IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
+
+/**
+ * DOC: SYNC_IOC_FENCE_INFO - get detailed information on a fence
+ *
+ * Takes a struct sync_file_info_data with extra space allocated for pt_info.
+ * Caller should write the size of the buffer into len.  On return, len is
+ * updated to reflect the total size of the sync_file_info_data including
+ * pt_info.
+ *
+ * pt_info is a buffer containing sync_pt_infos for every sync_pt in the fence.
+ * To iterate over the sync_pt_infos, use the sync_pt_info.len field.
+ */
+#define SYNC_IOC_FILE_INFO	_IOWR(SYNC_IOC_MAGIC, 4, struct sync_file_info)
+
+#endif /* _UAPI_LINUX_SYNC_H */
-- 
cgit v1.2.3


From d3feb406733544dbf0e239ef945a09decdceac88 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Thu, 14 Apr 2016 11:37:43 +0200
Subject: phy: bcm-ns-usb2: new driver for USB 2.0 PHY on Northstar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Northstar is a family of SoCs used in home routers. They have USB 2.0
and 3.0 controllers with PHYs that need to be properly initialized.
This driver provides PHY init support in a generic way and can be bound
with an EHCI controller driver.
There are (just a few) registers being defined in bcma header. It's
because DMU/CRU registers will be also needed in other drivers. We will
need them e.g. in PCIe controller/PHY driver and at some point probably
in clock driver for BCM53573 chipset. By using include/linux/bcma/ we
avoid code duplication.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 .../devicetree/bindings/phy/bcm-ns-usb2-phy.txt    |  21 ++++
 drivers/phy/Kconfig                                |   9 ++
 drivers/phy/Makefile                               |   1 +
 drivers/phy/phy-bcm-ns-usb2.c                      | 137 +++++++++++++++++++++
 include/linux/bcma/bcma.h                          |   1 +
 include/linux/bcma/bcma_driver_arm_c9.h            |  15 +++
 6 files changed, 184 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt
 create mode 100644 drivers/phy/phy-bcm-ns-usb2.c
 create mode 100644 include/linux/bcma/bcma_driver_arm_c9.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt b/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt
new file mode 100644
index 000000000000..a7aee9ea8926
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.txt
@@ -0,0 +1,21 @@
+Driver for Broadcom Northstar USB 2.0 PHY
+
+Required properties:
+- compatible: brcm,ns-usb2-phy
+- reg: iomem address range of DMU (Device Management Unit)
+- reg-names: "dmu", the only needed & supported reg right now
+- clocks: USB PHY reference clock
+- clock-names: "phy-ref-clk", the only needed & supported clock right now
+
+To initialize USB 2.0 PHY driver needs to setup PLL correctly. To do this it
+requires passing phandle to the USB PHY reference clock.
+
+Example:
+	usb2-phy {
+		compatible = "brcm,ns-usb2-phy";
+		reg = <0x1800c000 0x1000>;
+		reg-names = "dmu";
+		#phy-cells = <0>;
+		clocks = <&genpll BCM_NSP_GENPLL_USB_PHY_REF_CLK>;
+		clock-names = "phy-ref-clk";
+	};
diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index f6ff76ec89dc..f2b458fc0c89 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -15,6 +15,15 @@ config GENERIC_PHY
 	  phy users can obtain reference to the PHY. All the users of this
 	  framework should select this config.
 
+config PHY_BCM_NS_USB2
+	tristate "Broadcom Northstar USB 2.0 PHY Driver"
+	depends on ARCH_BCM_IPROC || COMPILE_TEST
+	depends on HAS_IOMEM && OF
+	select GENERIC_PHY
+	help
+	  Enable this to support Broadcom USB 2.0 PHY connected to the USB
+	  controller on Northstar family.
+
 config PHY_BERLIN_USB
 	tristate "Marvell Berlin USB PHY Driver"
 	depends on ARCH_BERLIN && RESET_CONTROLLER && HAS_IOMEM && OF
diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile
index f03fa1fdf322..0de09e13fdbc 100644
--- a/drivers/phy/Makefile
+++ b/drivers/phy/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_GENERIC_PHY)		+= phy-core.o
+obj-$(CONFIG_PHY_BCM_NS_USB2)		+= phy-bcm-ns-usb2.o
 obj-$(CONFIG_PHY_BERLIN_USB)		+= phy-berlin-usb.o
 obj-$(CONFIG_PHY_BERLIN_SATA)		+= phy-berlin-sata.o
 obj-$(CONFIG_PHY_DM816X_USB)		+= phy-dm816x-usb.o
diff --git a/drivers/phy/phy-bcm-ns-usb2.c b/drivers/phy/phy-bcm-ns-usb2.c
new file mode 100644
index 000000000000..95ab6b2a0de5
--- /dev/null
+++ b/drivers/phy/phy-bcm-ns-usb2.c
@@ -0,0 +1,137 @@
+/*
+ * Broadcom Northstar USB 2.0 PHY Driver
+ *
+ * Copyright (C) 2016 Rafał Miłecki <zajec5@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bcma/bcma.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct bcm_ns_usb2 {
+	struct device *dev;
+	struct clk *ref_clk;
+	struct phy *phy;
+	void __iomem *dmu;
+};
+
+static int bcm_ns_usb2_phy_init(struct phy *phy)
+{
+	struct bcm_ns_usb2 *usb2 = phy_get_drvdata(phy);
+	struct device *dev = usb2->dev;
+	void __iomem *dmu = usb2->dmu;
+	u32 ref_clk_rate, usb2ctl, usb_pll_ndiv, usb_pll_pdiv;
+	int err = 0;
+
+	err = clk_prepare_enable(usb2->ref_clk);
+	if (err < 0) {
+		dev_err(dev, "Failed to prepare ref clock: %d\n", err);
+		goto err_out;
+	}
+
+	ref_clk_rate = clk_get_rate(usb2->ref_clk);
+	if (!ref_clk_rate) {
+		dev_err(dev, "Failed to get ref clock rate\n");
+		err = -EINVAL;
+		goto err_clk_off;
+	}
+
+	usb2ctl = readl(dmu + BCMA_DMU_CRU_USB2_CONTROL);
+
+	if (usb2ctl & BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK) {
+		usb_pll_pdiv = usb2ctl;
+		usb_pll_pdiv &= BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK;
+		usb_pll_pdiv >>= BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_SHIFT;
+	} else {
+		usb_pll_pdiv = 1 << 3;
+	}
+
+	/* Calculate ndiv based on a solid 1920 MHz that is for USB2 PHY */
+	usb_pll_ndiv = (1920000000 * usb_pll_pdiv) / ref_clk_rate;
+
+	/* Unlock DMU PLL settings with some magic value */
+	writel(0x0000ea68, dmu + BCMA_DMU_CRU_CLKSET_KEY);
+
+	/* Write USB 2.0 PLL control setting */
+	usb2ctl &= ~BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_MASK;
+	usb2ctl |= usb_pll_ndiv << BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_SHIFT;
+	writel(usb2ctl, dmu + BCMA_DMU_CRU_USB2_CONTROL);
+
+	/* Lock DMU PLL settings */
+	writel(0x00000000, dmu + BCMA_DMU_CRU_CLKSET_KEY);
+
+err_clk_off:
+	clk_disable_unprepare(usb2->ref_clk);
+err_out:
+	return err;
+}
+
+static const struct phy_ops ops = {
+	.init		= bcm_ns_usb2_phy_init,
+	.owner		= THIS_MODULE,
+};
+
+static int bcm_ns_usb2_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct bcm_ns_usb2 *usb2;
+	struct resource *res;
+	struct phy_provider *phy_provider;
+
+	usb2 = devm_kzalloc(&pdev->dev, sizeof(*usb2), GFP_KERNEL);
+	if (!usb2)
+		return -ENOMEM;
+	usb2->dev = dev;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dmu");
+	usb2->dmu = devm_ioremap_resource(dev, res);
+	if (IS_ERR(usb2->dmu)) {
+		dev_err(dev, "Failed to map DMU regs\n");
+		return PTR_ERR(usb2->dmu);
+	}
+
+	usb2->ref_clk = devm_clk_get(dev, "phy-ref-clk");
+	if (IS_ERR(usb2->ref_clk)) {
+		dev_err(dev, "Clock not defined\n");
+		return PTR_ERR(usb2->ref_clk);
+	}
+
+	usb2->phy = devm_phy_create(dev, NULL, &ops);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	phy_set_drvdata(usb2->phy, usb2);
+	platform_set_drvdata(pdev, usb2);
+
+	phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
+	return PTR_ERR_OR_ZERO(phy_provider);
+}
+
+static const struct of_device_id bcm_ns_usb2_id_table[] = {
+	{ .compatible = "brcm,ns-usb2-phy", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, bcm_ns_usb2_id_table);
+
+static struct platform_driver bcm_ns_usb2_driver = {
+	.probe		= bcm_ns_usb2_probe,
+	.driver = {
+		.name = "bcm_ns_usb2",
+		.of_match_table = bcm_ns_usb2_id_table,
+	},
+};
+module_platform_driver(bcm_ns_usb2_driver);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 0367c63f5960..e6b41f42602b 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -4,6 +4,7 @@
 #include <linux/pci.h>
 #include <linux/mod_devicetable.h>
 
+#include <linux/bcma/bcma_driver_arm_c9.h>
 #include <linux/bcma/bcma_driver_chipcommon.h>
 #include <linux/bcma/bcma_driver_pci.h>
 #include <linux/bcma/bcma_driver_pcie2.h>
diff --git a/include/linux/bcma/bcma_driver_arm_c9.h b/include/linux/bcma/bcma_driver_arm_c9.h
new file mode 100644
index 000000000000..93bd73d670d5
--- /dev/null
+++ b/include/linux/bcma/bcma_driver_arm_c9.h
@@ -0,0 +1,15 @@
+#ifndef LINUX_BCMA_DRIVER_ARM_C9_H_
+#define LINUX_BCMA_DRIVER_ARM_C9_H_
+
+/* DMU (Device Management Unit) */
+#define BCMA_DMU_CRU_USB2_CONTROL			0x0164
+#define  BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_MASK	0x00000FFC
+#define  BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_NDIV_SHIFT	2
+#define  BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_MASK	0x00007000
+#define  BCMA_DMU_CRU_USB2_CONTROL_USB_PLL_PDIV_SHIFT	12
+#define BCMA_DMU_CRU_CLKSET_KEY				0x0180
+#define BCMA_DMU_CRU_STRAPS_CTRL			0x02A0
+#define  BCMA_DMU_CRU_STRAPS_CTRL_USB3			0x00000010
+#define  BCMA_DMU_CRU_STRAPS_CTRL_4BYTE			0x00008000
+
+#endif /* LINUX_BCMA_DRIVER_ARM_C9_H_ */
-- 
cgit v1.2.3


From 71f5c63c07e5be7abdce40891778ffbf3cec04f0 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Wed, 23 Mar 2016 12:09:18 +0100
Subject: phy: exynos-mipi-video: Add support for Exynos 5420 and 5433 SoCs

This patch adds support for MIPI DPHYs found in Exynos5420-compatible
(5420, 5422 and 5800) and Exynos5433 SoCs. Those SoCs differs from
earlier by different offset of MIPI DPHY registers in PMU controllers
(Exynos 5420-compatible case) or by moving MIPI DPHY reset registers to
separate system register controllers (Exynos 5433 case). In both case
also additional 5th PHY (MIPI CSIS 2) has been added.

Acked-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 .../devicetree/bindings/phy/samsung-phy.txt        |  18 ++-
 drivers/phy/phy-exynos-mipi-video.c                | 129 ++++++++++++++++++++-
 include/linux/mfd/syscon/exynos5-pmu.h             |   3 +
 3 files changed, 147 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/phy/samsung-phy.txt b/Documentation/devicetree/bindings/phy/samsung-phy.txt
index 0289d3b07853..9872ba8546bd 100644
--- a/Documentation/devicetree/bindings/phy/samsung-phy.txt
+++ b/Documentation/devicetree/bindings/phy/samsung-phy.txt
@@ -2,9 +2,20 @@ Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY
 -------------------------------------------------
 
 Required properties:
-- compatible : should be "samsung,s5pv210-mipi-video-phy";
+- compatible : should be one of the listed compatibles:
+	- "samsung,s5pv210-mipi-video-phy"
+	- "samsung,exynos5420-mipi-video-phy"
+	- "samsung,exynos5433-mipi-video-phy"
 - #phy-cells : from the generic phy bindings, must be 1;
-- syscon - phandle to the PMU system controller;
+
+In case of s5pv210 and exynos5420 compatible PHYs:
+- syscon - phandle to the PMU system controller
+
+In case of exynos5433 compatible PHY:
+ - samsung,pmu-syscon - phandle to the PMU system controller
+ - samsung,disp-sysreg - phandle to the DISP system registers controller
+ - samsung,cam0-sysreg - phandle to the CAM0 system registers controller
+ - samsung,cam1-sysreg - phandle to the CAM1 system registers controller
 
 For "samsung,s5pv210-mipi-video-phy" compatible PHYs the second cell in
 the PHY specifier identifies the PHY and its meaning is as follows:
@@ -12,6 +23,9 @@ the PHY specifier identifies the PHY and its meaning is as follows:
   1 - MIPI DSIM 0,
   2 - MIPI CSIS 1,
   3 - MIPI DSIM 1.
+"samsung,exynos5420-mipi-video-phy" and "samsung,exynos5433-mipi-video-phy"
+supports additional fifth PHY:
+  4 - MIPI CSIS 2.
 
 Samsung EXYNOS SoC series Display Port PHY
 -------------------------------------------------
diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c
index 3cb69e005f18..cc093ebfda94 100644
--- a/drivers/phy/phy-exynos-mipi-video.c
+++ b/drivers/phy/phy-exynos-mipi-video.c
@@ -1,7 +1,7 @@
 /*
  * Samsung S5P/EXYNOS SoC series MIPI CSIS/DSIM DPHY driver
  *
- * Copyright (C) 2013 Samsung Electronics Co., Ltd.
+ * Copyright (C) 2013,2016 Samsung Electronics Co., Ltd.
  * Author: Sylwester Nawrocki <s.nawrocki@samsung.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -13,6 +13,7 @@
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/mfd/syscon/exynos4-pmu.h>
+#include <linux/mfd/syscon/exynos5-pmu.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
@@ -28,11 +29,15 @@ enum exynos_mipi_phy_id {
 	EXYNOS_MIPI_PHY_ID_DSIM0,
 	EXYNOS_MIPI_PHY_ID_CSIS1,
 	EXYNOS_MIPI_PHY_ID_DSIM1,
+	EXYNOS_MIPI_PHY_ID_CSIS2,
 	EXYNOS_MIPI_PHYS_NUM
 };
 
 enum exynos_mipi_phy_regmap_id {
 	EXYNOS_MIPI_REGMAP_PMU,
+	EXYNOS_MIPI_REGMAP_DISP,
+	EXYNOS_MIPI_REGMAP_CAM0,
+	EXYNOS_MIPI_REGMAP_CAM1,
 	EXYNOS_MIPI_REGMAPS_NUM
 };
 
@@ -96,6 +101,122 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = {
 	},
 };
 
+static const struct mipi_phy_device_desc exynos5420_mipi_phy = {
+	.num_regmaps = 1,
+	.regmap_names = {"syscon"},
+	.num_phys = 5,
+	.phys = {
+		{
+			/* EXYNOS_MIPI_PHY_ID_CSIS0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = EXYNOS5_MIPI_PHY_S_RESETN,
+			.resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL,
+			.resetn_map = EXYNOS_MIPI_REGMAP_PMU,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_DSIM0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = EXYNOS5_MIPI_PHY_M_RESETN,
+			.resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL,
+			.resetn_map = EXYNOS_MIPI_REGMAP_PMU,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS1 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM1,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = EXYNOS5_MIPI_PHY_S_RESETN,
+			.resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL,
+			.resetn_map = EXYNOS_MIPI_REGMAP_PMU,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_DSIM1 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS1,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = EXYNOS5_MIPI_PHY_M_RESETN,
+			.resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL,
+			.resetn_map = EXYNOS_MIPI_REGMAP_PMU,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS2 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5420_MIPI_PHY2_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = EXYNOS5_MIPI_PHY_S_RESETN,
+			.resetn_reg = EXYNOS5420_MIPI_PHY2_CONTROL,
+			.resetn_map = EXYNOS_MIPI_REGMAP_PMU,
+		},
+	},
+};
+
+#define EXYNOS5433_SYSREG_DISP_MIPI_PHY		0x100C
+#define EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON	0x1014
+#define EXYNOS5433_SYSREG_CAM1_MIPI_DPHY_CON	0x1020
+
+static const struct mipi_phy_device_desc exynos5433_mipi_phy = {
+	.num_regmaps = 4,
+	.regmap_names = {
+		"samsung,pmu-syscon",
+		"samsung,disp-sysreg",
+		"samsung,cam0-sysreg",
+		"samsung,cam1-sysreg"
+	},
+	.num_phys = 5,
+	.phys = {
+		{
+			/* EXYNOS_MIPI_PHY_ID_CSIS0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(0),
+			.resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM0,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_DSIM0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(0),
+			.resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY,
+			.resetn_map = EXYNOS_MIPI_REGMAP_DISP,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS1 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(1),
+			.resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM0,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_DSIM1 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(1),
+			.resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY,
+			.resetn_map = EXYNOS_MIPI_REGMAP_DISP,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS2 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS5_PHY_ENABLE,
+			.enable_reg = EXYNOS5433_MIPI_PHY2_CONTROL,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(0),
+			.resetn_reg = EXYNOS5433_SYSREG_CAM1_MIPI_DPHY_CON,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM1,
+		},
+	},
+};
 
 struct exynos_mipi_video_phy {
 	struct regmap *regmaps[EXYNOS_MIPI_REGMAPS_NUM];
@@ -241,6 +362,12 @@ static const struct of_device_id exynos_mipi_video_phy_of_match[] = {
 	{
 		.compatible = "samsung,s5pv210-mipi-video-phy",
 		.data = &s5pv210_mipi_phy,
+	}, {
+		.compatible = "samsung,exynos5420-mipi-video-phy",
+		.data = &exynos5420_mipi_phy,
+	}, {
+		.compatible = "samsung,exynos5433-mipi-video-phy",
+		.data = &exynos5433_mipi_phy,
 	},
 	{ /* sentinel */ },
 };
diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h
index 9352adc95de6..76f30f940c70 100644
--- a/include/linux/mfd/syscon/exynos5-pmu.h
+++ b/include/linux/mfd/syscon/exynos5-pmu.h
@@ -38,6 +38,9 @@
 
 /* Exynos5433 specific register definitions */
 #define EXYNOS5433_USBHOST30_PHY_CONTROL	(0x728)
+#define EXYNOS5433_MIPI_PHY0_CONTROL		(0x710)
+#define EXYNOS5433_MIPI_PHY1_CONTROL		(0x714)
+#define EXYNOS5433_MIPI_PHY2_CONTROL		(0x718)
 
 #define EXYNOS5_PHY_ENABLE			BIT(0)
 
-- 
cgit v1.2.3


From 18900ca65a8553edc608b6c9d518eb31e6c09ba1 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:06:48 -0700
Subject: tty: Replace TTY_IO_ERROR bit tests with tty_io_error()

Abstract TTY_IO_ERROR status test treewide with tty_io_error().
NB: tty->flags uses atomic bit ops; replace non-atomic bit test
with test_bit().

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/ia64/hp/sim/simserial.c       | 2 +-
 drivers/char/pcmcia/synclink_cs.c  | 2 +-
 drivers/isdn/i4l/isdn_tty.c        | 6 +++---
 drivers/s390/char/tty3270.c        | 4 ++--
 drivers/staging/dgnc/dgnc_tty.c    | 2 +-
 drivers/tty/amiserial.c            | 6 +++---
 drivers/tty/mxser.c                | 7 +++----
 drivers/tty/pty.c                  | 2 +-
 drivers/tty/serial/crisv10.c       | 5 ++---
 drivers/tty/serial/serial_core.c   | 8 ++++----
 drivers/tty/synclink.c             | 4 ++--
 drivers/tty/synclink_gt.c          | 4 ++--
 drivers/tty/synclinkmp.c           | 4 ++--
 drivers/tty/tty_io.c               | 5 ++---
 drivers/tty/tty_port.c             | 2 +-
 include/linux/tty.h                | 5 +++++
 net/irda/ircomm/ircomm_tty.c       | 2 +-
 net/irda/ircomm/ircomm_tty_ioctl.c | 6 +++---
 18 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index e70cadec7ce6..21fd50def270 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -300,7 +300,7 @@ static int rs_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
 	    (cmd != TIOCMIWAIT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 22c27652e46a..825db423b7a8 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -2246,7 +2246,7 @@ static int mgslpc_ioctl(struct tty_struct *tty,
 
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCMIWAIT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 947d5c978b8f..f1edc0814120 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1351,7 +1351,7 @@ isdn_tty_tiocmget(struct tty_struct *tty)
 
 	if (isdn_tty_paranoia_check(info, tty->name, __func__))
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	mutex_lock(&modem_info_mutex);
@@ -1378,7 +1378,7 @@ isdn_tty_tiocmset(struct tty_struct *tty,
 
 	if (isdn_tty_paranoia_check(info, tty->name, __func__))
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 #ifdef ISDN_DEBUG_MODEM_IOCTL
@@ -1419,7 +1419,7 @@ isdn_tty_ioctl(struct tty_struct *tty, uint cmd, ulong arg)
 
 	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_ioctl"))
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 	switch (cmd) {
 	case TCSBRK:   /* SVID version: non-zero arg --> no break */
diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c
index e96fc7fd9498..080a9872c68f 100644
--- a/drivers/s390/char/tty3270.c
+++ b/drivers/s390/char/tty3270.c
@@ -1804,7 +1804,7 @@ static int tty3270_ioctl(struct tty_struct *tty, unsigned int cmd,
 	tp = tty->driver_data;
 	if (!tp)
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 	return kbd_ioctl(tp->kbd, cmd, arg);
 }
@@ -1818,7 +1818,7 @@ static long tty3270_compat_ioctl(struct tty_struct *tty,
 	tp = tty->driver_data;
 	if (!tp)
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 	return kbd_ioctl(tp->kbd, cmd, (unsigned long)compat_ptr(arg));
 }
diff --git a/drivers/staging/dgnc/dgnc_tty.c b/drivers/staging/dgnc/dgnc_tty.c
index bcd2bdfb9c8f..5c221593a0c6 100644
--- a/drivers/staging/dgnc/dgnc_tty.c
+++ b/drivers/staging/dgnc/dgnc_tty.c
@@ -1255,7 +1255,7 @@ static int dgnc_block_til_ready(struct tty_struct *tty,
 			if (file->f_flags & O_NONBLOCK)
 				break;
 
-			if (tty->flags & (1 << TTY_IO_ERROR)) {
+			if (tty_io_error(tty)) {
 				retval = -EIO;
 				break;
 			}
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index eacf4c9f3b29..183e98e84d09 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -1143,7 +1143,7 @@ static int rs_tiocmget(struct tty_struct *tty)
 
 	if (serial_paranoia_check(info, tty->name, "rs_ioctl"))
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	control = info->MCR;
@@ -1165,7 +1165,7 @@ static int rs_tiocmset(struct tty_struct *tty, unsigned int set,
 
 	if (serial_paranoia_check(info, tty->name, "rs_ioctl"))
 		return -ENODEV;
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	local_irq_save(flags);
@@ -1250,7 +1250,7 @@ static int rs_ioctl(struct tty_struct *tty,
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
 	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 2f12bb9f4336..f23c2a101688 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1334,7 +1334,7 @@ static int mxser_tiocmget(struct tty_struct *tty)
 
 	if (tty->index == MXSER_PORTS)
 		return -ENOIOCTLCMD;
-	if (test_bit(TTY_IO_ERROR, &tty->flags))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	control = info->MCR;
@@ -1361,7 +1361,7 @@ static int mxser_tiocmset(struct tty_struct *tty,
 
 	if (tty->index == MXSER_PORTS)
 		return -ENOIOCTLCMD;
-	if (test_bit(TTY_IO_ERROR, &tty->flags))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	spin_lock_irqsave(&info->slock, flags);
@@ -1715,8 +1715,7 @@ static int mxser_ioctl(struct tty_struct *tty,
 		return 0;
 	}
 
-	if (cmd != TIOCGSERIAL && cmd != TIOCMIWAIT &&
-			test_bit(TTY_IO_ERROR, &tty->flags))
+	if (cmd != TIOCGSERIAL && cmd != TIOCMIWAIT && tty_io_error(tty))
 		return -EIO;
 
 	switch (cmd) {
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 0058d9fbf931..a8a292fd564f 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -44,7 +44,7 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
 	if (tty->driver->subtype == PTY_TYPE_MASTER)
 		WARN_ON(tty->count > 1);
 	else {
-		if (test_bit(TTY_IO_ERROR, &tty->flags))
+		if (tty_io_error(tty))
 			return;
 		if (tty->count > 2)
 			return;
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index c0172bf54a9b..546990334815 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -3445,7 +3445,7 @@ rs_ioctl(struct tty_struct *tty,
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGWILD)  &&
 	    (cmd != TIOCSERSWILD) && (cmd != TIOCSERGSTRUCT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 			return -EIO;
 	}
 
@@ -3755,8 +3755,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 	 * If non-blocking mode is set, or the port is not enabled,
 	 * then make the check up front and then exit.
 	 */
-	if ((filp->f_flags & O_NONBLOCK) ||
-	    (tty->flags & (1 << TTY_IO_ERROR))) {
+	if ((filp->f_flags & O_NONBLOCK) || tty_io_error(tty)) {
 		info->port.flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index a126a603b083..67b395031347 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -969,7 +969,7 @@ static int uart_tiocmget(struct tty_struct *tty)
 	int result = -EIO;
 
 	mutex_lock(&port->mutex);
-	if (!(tty->flags & (1 << TTY_IO_ERROR))) {
+	if (!tty_io_error(tty)) {
 		result = uport->mctrl;
 		spin_lock_irq(&uport->lock);
 		result |= uport->ops->get_mctrl(uport);
@@ -989,7 +989,7 @@ uart_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
 	int ret = -EIO;
 
 	mutex_lock(&port->mutex);
-	if (!(tty->flags & (1 << TTY_IO_ERROR))) {
+	if (!tty_io_error(tty)) {
 		uart_update_mctrl(uport, set, clear);
 		ret = 0;
 	}
@@ -1238,7 +1238,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd,
 	if (ret != -ENOIOCTLCMD)
 		goto out;
 
-	if (tty->flags & (1 << TTY_IO_ERROR)) {
+	if (tty_io_error(tty)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -1257,7 +1257,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd,
 
 	mutex_lock(&port->mutex);
 
-	if (tty->flags & (1 << TTY_IO_ERROR)) {
+	if (tty_io_error(tty)) {
 		ret = -EIO;
 		goto out_up;
 	}
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index f5476e270734..8b2277223ee7 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -2972,7 +2972,7 @@ static int mgsl_ioctl(struct tty_struct *tty,
 
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCMIWAIT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
@@ -3270,7 +3270,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 		printk("%s(%d):block_til_ready on %s\n",
 			 __FILE__,__LINE__, tty->driver->name );
 
-	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index c0a2f5a1b1c2..1f7d6d9437e6 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -1032,7 +1032,7 @@ static int ioctl(struct tty_struct *tty,
 
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCMIWAIT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
@@ -3269,7 +3269,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 	DBGINFO(("%s block_til_ready\n", tty->driver->name));
 
-	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 90da0c712262..e93879944905 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -1261,7 +1261,7 @@ static int ioctl(struct tty_struct *tty,
 
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCMIWAIT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
@@ -3285,7 +3285,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		printk("%s(%d):%s block_til_ready()\n",
 			 __FILE__,__LINE__, tty->driver->name );
 
-	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
 		/* just verify that callout device is not active */
 		port->flags |= ASYNC_NORMAL_ACTIVE;
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 9b04d72e752e..320dc4da7162 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1070,7 +1070,7 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
 
 	if (tty_paranoia_check(tty, inode, "tty_read"))
 		return -EIO;
-	if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags)))
+	if (!tty || tty_io_error(tty))
 		return -EIO;
 
 	/* We want to wait for the line discipline to sort out in this
@@ -1245,8 +1245,7 @@ static ssize_t tty_write(struct file *file, const char __user *buf,
 
 	if (tty_paranoia_check(tty, file_inode(file), "tty_write"))
 		return -EIO;
-	if (!tty || !tty->ops->write ||
-		(test_bit(TTY_IO_ERROR, &tty->flags)))
+	if (!tty || !tty->ops->write ||	tty_io_error(tty))
 			return -EIO;
 	/* Short term debug to catch buggy drivers */
 	if (tty->ops->write_room == NULL)
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index dbcca30a54b1..9127c54b803e 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -364,7 +364,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 
 	/* if non-blocking mode is set we can pass directly to open unless
 	   the port has just hung up or is in another error state */
-	if (tty->flags & (1 << TTY_IO_ERROR)) {
+	if (tty_io_error(tty)) {
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 3b09f235db66..68d829bf93b8 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -360,6 +360,11 @@ static inline void tty_set_flow_change(struct tty_struct *tty, int val)
 	smp_mb();
 }
 
+static inline bool tty_io_error(struct tty_struct *tty)
+{
+	return test_bit(TTY_IO_ERROR, &tty->flags);
+}
+
 #ifdef CONFIG_TTY
 extern void console_init(void);
 extern void tty_kref_put(struct tty_struct *tty);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index da126ee6d218..840b82f760ba 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -280,7 +280,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 	 * If non-blocking mode is set, or the port is not enabled,
 	 * then make the check up front and then exit.
 	 */
-	if (test_bit(TTY_IO_ERROR, &tty->flags)) {
+	if (tty_io_error(tty)) {
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index d3687aaa23de..9beb011441fa 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -190,7 +190,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty)
 	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
 	unsigned int result;
 
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	result =  ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0)
@@ -213,7 +213,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty,
 {
 	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
 
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	IRDA_ASSERT(self != NULL, return -1;);
@@ -362,7 +362,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty,
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
 	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
-- 
cgit v1.2.3


From 97ef38b8210d7459d4cb51668cdf3983772ac6b7 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:11:36 -0700
Subject: tty: Replace TTY_THROTTLED bit tests with tty_throttled()

Abstract TTY_THROTTLED bit tests with tty_throttled().

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c      | 2 +-
 drivers/mmc/card/sdio_uart.c           | 2 +-
 drivers/net/usb/hso.c                  | 2 +-
 drivers/staging/fwserial/fwserial.c    | 2 +-
 drivers/staging/speakup/selection.c    | 2 +-
 drivers/tty/amiserial.c                | 2 +-
 drivers/tty/hvc/hvc_console.c          | 2 +-
 drivers/tty/hvc/hvcs.c                 | 2 +-
 drivers/tty/hvc/hvsi.c                 | 2 +-
 drivers/tty/moxa.c                     | 2 +-
 drivers/tty/nozomi.c                   | 2 +-
 drivers/tty/serial/serial_core.c       | 2 +-
 drivers/tty/synclink.c                 | 2 +-
 drivers/tty/synclink_gt.c              | 2 +-
 drivers/tty/synclinkmp.c               | 2 +-
 drivers/tty/tty_ioctl.c                | 4 ++--
 drivers/tty/vt/selection.c             | 2 +-
 drivers/usb/gadget/function/u_serial.c | 4 ++--
 drivers/usb/serial/digi_acceleport.c   | 3 +--
 include/linux/tty.h                    | 5 +++++
 net/irda/ircomm/ircomm_tty_ioctl.c     | 2 +-
 21 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 825db423b7a8..bcae5bb15751 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -2316,7 +2316,7 @@ static void mgslpc_set_termios(struct tty_struct *tty, struct ktermios *old_term
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
-		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			info->serial_signals |= SerialSignal_RTS;
 		spin_lock_irqsave(&info->lock, flags);
 		set_signals(info);
diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c
index 5415056f9aa5..5af6fb9a9ce2 100644
--- a/drivers/mmc/card/sdio_uart.c
+++ b/drivers/mmc/card/sdio_uart.c
@@ -895,7 +895,7 @@ static void sdio_uart_set_termios(struct tty_struct *tty,
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		unsigned int mask = TIOCM_DTR;
-		if (!(cflag & CRTSCTS) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!(cflag & CRTSCTS) || !tty_throttled(tty))
 			mask |= TIOCM_RTS;
 		sdio_uart_set_mctrl(port, mask);
 	}
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index 111d907e0c11..4b4458616693 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -2029,7 +2029,7 @@ static int put_rxbuf_data(struct urb *urb, struct hso_serial *serial)
 
 	tty = tty_port_tty_get(&serial->port);
 
-	if (tty && test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (tty && tty_throttled(tty)) {
 		tty_kref_put(tty);
 		return -1;
 	}
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index 9b23b5c95f5e..1f9389d8c152 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -1305,7 +1305,7 @@ static void fwtty_set_termios(struct tty_struct *tty, struct ktermios *old)
 	if ((baud == 0) && (old->c_cflag & CBAUD)) {
 		port->mctrl &= ~(TIOCM_DTR | TIOCM_RTS);
 	} else if ((baud != 0) && !(old->c_cflag & CBAUD)) {
-		if (C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (C_CRTSCTS(tty) || !tty_throttled(tty))
 			port->mctrl |= TIOCM_DTR | TIOCM_RTS;
 		else
 			port->mctrl |= TIOCM_DTR;
diff --git a/drivers/staging/speakup/selection.c b/drivers/staging/speakup/selection.c
index 41ef099b7aa6..0149edc1e0ae 100644
--- a/drivers/staging/speakup/selection.c
+++ b/drivers/staging/speakup/selection.c
@@ -150,7 +150,7 @@ static void __speakup_paste_selection(struct work_struct *work)
 	add_wait_queue(&vc->paste_wait, &wait);
 	while (sel_buffer && sel_buffer_lth > pasted) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (tty_throttled(tty)) {
 			schedule();
 			continue;
 		}
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 183e98e84d09..e68208eac322 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -1342,7 +1342,7 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		info->MCR |= SER_DTR;
-		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			info->MCR |= SER_RTS;
 		local_irq_save(flags);
 		rtsdtr_ctrl(info->MCR);
diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index e46d628998f5..209dad8c96a0 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -632,7 +632,7 @@ int hvc_poll(struct hvc_struct *hp)
 		goto bail;
 
 	/* Now check if we can get data (are we throttled ?) */
-	if (test_bit(TTY_THROTTLED, &tty->flags))
+	if (tty_throttled(tty))
 		goto throttled;
 
 	/* If we aren't notifier driven and aren't throttled, we always
diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c
index 5997b1731111..3c4d7c2b4ade 100644
--- a/drivers/tty/hvc/hvcs.c
+++ b/drivers/tty/hvc/hvcs.c
@@ -600,7 +600,7 @@ static int hvcs_io(struct hvcs_struct *hvcsd)
 
 	hvcs_try_write(hvcsd);
 
-	if (!tty || test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (!tty || tty_throttled(tty)) {
 		hvcsd->todo_mask &= ~(HVCS_READ_MASK);
 		goto bail;
 	} else if (!(hvcsd->todo_mask & (HVCS_READ_MASK)))
diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c
index a75146f600cb..96ce6bd1cc6f 100644
--- a/drivers/tty/hvc/hvsi.c
+++ b/drivers/tty/hvc/hvsi.c
@@ -509,7 +509,7 @@ static irqreturn_t hvsi_interrupt(int irq, void *arg)
 	}
 
 	spin_lock_irqsave(&hp->lock, flags);
-	if (tty && hp->n_throttle && !test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (tty && hp->n_throttle && !tty_throttled(tty)) {
 		/* we weren't hung up and we weren't throttled, so we can
 		 * deliver the rest now */
 		hvsi_send_overflow(hp);
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 92982d7c0489..ce521d3f58cb 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -1394,7 +1394,7 @@ static int moxa_poll_port(struct moxa_port *p, unsigned int handle,
 			tty_wakeup(tty);
 		}
 
-		if (inited && !test_bit(TTY_THROTTLED, &tty->flags) &&
+		if (inited && !tty_throttled(tty) &&
 				MoxaPortRxQueue(p) > 0) { /* RX */
 			MoxaPortReadData(p);
 			tty_schedule_flip(&p->port);
diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c
index 5cc80b80c82b..d6fd0e802ef5 100644
--- a/drivers/tty/nozomi.c
+++ b/drivers/tty/nozomi.c
@@ -826,7 +826,7 @@ static int receive_data(enum port_type index, struct nozomi *dc)
 	size = __le32_to_cpu(readl(addr));
 	/*  DBG1( "%d bytes port: %d", size, index); */
 
-	if (tty && test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (tty && tty_throttled(tty)) {
 		DBG1("No room in tty, don't read data, don't ack interrupt, "
 			"disable interrupt");
 
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 67b395031347..64a5c00d7468 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1350,7 +1350,7 @@ static void uart_set_termios(struct tty_struct *tty,
 	/* Handle transition away from B0 status */
 	else if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		unsigned int mask = TIOCM_DTR;
-		if (!(cflag & CRTSCTS) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!(cflag & CRTSCTS) || !tty_throttled(tty))
 			mask |= TIOCM_RTS;
 		uart_set_mctrl(uport, mask);
 	}
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index 8b2277223ee7..3768e5c71c0b 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -3049,7 +3049,7 @@ static void mgsl_set_termios(struct tty_struct *tty, struct ktermios *old_termio
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
-		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			info->serial_signals |= SerialSignal_RTS;
 		spin_lock_irqsave(&info->irq_spinlock,flags);
 	 	usc_set_serial_signals(info);
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 1f7d6d9437e6..ceeaeb703f51 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -784,7 +784,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->signals |= SerialSignal_DTR;
- 		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			info->signals |= SerialSignal_RTS;
 		spin_lock_irqsave(&info->lock,flags);
 	 	set_signals(info);
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index e93879944905..b0cce4b24e51 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -881,7 +881,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
- 		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			info->serial_signals |= SerialSignal_RTS;
 		spin_lock_irqsave(&info->lock,flags);
 	 	set_signals(info);
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 23bf5bb1d8bf..bf36ac9aee41 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -158,7 +158,7 @@ int tty_throttle_safe(struct tty_struct *tty)
 	int ret = 0;
 
 	mutex_lock(&tty->throttle_mutex);
-	if (!test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (!tty_throttled(tty)) {
 		if (tty->flow_change != TTY_THROTTLE_SAFE)
 			ret = 1;
 		else {
@@ -189,7 +189,7 @@ int tty_unthrottle_safe(struct tty_struct *tty)
 	int ret = 0;
 
 	mutex_lock(&tty->throttle_mutex);
-	if (test_bit(TTY_THROTTLED, &tty->flags)) {
+	if (tty_throttled(tty)) {
 		if (tty->flow_change != TTY_UNTHROTTLE_SAFE)
 			ret = 1;
 		else {
diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index 4dd9dd2270a0..368ce1803e8f 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c
@@ -354,7 +354,7 @@ int paste_selection(struct tty_struct *tty)
 	add_wait_queue(&vc->paste_wait, &wait);
 	while (sel_buffer && sel_buffer_lth > pasted) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (tty_throttled(tty)) {
 			schedule();
 			continue;
 		}
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index 6af145f2a99d..3580f198df8b 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -512,7 +512,7 @@ static void gs_rx_push(unsigned long _port)
 		req = list_first_entry(queue, struct usb_request, list);
 
 		/* leave data queued if tty was rx throttled */
-		if (tty && test_bit(TTY_THROTTLED, &tty->flags))
+		if (tty && tty_throttled(tty))
 			break;
 
 		switch (req->status) {
@@ -579,7 +579,7 @@ static void gs_rx_push(unsigned long _port)
 	 * from starving ... but it's not clear that case ever happens.
 	 */
 	if (!list_empty(queue) && tty) {
-		if (!test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (!tty_throttled(tty)) {
 			if (do_push)
 				tasklet_schedule(&port->push);
 			else
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 16e8e37b3b36..6a1df9e824ca 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -699,8 +699,7 @@ static void digi_set_termios(struct tty_struct *tty,
 			/* don't set RTS if using hardware flow control */
 			/* and throttling input */
 			modem_signals = TIOCM_DTR;
-			if (!C_CRTSCTS(tty) ||
-			    !test_bit(TTY_THROTTLED, &tty->flags))
+			if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 				modem_signals |= TIOCM_RTS;
 			digi_set_modem_signals(port, modem_signals, 1);
 		}
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 68d829bf93b8..89f9c91b40f5 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -365,6 +365,11 @@ static inline bool tty_io_error(struct tty_struct *tty)
 	return test_bit(TTY_IO_ERROR, &tty->flags);
 }
 
+static inline bool tty_throttled(struct tty_struct *tty)
+{
+	return test_bit(TTY_THROTTLED, &tty->flags);
+}
+
 #ifdef CONFIG_TTY
 extern void console_init(void);
 extern void tty_kref_put(struct tty_struct *tty);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 9beb011441fa..8d8fd28ff4d9 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -166,7 +166,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		self->settings.dte |= IRCOMM_DTR;
-		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			self->settings.dte |= IRCOMM_RTS;
 		ircomm_param_request(self, IRCOMM_DTE, TRUE);
 	}
-- 
cgit v1.2.3


From e4d38f334ad24f80229a8ebab26950de8e8f34d7 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:20 -0700
Subject: tty: Define ASYNC_ replacement bits

Prepare for relocating kernel private state bits out of tty_port::flags
field; tty_port::flags field is not atomic and can become corrupted
by concurrent updates. It also suffers from the complication of sharing
in a userspace-visible field which must be masked.

Define new tty_port::iflags field and new, substitute bit definitions
for the former ASYNC_* flags.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h            | 16 +++++++++++++++-
 include/uapi/linux/tty_flags.h |  9 ++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 89f9c91b40f5..4e0dbda05180 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -228,7 +228,8 @@ struct tty_port {
 	int			count;		/* Usage count */
 	wait_queue_head_t	open_wait;	/* Open waiters */
 	wait_queue_head_t	delta_msr_wait;	/* Modem status change */
-	unsigned long		flags;		/* TTY flags ASY_*/
+	unsigned long		flags;		/* User TTY flags ASYNC_ */
+	unsigned long		iflags;		/* Internal flags TTY_PORT_ */
 	unsigned char		console:1,	/* port is a console */
 				low_latency:1;	/* optional: tune for latency */
 	struct mutex		mutex;		/* Locking */
@@ -242,6 +243,19 @@ struct tty_port {
 	struct kref		kref;		/* Ref counter */
 };
 
+/* tty_port::iflags bits -- use atomic bit ops */
+#define TTY_PORT_INITIALIZED	0	/* device is initialized */
+#define TTY_PORT_SUSPENDED	1	/* device is suspended */
+#define TTY_PORT_ACTIVE		2	/* device is open */
+
+/*
+ * uart drivers: use the uart_port::status field and the UPSTAT_* defines
+ * for s/w-based flow control steering and carrier detection status
+ */
+#define TTY_PORT_CTS_FLOW	3	/* h/w flow control enabled */
+#define TTY_PORT_CHECK_CD	4	/* carrier detect enabled */
+
+
 /*
  * Where all of the state associated with a tty is kept while the tty
  * is open.  Since the termios state should be kept even if the tty
diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h
index 072e41e45ee2..8e1a4365259f 100644
--- a/include/uapi/linux/tty_flags.h
+++ b/include/uapi/linux/tty_flags.h
@@ -32,7 +32,12 @@
 #define ASYNCB_MAGIC_MULTIPLIER	16 /* Use special CLK or divisor */
 #define ASYNCB_LAST_USER	16
 
-/* Internal flags used only by kernel */
+/*
+ * Internal flags used only by kernel (read-only)
+ *
+ * WARNING: These flags are no longer used and have been superceded by the
+ *	    TTY_PORT_ flags in the iflags field (and not userspace-visible)
+ */
 #define ASYNCB_INITIALIZED	31 /* Serial port was initialized */
 #define ASYNCB_SUSPENDED	30 /* Serial port is suspended */
 #define ASYNCB_NORMAL_ACTIVE	29 /* Normal device is active */
@@ -44,6 +49,7 @@
 #define ASYNCB_CONS_FLOW	23 /* flow control for console  */
 #define ASYNCB_FIRST_KERNEL	22
 
+/* Masks */
 #define ASYNC_HUP_NOTIFY	(1U << ASYNCB_HUP_NOTIFY)
 #define ASYNC_SUSPENDED		(1U << ASYNCB_SUSPENDED)
 #define ASYNC_FOURPORT		(1U << ASYNCB_FOURPORT)
@@ -72,6 +78,7 @@
 #define ASYNC_SPD_WARP		(ASYNC_SPD_HI|ASYNC_SPD_SHI)
 #define ASYNC_SPD_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI)
 
+/* These flags are no longer used (and were always masked from userspace) */
 #define ASYNC_INITIALIZED	(1U << ASYNCB_INITIALIZED)
 #define ASYNC_NORMAL_ACTIVE	(1U << ASYNCB_NORMAL_ACTIVE)
 #define ASYNC_BOOT_AUTOCONF	(1U << ASYNCB_BOOT_AUTOCONF)
-- 
cgit v1.2.3


From 5604a98e2f95d6221852960a3363588f40d78e22 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:21 -0700
Subject: tty: Replace ASYNC_CTS_FLOW bit and update atomically

Replace ASYNC_CTS_FLOW bit in the tty_port::flags field with
TTY_PORT_CTS_FLOW bit in the tty_port::iflags field. Add
tty_port_set_cts_flow() helper to abstract the atomic bit ops.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c  |  5 +----
 drivers/tty/amiserial.c            |  6 ++----
 drivers/tty/cyclades.c             | 10 ++++------
 drivers/tty/isicom.c               |  6 ++----
 drivers/tty/mxser.c                |  4 +---
 drivers/tty/synclink.c             |  7 ++-----
 drivers/tty/synclink_gt.c          |  5 +----
 drivers/tty/synclinkmp.c           |  5 +----
 include/linux/tty.h                | 12 ++++++++++--
 net/irda/ircomm/ircomm_tty_ioctl.c |  3 +--
 10 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index bcae5bb15751..bdf41ac613dc 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1466,10 +1466,7 @@ static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty)
 	}
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
-	if (cflag & CRTSCTS)
-		info->port.flags |= ASYNC_CTS_FLOW;
-	else
-		info->port.flags &= ~ASYNC_CTS_FLOW;
+	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
 
 	if (cflag & CLOCAL)
 		info->port.flags &= ~ASYNC_CHECK_CD;
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index e68208eac322..92717b088959 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -727,11 +727,9 @@ static void change_speed(struct tty_struct *tty, struct serial_state *info,
 	info->IER &= ~UART_IER_MSI;
 	if (port->flags & ASYNC_HARDPPS_CD)
 		info->IER |= UART_IER_MSI;
-	if (cflag & CRTSCTS) {
-		port->flags |= ASYNC_CTS_FLOW;
+	tty_port_set_cts_flow(port, cflag & CRTSCTS);
+	if (cflag & CRTSCTS)
 		info->IER |= UART_IER_MSI;
-	} else
-		port->flags &= ~ASYNC_CTS_FLOW;
 	if (cflag & CLOCAL)
 		port->flags &= ~ASYNC_CHECK_CD;
 	else {
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index d67e542bab1c..1a12776ba24c 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -2083,13 +2083,11 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 			info->cor1 |= CyPARITY_NONE;
 
 		/* CTS flow control flag */
-		if (cflag & CRTSCTS) {
-			info->port.flags |= ASYNC_CTS_FLOW;
+		tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
+		if (cflag & CRTSCTS)
 			info->cor2 |= CyCtsAE;
-		} else {
-			info->port.flags &= ~ASYNC_CTS_FLOW;
+		else
 			info->cor2 &= ~CyCtsAE;
-		}
 		if (cflag & CLOCAL)
 			info->port.flags &= ~ASYNC_CHECK_CD;
 		else
@@ -2234,7 +2232,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 		}
 		/* As the HW flow control is done in firmware, the driver
 		   doesn't need to care about it */
-		info->port.flags &= ~ASYNC_CTS_FLOW;
+		tty_port_set_cts_flow(&info->port, 0);
 
 		/* XON/XOFF/XANY flow control flags */
 		sw_flow = 0;
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index 8bf67630018b..c5f06b54b9ca 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -765,11 +765,9 @@ static void isicom_config_port(struct tty_struct *tty)
 
 	/* flow control settings ...*/
 	flow_ctrl = 0;
-	port->port.flags &= ~ASYNC_CTS_FLOW;
-	if (C_CRTSCTS(tty)) {
-		port->port.flags |= ASYNC_CTS_FLOW;
+	tty_port_set_cts_flow(&port->port, C_CRTSCTS(tty));
+	if (C_CRTSCTS(tty))
 		flow_ctrl |= ISICOM_CTSRTS;
-	}
 	if (I_IXON(tty))
 		flow_ctrl |= ISICOM_RESPOND_XONXOFF;
 	if (I_IXOFF(tty))
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index f23c2a101688..8f3fdad37ac7 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -711,8 +711,8 @@ static int mxser_change_speed(struct tty_struct *tty,
 	/* CTS flow control flag and modem status interrupts */
 	info->IER &= ~UART_IER_MSI;
 	info->MCR &= ~UART_MCR_AFE;
+	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
 	if (cflag & CRTSCTS) {
-		info->port.flags |= ASYNC_CTS_FLOW;
 		info->IER |= UART_IER_MSI;
 		if ((info->type == PORT_16550A) || (info->board->chip_flag)) {
 			info->MCR |= UART_MCR_AFE;
@@ -744,8 +744,6 @@ static int mxser_change_speed(struct tty_struct *tty,
 				}
 			}
 		}
-	} else {
-		info->port.flags &= ~ASYNC_CTS_FLOW;
 	}
 	outb(info->MCR, info->ioaddr + UART_MCR);
 	if (cflag & CLOCAL) {
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index 3768e5c71c0b..0e4290183280 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -1966,11 +1966,8 @@ static void mgsl_change_params(struct mgsl_struct *info)
 	}
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
-	if (cflag & CRTSCTS)
-		info->port.flags |= ASYNC_CTS_FLOW;
-	else
-		info->port.flags &= ~ASYNC_CTS_FLOW;
-		
+	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
+
 	if (cflag & CLOCAL)
 		info->port.flags &= ~ASYNC_CHECK_CD;
 	else
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index ceeaeb703f51..5da69d30f816 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -2576,10 +2576,7 @@ static void change_params(struct slgt_info *info)
 	}
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
-	if (cflag & CRTSCTS)
-		info->port.flags |= ASYNC_CTS_FLOW;
-	else
-		info->port.flags &= ~ASYNC_CTS_FLOW;
+	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
 
 	if (cflag & CLOCAL)
 		info->port.flags &= ~ASYNC_CHECK_CD;
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index b0cce4b24e51..7a21491d0c0d 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -2813,10 +2813,7 @@ static void change_params(SLMP_INFO *info)
 	}
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
-	if (cflag & CRTSCTS)
-		info->port.flags |= ASYNC_CTS_FLOW;
-	else
-		info->port.flags &= ~ASYNC_CTS_FLOW;
+	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
 
 	if (cflag & CLOCAL)
 		info->port.flags &= ~ASYNC_CHECK_CD;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4e0dbda05180..989d755b0ae4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -255,7 +255,6 @@ struct tty_port {
 #define TTY_PORT_CTS_FLOW	3	/* h/w flow control enabled */
 #define TTY_PORT_CHECK_CD	4	/* carrier detect enabled */
 
-
 /*
  * Where all of the state associated with a tty is kept while the tty
  * is open.  Since the termios state should be kept even if the tty
@@ -561,9 +560,18 @@ static inline struct tty_port *tty_port_get(struct tty_port *port)
 /* If the cts flow control is enabled, return true. */
 static inline bool tty_port_cts_enabled(struct tty_port *port)
 {
-	return port->flags & ASYNC_CTS_FLOW;
+	return test_bit(TTY_PORT_CTS_FLOW, &port->iflags);
 }
 
+static inline void tty_port_set_cts_flow(struct tty_port *port, bool val)
+{
+	if (val)
+		set_bit(TTY_PORT_CTS_FLOW, &port->iflags);
+	else
+		clear_bit(TTY_PORT_CTS_FLOW, &port->iflags);
+}
+
+
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 8d8fd28ff4d9..1220973c7c43 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -86,15 +86,14 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
 	ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
 
 	/* CTS flow control flag and modem status interrupts */
+	tty_port_set_cts_flow(&self->port, cflag & CRTSCTS);
 	if (cflag & CRTSCTS) {
-		self->port.flags |= ASYNC_CTS_FLOW;
 		self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
 		/* This got me. Bummer. Jean II */
 		if (self->service_type == IRCOMM_3_WIRE_RAW)
 			net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n",
 					     __func__);
 	} else {
-		self->port.flags &= ~ASYNC_CTS_FLOW;
 		self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
 	}
 	if (cflag & CLOCAL)
-- 
cgit v1.2.3


From 807c8d81f4ec441241cafa3034c58df721fee869 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:22 -0700
Subject: tty: Replace ASYNC_NORMAL_ACTIVE bit and update atomically

Replace ASYNC_NORMAL_ACTIVE bit in the tty_port::flags field with
TTY_PORT_ACTIVE bit in the tty_port::iflags field. Introduce helpers
tty_port_set_active() and tty_port_active() to abstract atomic bit ops.

Extract state changes from port lock sections, as this usage is
broken and confused; the state transitions are protected by the
tty lock (which mutually excludes parallel open/close/hangup),
and no user tests the active state while holding the port lock.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/isdn/i4l/isdn_tty.c      | 20 +++++++++-----------
 drivers/tty/amiserial.c          |  2 +-
 drivers/tty/rocket.c             |  5 +++--
 drivers/tty/serial/crisv10.c     |  8 ++++----
 drivers/tty/serial/serial_core.c |  8 ++++----
 drivers/tty/synclink.c           |  6 +++---
 drivers/tty/synclink_gt.c        |  6 +++---
 drivers/tty/synclinkmp.c         |  6 +++---
 drivers/tty/tty_port.c           | 12 ++++++------
 include/linux/tty.h              | 12 ++++++++++++
 net/irda/ircomm/ircomm_tty.c     | 10 +++++-----
 11 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index f1edc0814120..d8468f3529a7 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1622,7 +1622,7 @@ isdn_tty_hangup(struct tty_struct *tty)
 		return;
 	isdn_tty_shutdown(info);
 	port->count = 0;
-	port->flags &= ~ASYNC_NORMAL_ACTIVE;
+	tty_port_set_active(port, 0);
 	port->tty = NULL;
 	wake_up_interruptible(&port->open_wait);
 }
@@ -1979,7 +1979,7 @@ isdn_tty_find_icall(int di, int ch, setup_parm *setup)
 #endif
 			if (
 #ifndef FIX_FILE_TRANSFER
-				(info->port.flags & ASYNC_NORMAL_ACTIVE) &&
+			    tty_port_active(&info->port) &&
 #endif
 				(info->isdn_driver == -1) &&
 				(info->isdn_channel == -1) &&
@@ -2018,8 +2018,6 @@ isdn_tty_find_icall(int di, int ch, setup_parm *setup)
 	return (wret == 2) ? 3 : 0;
 }
 
-#define TTY_IS_ACTIVE(info)	(info->port.flags & ASYNC_NORMAL_ACTIVE)
-
 int
 isdn_tty_stat_callback(int i, isdn_ctrl *c)
 {
@@ -2077,7 +2075,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 #ifdef ISDN_TTY_STAT_DEBUG
 			printk(KERN_DEBUG "tty_STAT_DCONN ttyI%d\n", info->line);
 #endif
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 				if (info->dialing == 1) {
 					info->dialing = 2;
 					return 1;
@@ -2088,7 +2086,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 #ifdef ISDN_TTY_STAT_DEBUG
 			printk(KERN_DEBUG "tty_STAT_DHUP ttyI%d\n", info->line);
 #endif
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 				if (info->dialing == 1)
 					isdn_tty_modem_result(RESULT_BUSY, info);
 				if (info->dialing > 1)
@@ -2118,7 +2116,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 			 * waiting for it and
 			 * set DCD-bit of its modem-status.
 			 */
-			if (TTY_IS_ACTIVE(info) ||
+			if (tty_port_active(&info->port) ||
 			    (info->port.blocked_open &&
 			     (info->emu.mdmreg[REG_DCD] & BIT_DCD))) {
 				info->msr |= UART_MSR_DCD;
@@ -2145,7 +2143,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 #ifdef ISDN_TTY_STAT_DEBUG
 			printk(KERN_DEBUG "tty_STAT_BHUP ttyI%d\n", info->line);
 #endif
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 #ifdef ISDN_DEBUG_MODEM_HUP
 				printk(KERN_DEBUG "Mhup in ISDN_STAT_BHUP\n");
 #endif
@@ -2157,7 +2155,7 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 #ifdef ISDN_TTY_STAT_DEBUG
 			printk(KERN_DEBUG "tty_STAT_NODCH ttyI%d\n", info->line);
 #endif
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 				if (info->dialing) {
 					info->dialing = 0;
 					info->last_l2 = -1;
@@ -2183,14 +2181,14 @@ isdn_tty_stat_callback(int i, isdn_ctrl *c)
 			return 1;
 #ifdef CONFIG_ISDN_TTY_FAX
 		case ISDN_STAT_FAXIND:
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 				isdn_tty_fax_command(info, c);
 			}
 			break;
 #endif
 #ifdef CONFIG_ISDN_AUDIO
 		case ISDN_STAT_AUDIO:
-			if (TTY_IS_ACTIVE(info)) {
+			if (tty_port_active(&info->port)) {
 				switch (c->parm.num[0]) {
 				case ISDN_AUDIO_DTMF:
 					if (info->vonline) {
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 92717b088959..80d61658efb0 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -1493,7 +1493,7 @@ static void rs_hangup(struct tty_struct *tty)
 	rs_flush_buffer(tty);
 	shutdown(tty, info);
 	info->tport.count = 0;
-	info->tport.flags &= ~ASYNC_NORMAL_ACTIVE;
+	tty_port_set_active(&info->tport, 0);
 	info->tport.tty = NULL;
 	wake_up_interruptible(&info->tport.open_wait);
 }
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index 0b802cdd70d0..eb8311b20782 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -1042,9 +1042,10 @@ static void rp_close(struct tty_struct *tty, struct file *filp)
 		}
 	}
 	spin_lock_irq(&port->lock);
-	info->port.flags &= ~(ASYNC_INITIALIZED | ASYNC_NORMAL_ACTIVE);
+	port->flags &= ~ASYNC_INITIALIZED;
 	tty->closing = 0;
 	spin_unlock_irq(&port->lock);
+	tty_port_set_active(port, 0);
 	mutex_unlock(&port->mutex);
 	tty_port_tty_set(port, NULL);
 
@@ -1624,7 +1625,7 @@ static int rp_write(struct tty_struct *tty,
 	/*  Write remaining data into the port's xmit_buf */
 	while (1) {
 		/* Hung up ? */
-		if (!test_bit(ASYNCB_NORMAL_ACTIVE, &info->port.flags))
+		if (!tty_port_active(&info->port))
 			goto end;
 		c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1);
 		c = min(c, XMIT_BUF_SIZE - info->xmit_head);
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 546990334815..92c8c628e00e 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -3648,8 +3648,8 @@ rs_close(struct tty_struct *tty, struct file * filp)
 			schedule_timeout_interruptible(info->port.close_delay);
 		wake_up_interruptible(&info->port.open_wait);
 	}
-	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
 	local_irq_restore(flags);
+	tty_port_set_active(&info->port, 0);
 
 	/* port closed */
 
@@ -3732,7 +3732,7 @@ rs_hangup(struct tty_struct *tty)
 	shutdown(info);
 	info->event = 0;
 	info->port.count = 0;
-	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
+	tty_port_set_active(&info->port, 0);
 	info->port.tty = NULL;
 	wake_up_interruptible(&info->port.open_wait);
 }
@@ -3756,7 +3756,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 	 * then make the check up front and then exit.
 	 */
 	if ((filp->f_flags & O_NONBLOCK) || tty_io_error(tty)) {
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(&info->port, 1);
 		return 0;
 	}
 
@@ -3825,7 +3825,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 #endif
 	if (retval)
 		return retval;
-	info->port.flags |= ASYNC_NORMAL_ACTIVE;
+	tty_port_set_active(&info->port, 1);
 	return 0;
 }
 
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 64a5c00d7468..2471380fb92e 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1418,12 +1418,12 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 		uart_change_pm(state, UART_PM_STATE_OFF);
 		spin_lock_irq(&port->lock);
 	}
+	spin_unlock_irq(&port->lock);
+	tty_port_set_active(port, 0);
 
 	/*
 	 * Wake up anyone trying to open this port.
 	 */
-	clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
-	spin_unlock_irq(&port->lock);
 	wake_up_interruptible(&port->open_wait);
 
 	mutex_unlock(&port->mutex);
@@ -1501,13 +1501,13 @@ static void uart_hangup(struct tty_struct *tty)
 	pr_debug("uart_hangup(%d)\n", tty->index);
 
 	mutex_lock(&port->mutex);
-	if (port->flags & ASYNC_NORMAL_ACTIVE) {
+	if (tty_port_active(port)) {
 		uart_flush_buffer(tty);
 		uart_shutdown(tty, state);
 		spin_lock_irqsave(&port->lock, flags);
 		port->count = 0;
-		clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
 		spin_unlock_irqrestore(&port->lock, flags);
+		tty_port_set_active(port, 0);
 		tty_port_tty_set(port, NULL);
 		if (!uart_console(state->uart_port))
 			uart_change_pm(state, UART_PM_STATE_OFF);
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index 0e4290183280..b55f8468cde5 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -3201,7 +3201,7 @@ static void mgsl_hangup(struct tty_struct *tty)
 	shutdown(info);
 	
 	info->port.count = 0;	
-	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
+	tty_port_set_active(&info->port, 0);
 	info->port.tty = NULL;
 
 	wake_up_interruptible(&info->port.open_wait);
@@ -3269,7 +3269,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 
 	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -3338,7 +3338,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 			 __FILE__,__LINE__, tty->driver->name, port->count );
 			 
 	if (!retval)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		
 	return retval;
 	
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 5da69d30f816..c76f546697dc 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -756,9 +756,9 @@ static void hangup(struct tty_struct *tty)
 
 	spin_lock_irqsave(&info->port.lock, flags);
 	info->port.count = 0;
-	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
 	info->port.tty = NULL;
 	spin_unlock_irqrestore(&info->port.lock, flags);
+	tty_port_set_active(&info->port, 0);
 	mutex_unlock(&info->port.mutex);
 
 	wake_up_interruptible(&info->port.open_wait);
@@ -3268,7 +3268,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -3325,7 +3325,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	port->blocked_open--;
 
 	if (!retval)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 
 	DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval));
 	return retval;
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 7a21491d0c0d..95eddc4d9eb8 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -849,9 +849,9 @@ static void hangup(struct tty_struct *tty)
 
 	spin_lock_irqsave(&info->port.lock, flags);
 	info->port.count = 0;
-	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
 	info->port.tty = NULL;
 	spin_unlock_irqrestore(&info->port.lock, flags);
+	tty_port_set_active(&info->port, 1);
 	mutex_unlock(&info->port.mutex);
 
 	wake_up_interruptible(&info->port.open_wait);
@@ -3285,7 +3285,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	if (filp->f_flags & O_NONBLOCK || tty_io_error(tty)) {
 		/* nonblock mode is set or port is not enabled */
 		/* just verify that callout device is not active */
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -3352,7 +3352,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 			 __FILE__,__LINE__, tty->driver->name, port->count );
 
 	if (!retval)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 
 	return retval;
 }
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 9127c54b803e..130c8cf520cb 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -236,12 +236,12 @@ void tty_port_hangup(struct tty_port *port)
 
 	spin_lock_irqsave(&port->lock, flags);
 	port->count = 0;
-	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	tty = port->tty;
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 	port->tty = NULL;
 	spin_unlock_irqrestore(&port->lock, flags);
+	tty_port_set_active(port, 0);
 	tty_port_shutdown(port, tty);
 	tty_kref_put(tty);
 	wake_up_interruptible(&port->open_wait);
@@ -365,14 +365,14 @@ int tty_port_block_til_ready(struct tty_port *port,
 	/* if non-blocking mode is set we can pass directly to open unless
 	   the port has just hung up or is in another error state */
 	if (tty_io_error(tty)) {
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 	if (filp->f_flags & O_NONBLOCK) {
 		/* Indicate we are open */
 		if (C_BAUD(tty))
 			tty_port_raise_dtr_rts(port);
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -430,9 +430,9 @@ int tty_port_block_til_ready(struct tty_port *port,
 	if (!tty_hung_up_p(filp))
 		port->count++;
 	port->blocked_open--;
-	if (retval == 0)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
 	spin_unlock_irqrestore(&port->lock, flags);
+	if (retval == 0)
+		tty_port_set_active(port, 1);
 	return retval;
 }
 EXPORT_SYMBOL(tty_port_block_til_ready);
@@ -514,8 +514,8 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 		spin_lock_irqsave(&port->lock, flags);
 		wake_up_interruptible(&port->open_wait);
 	}
-	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	spin_unlock_irqrestore(&port->lock, flags);
+	tty_port_set_active(port, 0);
 }
 EXPORT_SYMBOL(tty_port_close_end);
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 989d755b0ae4..dbeeb8666ae4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -571,6 +571,18 @@ static inline void tty_port_set_cts_flow(struct tty_port *port, bool val)
 		clear_bit(TTY_PORT_CTS_FLOW, &port->iflags);
 }
 
+static inline bool tty_port_active(struct tty_port *port)
+{
+	return test_bit(TTY_PORT_ACTIVE, &port->iflags);
+}
+
+static inline void tty_port_set_active(struct tty_port *port, bool val)
+{
+	if (val)
+		set_bit(TTY_PORT_ACTIVE, &port->iflags);
+	else
+		clear_bit(TTY_PORT_ACTIVE, &port->iflags);
+}
 
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 840b82f760ba..681fe0bfe558 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -281,7 +281,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 	 * then make the check up front and then exit.
 	 */
 	if (tty_io_error(tty)) {
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -289,7 +289,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 		/* nonblock mode is set */
 		if (C_BAUD(tty))
 			tty_port_raise_dtr_rts(port);
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
 		return 0;
 	}
@@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 		 __FILE__, __LINE__, tty->driver->name, port->count);
 
 	if (!retval)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 
 	return retval;
 }
@@ -925,7 +925,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
 	ircomm_tty_shutdown(self);
 
 	spin_lock_irqsave(&port->lock, flags);
-	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	if (port->tty) {
 		set_bit(TTY_IO_ERROR, &port->tty->flags);
 		tty_kref_put(port->tty);
@@ -933,6 +932,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
 	port->tty = NULL;
 	port->count = 0;
 	spin_unlock_irqrestore(&port->lock, flags);
+	tty_port_set_active(port, 0);
 
 	wake_up_interruptible(&port->open_wait);
 }
@@ -1267,7 +1267,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
+	if (tty_port_active(&self->port)) {
 		seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
 		sep = '|';
 	}
-- 
cgit v1.2.3


From 2d68655d15bc99981394f7caa769a14b03cac131 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:23 -0700
Subject: tty: Replace ASYNC_CHECK_CD and update atomically

Replace ASYNC_CHECK_CD bit in the tty_port::flags field with
TTY_PORT_CHECK_CD bit in the tty_port::iflags field. Introduce helpers
tty_port_set_check_carrier() and tty_port_check_carrier() to abstract
the atomic bit ops.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c   |  8 ++------
 drivers/isdn/i4l/isdn_tty.c         |  8 ++------
 drivers/tty/amiserial.c             |  9 +++------
 drivers/tty/cyclades.c              | 14 ++++----------
 drivers/tty/isicom.c                |  7 ++-----
 drivers/tty/mxser.c                 |  9 +++------
 drivers/tty/synclink.c              |  8 ++------
 drivers/tty/synclink_gt.c           |  8 ++------
 drivers/tty/synclinkmp.c            |  8 ++------
 include/linux/tty.h                 | 13 +++++++++++++
 net/irda/ircomm/ircomm_tty.c        |  4 ++--
 net/irda/ircomm/ircomm_tty_attach.c |  2 +-
 net/irda/ircomm/ircomm_tty_ioctl.c  |  5 +----
 13 files changed, 39 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index bdf41ac613dc..bf54f4e23b6f 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1101,7 +1101,7 @@ static void dcd_change(MGSLPC_INFO *info, struct tty_struct *tty)
 	wake_up_interruptible(&info->status_event_wait_q);
 	wake_up_interruptible(&info->event_wait_q);
 
-	if (info->port.flags & ASYNC_CHECK_CD) {
+	if (tty_port_check_carrier(&info->port)) {
 		if (debug_level >= DEBUG_LEVEL_ISR)
 			printk("%s CD now %s...", info->device_name,
 			       (info->serial_signals & SerialSignal_DCD) ? "on" : "off");
@@ -1467,11 +1467,7 @@ static void mgslpc_change_params(MGSLPC_INFO *info, struct tty_struct *tty)
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
 	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-
-	if (cflag & CLOCAL)
-		info->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		info->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 	/* process tty input control flags */
 
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index d8468f3529a7..023a350a8cd8 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1043,11 +1043,7 @@ isdn_tty_change_speed(modem_info *info)
 	if (!(cflag & PARODD))
 		cval |= UART_LCR_EPAR;
 
-	if (cflag & CLOCAL)
-		port->flags &= ~ASYNC_CHECK_CD;
-	else {
-		port->flags |= ASYNC_CHECK_CD;
-	}
+	tty_port_set_check_carrier(port, ~cflag & CLOCAL);
 }
 
 static int
@@ -2526,7 +2522,7 @@ isdn_tty_modem_result(int code, modem_info *info)
 		if (info->closing || (!info->port.tty))
 			return;
 
-		if (info->port.flags & ASYNC_CHECK_CD)
+		if (tty_port_check_carrier(&info->port))
 			tty_hangup(info->port.tty);
 	}
 }
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 80d61658efb0..b4ab97d56351 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -398,7 +398,7 @@ static void check_modem_status(struct serial_state *info)
 		wake_up_interruptible(&port->delta_msr_wait);
 	}
 
-	if ((port->flags & ASYNC_CHECK_CD) && (dstatus & SER_DCD)) {
+	if (tty_port_check_carrier(port) && (dstatus & SER_DCD)) {
 #if (defined(SERIAL_DEBUG_OPEN) || defined(SERIAL_DEBUG_INTR))
 		printk("ttyS%d CD now %s...", info->line,
 		       (!(status & SER_DCD)) ? "on" : "off");
@@ -730,12 +730,9 @@ static void change_speed(struct tty_struct *tty, struct serial_state *info,
 	tty_port_set_cts_flow(port, cflag & CRTSCTS);
 	if (cflag & CRTSCTS)
 		info->IER |= UART_IER_MSI;
-	if (cflag & CLOCAL)
-		port->flags &= ~ASYNC_CHECK_CD;
-	else {
-		port->flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(port, ~cflag & CLOCAL);
+	if (~cflag & CLOCAL)
 		info->IER |= UART_IER_MSI;
-	}
 	/* TBD:
 	 * Does clearing IER_MSI imply that we should disable the VBL interrupt ?
 	 */
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index 1a12776ba24c..9d1e19ba25cb 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -714,7 +714,7 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 		wake_up_interruptible(&info->port.delta_msr_wait);
 	}
 
-	if ((mdm_change & CyDCD) && (info->port.flags & ASYNC_CHECK_CD)) {
+	if ((mdm_change & CyDCD) && tty_port_check_carrier(&info->port)) {
 		if (mdm_status & CyDCD)
 			wake_up_interruptible(&info->port.open_wait);
 		else
@@ -1119,7 +1119,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 		case C_CM_MDCD:
 			info->icount.dcd++;
 			delta_count++;
-			if (info->port.flags & ASYNC_CHECK_CD) {
+			if (tty_port_check_carrier(&info->port)) {
 				u32 dcd = fw_ver > 241 ? param :
 					readl(&info->u.cyz.ch_ctrl->rs_status);
 				if (dcd & C_RS_DCD)
@@ -2088,10 +2088,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 			info->cor2 |= CyCtsAE;
 		else
 			info->cor2 &= ~CyCtsAE;
-		if (cflag & CLOCAL)
-			info->port.flags &= ~ASYNC_CHECK_CD;
-		else
-			info->port.flags |= ASYNC_CHECK_CD;
+		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 	 /***********************************************
 	    The hardware option, CyRtsAO, presents RTS when
@@ -2250,10 +2247,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 		}
 
 		/* CD sensitivity */
-		if (cflag & CLOCAL)
-			info->port.flags &= ~ASYNC_CHECK_CD;
-		else
-			info->port.flags |= ASYNC_CHECK_CD;
+		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 		if (baud == 0) {	/* baud rate is zero, turn off line */
 			cy_writel(&ch_ctrl->rs_control,
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index c5f06b54b9ca..0b2bae1b2d55 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -577,7 +577,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id)
 		header = inw(base);
 		switch (header & 0xff) {
 		case 0:	/* Change in EIA signals */
-			if (port->port.flags & ASYNC_CHECK_CD) {
+			if (tty_port_check_carrier(&port->port)) {
 				if (port->status & ISI_DCD) {
 					if (!(header & ISI_DCD)) {
 					/* Carrier has been lost  */
@@ -758,10 +758,7 @@ static void isicom_config_port(struct tty_struct *tty)
 		outw(channel_setup, base);
 		InterruptTheCard(base);
 	}
-	if (C_CLOCAL(tty))
-		port->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		port->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&port->port, !C_CLOCAL(tty));
 
 	/* flow control settings ...*/
 	flow_ctrl = 0;
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 8f3fdad37ac7..ab618ef3d171 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -746,12 +746,9 @@ static int mxser_change_speed(struct tty_struct *tty,
 		}
 	}
 	outb(info->MCR, info->ioaddr + UART_MCR);
-	if (cflag & CLOCAL) {
-		info->port.flags &= ~ASYNC_CHECK_CD;
-	} else {
-		info->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
+	if (~cflag & CLOCAL)
 		info->IER |= UART_IER_MSI;
-	}
 	outb(info->IER, info->ioaddr + UART_IER);
 
 	/*
@@ -824,7 +821,7 @@ static void mxser_check_modem_status(struct tty_struct *tty,
 	port->mon_data.modem_status = status;
 	wake_up_interruptible(&port->port.delta_msr_wait);
 
-	if ((port->port.flags & ASYNC_CHECK_CD) && (status & UART_MSR_DDCD)) {
+	if (tty_port_check_carrier(&port->port) && (status & UART_MSR_DDCD)) {
 		if (status & UART_MSR_DCD)
 			wake_up_interruptible(&port->port.open_wait);
 	}
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index b55f8468cde5..b67b54a800bb 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -1340,7 +1340,7 @@ static void mgsl_isr_io_pin( struct mgsl_struct *info )
 		wake_up_interruptible(&info->status_event_wait_q);
 		wake_up_interruptible(&info->event_wait_q);
 
-		if ( (info->port.flags & ASYNC_CHECK_CD) && 
+		if (tty_port_check_carrier(&info->port) &&
 		     (status & MISCSTATUS_DCD_LATCHED) ) {
 			if ( debug_level >= DEBUG_LEVEL_ISR )
 				printk("%s CD now %s...", info->device_name,
@@ -1967,11 +1967,7 @@ static void mgsl_change_params(struct mgsl_struct *info)
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
 	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-
-	if (cflag & CLOCAL)
-		info->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		info->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 	/* process tty input control flags */
 	
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index c76f546697dc..333652a8896a 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -2080,7 +2080,7 @@ static void dcd_change(struct slgt_info *info, unsigned short status)
 	wake_up_interruptible(&info->event_wait_q);
 	info->pending_bh |= BH_STATUS;
 
-	if (info->port.flags & ASYNC_CHECK_CD) {
+	if (tty_port_check_carrier(&info->port)) {
 		if (info->signals & SerialSignal_DCD)
 			wake_up_interruptible(&info->port.open_wait);
 		else {
@@ -2577,11 +2577,7 @@ static void change_params(struct slgt_info *info)
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
 	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-
-	if (cflag & CLOCAL)
-		info->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		info->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 	/* process tty input control flags */
 
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 95eddc4d9eb8..17bab5f5b858 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -2463,7 +2463,7 @@ static void isr_io_pin( SLMP_INFO *info, u16 status )
 		wake_up_interruptible(&info->status_event_wait_q);
 		wake_up_interruptible(&info->event_wait_q);
 
-		if ( (info->port.flags & ASYNC_CHECK_CD) &&
+		if (tty_port_check_carrier(&info->port) &&
 		     (status & MISCSTATUS_DCD_LATCHED) ) {
 			if ( debug_level >= DEBUG_LEVEL_ISR )
 				printk("%s CD now %s...", info->device_name,
@@ -2814,11 +2814,7 @@ static void change_params(SLMP_INFO *info)
 	info->timeout += HZ/50;		/* Add .02 seconds of slop */
 
 	tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-
-	if (cflag & CLOCAL)
-		info->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		info->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
 
 	/* process tty input control flags */
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index dbeeb8666ae4..4254dfb12fb1 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -584,6 +584,19 @@ static inline void tty_port_set_active(struct tty_port *port, bool val)
 		clear_bit(TTY_PORT_ACTIVE, &port->iflags);
 }
 
+static inline bool tty_port_check_carrier(struct tty_port *port)
+{
+	return test_bit(TTY_PORT_CHECK_CD, &port->iflags);
+}
+
+static inline void tty_port_set_check_carrier(struct tty_port *port, bool val)
+{
+	if (val)
+		set_bit(TTY_PORT_CHECK_CD, &port->iflags);
+	else
+		clear_bit(TTY_PORT_CHECK_CD, &port->iflags);
+}
+
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 681fe0bfe558..5b7ce599c709 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -999,7 +999,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
 	if (status & IRCOMM_DCE_DELTA_ANY) {
 		/*wake_up_interruptible(&self->delta_msr_wait);*/
 	}
-	if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+	if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) {
 		pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line,
 			 (status & IRCOMM_CD) ? "on" : "off");
 
@@ -1255,7 +1255,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_CTS_FLOW", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_CHECK_CD) {
+	if (tty_port_check_carrier(&self->port)) {
 		seq_printf(m, "%cASYNC_CHECK_CD", sep);
 		sep = '|';
 	}
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
index 61137f8b5293..0a411019c098 100644
--- a/net/irda/ircomm/ircomm_tty_attach.c
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
 		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
 		ircomm_tty_start_watchdog_timer(self, 3*HZ);
 
-		if (self->port.flags & ASYNC_CHECK_CD) {
+		if (tty_port_check_carrier(&self->port)) {
 			/* Drop carrier */
 			self->settings.dce = IRCOMM_DELTA_CD;
 			ircomm_tty_check_modem_status(self);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 1220973c7c43..e24724db36a2 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -96,10 +96,7 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
 	} else {
 		self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
 	}
-	if (cflag & CLOCAL)
-		self->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		self->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL);
 #if 0
 	/*
 	 * Set up parity check flag
-- 
cgit v1.2.3


From 80f02d5424301bf4df195d09b1a664f394435851 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:24 -0700
Subject: tty: Replace ASYNC_SUSPENDED bit and update atomically

Replace ASYNC_SUSPENDED bit in the tty_port::flags field with
TTY_PORT_SUSPENDED bit in the tty_port::iflags field. Introduce helpers
tty_port_set_suspended() and tty_port_suspended() to abstract
atomic bit ops.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/s390/char/con3215.c      | 12 ++++++------
 drivers/tty/serial/serial_core.c |  8 ++++----
 include/linux/tty.h              | 13 +++++++++++++
 3 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index e7e078b3c7e6..114fe2845270 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -289,7 +289,7 @@ static void raw3215_timeout(unsigned long __data)
 
 	spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
 	raw->flags &= ~RAW3215_TIMER_RUNS;
-	if (!(raw->port.flags & ASYNC_SUSPENDED)) {
+	if (!tty_port_suspended(&raw->port)) {
 		raw3215_mk_write_req(raw);
 		raw3215_start_io(raw);
 		if ((raw->queued_read || raw->queued_write) &&
@@ -312,7 +312,7 @@ static void raw3215_timeout(unsigned long __data)
 static inline void raw3215_try_io(struct raw3215_info *raw)
 {
 	if (!(raw->port.flags & ASYNC_INITIALIZED) ||
-			(raw->port.flags & ASYNC_SUSPENDED))
+	    tty_port_suspended(&raw->port))
 		return;
 	if (raw->queued_read != NULL)
 		raw3215_start_io(raw);
@@ -494,7 +494,7 @@ static void raw3215_make_room(struct raw3215_info *raw, unsigned int length)
 		/* While console is frozen for suspend we have no other
 		 * choice but to drop message from the buffer to make
 		 * room for even more messages. */
-		if (raw->port.flags & ASYNC_SUSPENDED) {
+		if (tty_port_suspended(&raw->port)) {
 			raw3215_drop_line(raw);
 			continue;
 		}
@@ -773,7 +773,7 @@ static int raw3215_pm_stop(struct ccw_device *cdev)
 	raw = dev_get_drvdata(&cdev->dev);
 	spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
 	raw3215_make_room(raw, RAW3215_BUFFER_SIZE);
-	raw->port.flags |= ASYNC_SUSPENDED;
+	tty_port_set_suspended(&raw->port, 1);
 	spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags);
 	return 0;
 }
@@ -786,7 +786,7 @@ static int raw3215_pm_start(struct ccw_device *cdev)
 	/* Allow I/O again and flush output buffer. */
 	raw = dev_get_drvdata(&cdev->dev);
 	spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
-	raw->port.flags &= ~ASYNC_SUSPENDED;
+	tty_port_set_suspended(&raw->port, 0);
 	raw->flags |= RAW3215_FLUSHING;
 	raw3215_try_io(raw);
 	raw->flags &= ~RAW3215_FLUSHING;
@@ -859,7 +859,7 @@ static void con3215_flush(void)
 	unsigned long flags;
 
 	raw = raw3215[0];  /* console 3215 is the first one */
-	if (raw->port.flags & ASYNC_SUSPENDED)
+	if (tty_port_suspended(&raw->port))
 		/* The console is still frozen for suspend. */
 		if (ccw_device_force_console(raw->cdev))
 			/* Forcing didn't work, no panic message .. */
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 2471380fb92e..933606777f45 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -249,7 +249,7 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 	 * a DCD drop (hangup) at just the right time.  Clear suspended bit so
 	 * we don't try to resume a port that has been shutdown.
 	 */
-	clear_bit(ASYNCB_SUSPENDED, &port->flags);
+	tty_port_set_suspended(port, 0);
 
 	/*
 	 * Free the transmit buffer page.
@@ -2007,7 +2007,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 		const struct uart_ops *ops = uport->ops;
 		int tries;
 
-		set_bit(ASYNCB_SUSPENDED, &port->flags);
+		tty_port_set_suspended(port, 1);
 		clear_bit(ASYNCB_INITIALIZED, &port->flags);
 
 		spin_lock_irq(&uport->lock);
@@ -2088,7 +2088,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 			console_start(uport->cons);
 	}
 
-	if (port->flags & ASYNC_SUSPENDED) {
+	if (tty_port_suspended(port)) {
 		const struct uart_ops *ops = uport->ops;
 		int ret;
 
@@ -2118,7 +2118,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 			}
 		}
 
-		clear_bit(ASYNCB_SUSPENDED, &port->flags);
+		tty_port_set_suspended(port, 0);
 	}
 
 	mutex_unlock(&port->mutex);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4254dfb12fb1..7ac5add66c00 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -597,6 +597,19 @@ static inline void tty_port_set_check_carrier(struct tty_port *port, bool val)
 		clear_bit(TTY_PORT_CHECK_CD, &port->iflags);
 }
 
+static inline bool tty_port_suspended(struct tty_port *port)
+{
+	return test_bit(TTY_PORT_SUSPENDED, &port->iflags);
+}
+
+static inline void tty_port_set_suspended(struct tty_port *port, bool val)
+{
+	if (val)
+		set_bit(TTY_PORT_SUSPENDED, &port->iflags);
+	else
+		clear_bit(TTY_PORT_SUSPENDED, &port->iflags);
+}
+
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
-- 
cgit v1.2.3


From d41861ca19c9e96f12a4f1ebbc8255d00909a232 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 17:53:25 -0700
Subject: tty: Replace ASYNC_INITIALIZED bit and update atomically

Replace ASYNC_INITIALIZED bit in the tty_port::flags field with
TTY_PORT_INITIALIZED bit in the tty_port::iflags field. Introduce helpers
tty_port_set_initialized() and tty_port_initialized() to abstract
atomic bit ops.

Note: the transforms for test_and_set_bit() and test_and_clear_bit()
are unnecessary as the state transitions are already mutually exclusive;
the tty lock prevents concurrent open/close/hangup.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c  | 12 +++++-----
 drivers/ipack/devices/ipoctal.c    |  5 ++---
 drivers/isdn/i4l/isdn_tty.c        | 10 ++++-----
 drivers/s390/char/con3215.c        | 12 +++++-----
 drivers/tty/amiserial.c            | 14 ++++++------
 drivers/tty/cyclades.c             | 14 ++++++------
 drivers/tty/isicom.c               |  6 ++---
 drivers/tty/moxa.c                 | 10 ++++-----
 drivers/tty/mxser.c                | 14 +++++-------
 drivers/tty/n_gsm.c                |  8 +++----
 drivers/tty/rocket.c               | 10 ++++-----
 drivers/tty/serial/crisv10.c       | 17 +++++++-------
 drivers/tty/serial/serial_core.c   | 24 +++++++++++---------
 drivers/tty/synclink.c             | 46 ++++++++++++++++++--------------------
 drivers/tty/synclink_gt.c          | 16 ++++++-------
 drivers/tty/synclinkmp.c           | 16 ++++++-------
 drivers/tty/tty_port.c             | 13 ++++++-----
 drivers/usb/class/cdc-acm.c        |  4 ++--
 drivers/usb/serial/console.c       |  4 ++--
 drivers/usb/serial/generic.c       |  6 ++---
 drivers/usb/serial/mxuport.c       |  6 ++---
 drivers/usb/serial/sierra.c        |  4 ++--
 drivers/usb/serial/usb-serial.c    |  2 +-
 drivers/usb/serial/usb_wwan.c      |  4 ++--
 include/linux/tty.h                | 13 +++++++++++
 net/irda/ircomm/ircomm_tty.c       | 15 +++++++------
 net/irda/ircomm/ircomm_tty_ioctl.c |  2 +-
 27 files changed, 157 insertions(+), 150 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index bf54f4e23b6f..345ca7c7ea74 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1272,7 +1272,7 @@ static int startup(MGSLPC_INFO * info, struct tty_struct *tty)
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):startup(%s)\n", __FILE__, __LINE__, info->device_name);
 
-	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
 		return 0;
 
 	if (!info->tx_buf) {
@@ -1311,7 +1311,7 @@ static int startup(MGSLPC_INFO * info, struct tty_struct *tty)
 	if (tty)
 		clear_bit(TTY_IO_ERROR, &tty->flags);
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 
 	return 0;
 }
@@ -1322,7 +1322,7 @@ static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
@@ -1361,7 +1361,7 @@ static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty)
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 0);
 }
 
 static void mgslpc_program_hw(MGSLPC_INFO *info, struct tty_struct *tty)
@@ -2338,7 +2338,7 @@ static void mgslpc_close(struct tty_struct *tty, struct file * filp)
 	if (tty_port_close_start(port, tty, filp) == 0)
 		goto cleanup;
 
-	if (port->flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(port))
 		mgslpc_wait_until_sent(tty, info->timeout);
 
 	mgslpc_flush_buffer(tty);
@@ -2371,7 +2371,7 @@ static void mgslpc_wait_until_sent(struct tty_struct *tty, int timeout)
 	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_wait_until_sent"))
 		return;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		goto exit;
 
 	orig_jiffies = jiffies;
diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c
index 035d5449227e..75dd15d66df6 100644
--- a/drivers/ipack/devices/ipoctal.c
+++ b/drivers/ipack/devices/ipoctal.c
@@ -629,8 +629,7 @@ static void ipoctal_hangup(struct tty_struct *tty)
 	tty_port_hangup(&channel->tty_port);
 
 	ipoctal_reset_channel(channel);
-
-	clear_bit(ASYNCB_INITIALIZED, &channel->tty_port.flags);
+	tty_port_set_initialized(&channel->tty_port, 0);
 	wake_up_interruptible(&channel->tty_port.open_wait);
 }
 
@@ -642,7 +641,7 @@ static void ipoctal_shutdown(struct tty_struct *tty)
 		return;
 
 	ipoctal_reset_channel(channel);
-	clear_bit(ASYNCB_INITIALIZED, &channel->tty_port.flags);
+	tty_port_set_initialized(&channel->tty_port, 0);
 }
 
 static void ipoctal_cleanup(struct tty_struct *tty)
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 023a350a8cd8..63eaa0a9f8a1 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1049,7 +1049,7 @@ isdn_tty_change_speed(modem_info *info)
 static int
 isdn_tty_startup(modem_info *info)
 {
-	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
 		return 0;
 	isdn_lock_drivers();
 #ifdef ISDN_DEBUG_MODEM_OPEN
@@ -1066,7 +1066,7 @@ isdn_tty_startup(modem_info *info)
 	 */
 	isdn_tty_change_speed(info);
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 	info->msr |= (UART_MSR_DSR | UART_MSR_CTS);
 	info->send_outstanding = 0;
 	return 0;
@@ -1079,7 +1079,7 @@ isdn_tty_startup(modem_info *info)
 static void
 isdn_tty_shutdown(modem_info *info)
 {
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 #ifdef ISDN_DEBUG_MODEM_OPEN
 	printk(KERN_DEBUG "Shutting down isdnmodem port %d ....\n", info->line);
@@ -1099,7 +1099,7 @@ isdn_tty_shutdown(modem_info *info)
 	if (info->port.tty)
 		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 0);
 }
 
 /* isdn_tty_write() is the main send-routine. It is called from the upper
@@ -1577,7 +1577,7 @@ isdn_tty_close(struct tty_struct *tty, struct file *filp)
 	 * interrupt driver to stop checking the data ready bit in the
 	 * line status register.
 	 */
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		tty_wait_until_sent(tty, 3000);	/* 30 seconds timeout */
 		/*
 		 * Before we drop DTR, make sure the UART transmitter
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 114fe2845270..931d10e86837 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -311,8 +311,7 @@ static void raw3215_timeout(unsigned long __data)
  */
 static inline void raw3215_try_io(struct raw3215_info *raw)
 {
-	if (!(raw->port.flags & ASYNC_INITIALIZED) ||
-	    tty_port_suspended(&raw->port))
+	if (!tty_port_initialized(&raw->port) || tty_port_suspended(&raw->port))
 		return;
 	if (raw->queued_read != NULL)
 		raw3215_start_io(raw);
@@ -616,10 +615,10 @@ static int raw3215_startup(struct raw3215_info *raw)
 {
 	unsigned long flags;
 
-	if (raw->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&raw->port))
 		return 0;
 	raw->line_pos = 0;
-	raw->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&raw->port, 1);
 	spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
 	raw3215_try_io(raw);
 	spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags);
@@ -635,8 +634,7 @@ static void raw3215_shutdown(struct raw3215_info *raw)
 	DECLARE_WAITQUEUE(wait, current);
 	unsigned long flags;
 
-	if (!(raw->port.flags & ASYNC_INITIALIZED) ||
-	    (raw->flags & RAW3215_FIXED))
+	if (!tty_port_initialized(&raw->port) || (raw->flags & RAW3215_FIXED))
 		return;
 	/* Wait for outstanding requests, then free irq */
 	spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
@@ -650,7 +648,7 @@ static void raw3215_shutdown(struct raw3215_info *raw)
 		spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
 		remove_wait_queue(&raw->empty_wait, &wait);
 		set_current_state(TASK_RUNNING);
-		raw->port.flags &= ~ASYNC_INITIALIZED;
+		tty_port_set_initialized(&raw->port, 1);
 	}
 	spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags);
 }
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index b4ab97d56351..208f573495dc 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -525,7 +525,7 @@ static int startup(struct tty_struct *tty, struct serial_state *info)
 
 	local_irq_save(flags);
 
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		free_page(page);
 		goto errout;
 	}
@@ -586,7 +586,7 @@ static int startup(struct tty_struct *tty, struct serial_state *info)
 	 */
 	change_speed(tty, info, NULL);
 
-	port->flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(port, 1);
 	local_irq_restore(flags);
 	return 0;
 
@@ -604,7 +604,7 @@ static void shutdown(struct tty_struct *tty, struct serial_state *info)
 	unsigned long	flags;
 	struct serial_state *state;
 
-	if (!(info->tport.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->tport))
 		return;
 
 	state = info;
@@ -645,7 +645,7 @@ static void shutdown(struct tty_struct *tty, struct serial_state *info)
 
 	set_bit(TTY_IO_ERROR, &tty->flags);
 
-	info->tport.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->tport, 0);
 	local_irq_restore(flags);
 }
 
@@ -1084,7 +1084,7 @@ static int set_serial_info(struct tty_struct *tty, struct serial_state *state,
 	port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
 check_and_exit:
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		if (change_spd) {
 			if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI)
 				tty->alt_speed = 57600;
@@ -1390,7 +1390,7 @@ static void rs_close(struct tty_struct *tty, struct file * filp)
 	 * line status register.
 	 */
 	state->read_status_mask &= ~UART_LSR_DR;
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 	        /* disable receive interrupts */
 	        custom.intena = IF_RBF;
 		mb();
@@ -1538,7 +1538,7 @@ static inline void line_info(struct seq_file *m, int line,
 
 	local_irq_save(flags);
 	status = ciab.pra;
-	control = (state->tport.flags & ASYNC_INITIALIZED) ? state->MCR : status;
+	control = tty_port_initialized(&state->tport) ? state->MCR : status;
 	local_irq_restore(flags);
 
 	stat_buf[0] = 0;
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index 9d1e19ba25cb..3840d6b421c4 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -1279,7 +1279,7 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 
 	spin_lock_irqsave(&card->card_lock, flags);
 
-	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
 		goto errout;
 
 	if (!info->type) {
@@ -1364,7 +1364,7 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 		/* enable send, recv, modem !!! */
 	}
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 
 	clear_bit(TTY_IO_ERROR, &tty->flags);
 	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
@@ -1424,7 +1424,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 	struct cyclades_card *card;
 	unsigned long flags;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 
 	card = info->card;
@@ -1448,7 +1448,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 		   some later date (after testing)!!! */
 
 		set_bit(TTY_IO_ERROR, &tty->flags);
-		info->port.flags &= ~ASYNC_INITIALIZED;
+		tty_port_set_initialized(&info->port, 0);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	} else {
 #ifdef CY_DEBUG_OPEN
@@ -1473,7 +1473,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 			tty_port_lower_dtr_rts(&info->port);
 
 		set_bit(TTY_IO_ERROR, &tty->flags);
-		info->port.flags &= ~ASYNC_INITIALIZED;
+		tty_port_set_initialized(&info->port, 0);
 
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	}
@@ -1711,7 +1711,7 @@ static void cy_do_close(struct tty_port *port)
 		/* Stop accepting input */
 		cyy_writeb(info, CyCAR, channel & 0x03);
 		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyRxData);
-		if (info->port.flags & ASYNC_INITIALIZED) {
+		if (tty_port_initialized(&info->port)) {
 			/* Waiting for on-board buffers to be empty before
 			   closing the port */
 			spin_unlock_irqrestore(&card->card_lock, flags);
@@ -2334,7 +2334,7 @@ cy_set_serial_info(struct cyclades_port *info, struct tty_struct *tty,
 	info->port.closing_wait = new_serial.closing_wait * HZ / 100;
 
 check_and_exit:
-	if (info->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&info->port)) {
 		cy_set_line_char(info, tty);
 		ret = 0;
 	} else {
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index 0b2bae1b2d55..b70187b46d9d 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -438,8 +438,8 @@ static void isicom_tx(unsigned long _data)
 
 	for (; count > 0; count--, port++) {
 		/* port not active or tx disabled to force flow control */
-		if (!(port->port.flags & ASYNC_INITIALIZED) ||
-				!(port->status & ISI_TXOK))
+		if (!tty_port_initialized(&port->port) ||
+			!(port->status & ISI_TXOK))
 			continue;
 
 		txcount = min_t(short, TX_SIZE, port->xmit_cnt);
@@ -553,7 +553,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id)
 		return IRQ_HANDLED;
 	}
 	port = card->ports + channel;
-	if (!(port->port.flags & ASYNC_INITIALIZED)) {
+	if (!tty_port_initialized(&port->port)) {
 		outw(0x0000, base+0x04); /* enable interrupts */
 		spin_unlock(&card->card_lock);
 		return IRQ_HANDLED;
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index ce521d3f58cb..60d37b225589 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -912,7 +912,7 @@ static void moxa_board_deinit(struct moxa_board_conf *brd)
 
 	/* pci hot-un-plug support */
 	for (a = 0; a < brd->numPorts; a++)
-		if (brd->ports[a].port.flags & ASYNC_INITIALIZED)
+		if (tty_port_initialized(&brd->ports[a].port))
 			tty_port_tty_hangup(&brd->ports[a].port, false);
 
 	for (a = 0; a < MAX_PORTS_PER_BOARD; a++)
@@ -921,7 +921,7 @@ static void moxa_board_deinit(struct moxa_board_conf *brd)
 	while (1) {
 		opened = 0;
 		for (a = 0; a < brd->numPorts; a++)
-			if (brd->ports[a].port.flags & ASYNC_INITIALIZED)
+			if (tty_port_initialized(&brd->ports[a].port))
 				opened++;
 		mutex_unlock(&moxa_openlock);
 		if (!opened)
@@ -1192,13 +1192,13 @@ static int moxa_open(struct tty_struct *tty, struct file *filp)
 	tty->driver_data = ch;
 	tty_port_tty_set(&ch->port, tty);
 	mutex_lock(&ch->port.mutex);
-	if (!(ch->port.flags & ASYNC_INITIALIZED)) {
+	if (!tty_port_initialized(&ch->port)) {
 		ch->statusflags = 0;
 		moxa_set_tty_param(tty, &tty->termios);
 		MoxaPortLineCtrl(ch, 1, 1);
 		MoxaPortEnable(ch);
 		MoxaSetFifo(ch, ch->type == PORT_16550A);
-		ch->port.flags |= ASYNC_INITIALIZED;
+		tty_port_set_initialized(&ch->port, 1);
 	}
 	mutex_unlock(&ch->port.mutex);
 	mutex_unlock(&moxa_openlock);
@@ -1379,7 +1379,7 @@ static int moxa_poll_port(struct moxa_port *p, unsigned int handle,
 {
 	struct tty_struct *tty = tty_port_tty_get(&p->port);
 	void __iomem *ofsAddr;
-	unsigned int inited = p->port.flags & ASYNC_INITIALIZED;
+	unsigned int inited = tty_port_initialized(&p->port);
 	u16 intr;
 
 	if (tty) {
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index ab618ef3d171..7e8c27bf1ac8 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1081,12 +1081,10 @@ static void mxser_close(struct tty_struct *tty, struct file *filp)
 	mutex_lock(&port->mutex);
 	mxser_close_port(port);
 	mxser_flush_buffer(tty);
-	if (test_bit(ASYNCB_INITIALIZED, &port->flags)) {
-		if (C_HUPCL(tty))
-			tty_port_lower_dtr_rts(port);
-	}
+	if (tty_port_initialized(port) && C_HUPCL(tty))
+		tty_port_lower_dtr_rts(port);
 	mxser_shutdown_port(port);
-	clear_bit(ASYNCB_INITIALIZED, &port->flags);
+	tty_port_set_initialized(port, 0);
 	mutex_unlock(&port->mutex);
 	info->closing = 0;
 	/* Right now the tty_port set is done outside of the close_end helper
@@ -1282,7 +1280,7 @@ static int mxser_set_serial_info(struct tty_struct *tty,
 
 	process_txrx_fifo(info);
 
-	if (test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (tty_port_initialized(port)) {
 		if (flags != (port->flags & ASYNC_SPD_MASK)) {
 			spin_lock_irqsave(&info->slock, sl_flags);
 			mxser_change_speed(tty, NULL);
@@ -1291,7 +1289,7 @@ static int mxser_set_serial_info(struct tty_struct *tty,
 	} else {
 		retval = mxser_activate(port, tty);
 		if (retval == 0)
-			set_bit(ASYNCB_INITIALIZED, &port->flags);
+			tty_port_set_initialized(port, 1);
 	}
 	return retval;
 }
@@ -2251,7 +2249,7 @@ static irqreturn_t mxser_interrupt(int irq, void *dev_id)
 				iir &= MOXA_MUST_IIR_MASK;
 				tty = tty_port_tty_get(&port->port);
 				if (!tty || port->closing ||
-				    !(port->port.flags & ASYNC_INITIALIZED)) {
+				    !tty_port_initialized(&port->port)) {
 					status = inb(port->ioaddr + UART_LSR);
 					outb(0x27, port->ioaddr + UART_FCR);
 					inb(port->ioaddr + UART_MSR);
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 365dfd8bc42b..9f7a7bbff57d 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2949,7 +2949,7 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp)
 	dlci->modem_rx = 0;
 	/* We could in theory open and close before we wait - eg if we get
 	   a DM straight back. This is ok as that will have caused a hangup */
-	set_bit(ASYNCB_INITIALIZED, &port->flags);
+	tty_port_set_initialized(port, 1);
 	/* Start sending off SABM messages */
 	gsm_dlci_begin_open(dlci);
 	/* And wait for virtual carrier */
@@ -2972,10 +2972,8 @@ static void gsmtty_close(struct tty_struct *tty, struct file *filp)
 	if (tty_port_close_start(&dlci->port, tty, filp) == 0)
 		return;
 	gsm_dlci_begin_close(dlci);
-	if (test_bit(ASYNCB_INITIALIZED, &dlci->port.flags)) {
-		if (C_HUPCL(tty))
-			tty_port_lower_dtr_rts(&dlci->port);
-	}
+	if (tty_port_initialized(&dlci->port) && C_HUPCL(tty))
+		tty_port_lower_dtr_rts(&dlci->port);
 	tty_port_close_end(&dlci->port, tty);
 	tty_port_tty_set(&dlci->port, NULL);
 	return;
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index eb8311b20782..7f3b1db88061 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -495,7 +495,7 @@ static void rp_handle_port(struct r_port *info)
 	if (!info)
 		return;
 
-	if ((info->port.flags & ASYNC_INITIALIZED) == 0) {
+	if (!tty_port_initialized(&info->port)) {
 		printk(KERN_WARNING "rp: WARNING: rp_handle_port called with "
 				"info->flags & NOT_INIT\n");
 		return;
@@ -920,7 +920,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (!tty_port_initialized(port)) {
 		cp = &info->channel;
 		sSetRxTrigger(cp, TRIG_1);
 		if (sGetChanStatus(cp) & CD_ACT)
@@ -944,7 +944,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 		sEnRxFIFO(cp);
 		sEnTransmit(cp);
 
-		set_bit(ASYNCB_INITIALIZED, &info->port.flags);
+		tty_port_set_initialized(&info->port, 1);
 
 		/*
 		 * Set up the tty->alt_speed kludge
@@ -1042,9 +1042,9 @@ static void rp_close(struct tty_struct *tty, struct file *filp)
 		}
 	}
 	spin_lock_irq(&port->lock);
-	port->flags &= ~ASYNC_INITIALIZED;
 	tty->closing = 0;
 	spin_unlock_irq(&port->lock);
+	tty_port_set_initialized(port, 0);
 	tty_port_set_active(port, 0);
 	mutex_unlock(&port->mutex);
 	tty_port_tty_set(port, NULL);
@@ -1513,7 +1513,7 @@ static void rp_hangup(struct tty_struct *tty)
 	sDisCTSFlowCtl(cp);
 	sDisTxSoftFlowCtl(cp);
 	sClrTxXOFF(cp);
-	clear_bit(ASYNCB_INITIALIZED, &info->port.flags);
+	tty_port_set_initialized(&info->port, 0);
 
 	wake_up_interruptible(&info->port.open_wait);
 }
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 92c8c628e00e..315c84979b18 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -2599,7 +2599,7 @@ startup(struct e100_serial * info)
 
 	/* if it was already initialized, skip this */
 
-	if (info->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&info->port)) {
 		local_irq_restore(flags);
 		free_page(xmit_page);
 		return 0;
@@ -2703,7 +2703,7 @@ startup(struct e100_serial * info)
 	e100_rts(info, 1);
 	e100_dtr(info, 1);
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 
 	local_irq_restore(flags);
 	return 0;
@@ -2745,7 +2745,7 @@ shutdown(struct e100_serial * info)
 		info->tr_running = 0;
 	}
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 
 #ifdef SERIAL_DEBUG_OPEN
@@ -2776,7 +2776,7 @@ shutdown(struct e100_serial * info)
 	if (info->port.tty)
 		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 0);
 	local_irq_restore(flags);
 }
 
@@ -3273,9 +3273,9 @@ set_serial_info(struct e100_serial *info,
 	info->port.low_latency = (info->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
  check_and_exit:
-	if (info->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&info->port))
 		change_speed(info);
-	} else
+	else
 		retval = startup(info);
 	return retval;
 }
@@ -3628,7 +3628,7 @@ rs_close(struct tty_struct *tty, struct file * filp)
 	e100_disable_rx(info);
 	e100_disable_rx_irq(info);
 
-	if (info->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&info->port)) {
 		/*
 		 * Before we drop DTR, make sure the UART transmitter
 		 * has completely drained; this is especially
@@ -3787,8 +3787,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 		e100_dtr(info, 1);
 		local_irq_restore(flags);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) ||
-		    !(info->port.flags & ASYNC_INITIALIZED)) {
+		if (tty_hung_up_p(filp) || !tty_port_initialized(&info->port)) {
 #ifdef SERIAL_DO_RESTART
 			if (info->port.flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 933606777f45..0c48051db172 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -196,7 +196,7 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state,
 	struct tty_port *port = &state->port;
 	int retval;
 
-	if (port->flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(port))
 		return 0;
 
 	/*
@@ -207,7 +207,7 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state,
 
 	retval = uart_port_startup(tty, state, init_hw);
 	if (!retval) {
-		set_bit(ASYNCB_INITIALIZED, &port->flags);
+		tty_port_set_initialized(port, 1);
 		clear_bit(TTY_IO_ERROR, &tty->flags);
 	} else if (retval > 0)
 		retval = 0;
@@ -231,7 +231,9 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
-	if (test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (tty_port_initialized(port)) {
+		tty_port_set_initialized(port, 0);
+
 		/*
 		 * Turn off DTR and RTS early.
 		 */
@@ -886,7 +888,7 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
 	retval = 0;
 	if (uport->type == PORT_UNKNOWN)
 		goto exit;
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		if (((old_flags ^ uport->flags) & UPF_SPD_MASK) ||
 		    old_custom_divisor != uport->custom_divisor) {
 			/*
@@ -1390,7 +1392,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 * At this point, we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts.
 	 */
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		spin_lock_irq(&uport->lock);
 		uport->ops->stop_rx(uport);
 		spin_unlock_irq(&uport->lock);
@@ -2003,12 +2005,12 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 
 	uport->suspended = 1;
 
-	if (port->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(port)) {
 		const struct uart_ops *ops = uport->ops;
 		int tries;
 
 		tty_port_set_suspended(port, 1);
-		clear_bit(ASYNCB_INITIALIZED, &port->flags);
+		tty_port_set_initialized(port, 0);
 
 		spin_lock_irq(&uport->lock);
 		ops->stop_tx(uport);
@@ -2107,7 +2109,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 				ops->set_mctrl(uport, uport->mctrl);
 				ops->start_tx(uport);
 				spin_unlock_irq(&uport->lock);
-				set_bit(ASYNCB_INITIALIZED, &port->flags);
+				tty_port_set_initialized(port, 1);
 			} else {
 				/*
 				 * Failed to resume - maybe hardware went away?
@@ -2248,10 +2250,10 @@ static int uart_poll_init(struct tty_driver *driver, int line, char *options)
 		ret = 0;
 		mutex_lock(&tport->mutex);
 		/*
-		 * We don't set ASYNCB_INITIALIZED as we only initialized the
-		 * hw, e.g. state->xmit is still uninitialized.
+		 * We don't set initialized as we only initialized the hw,
+		 * e.g. state->xmit is still uninitialized.
 		 */
-		if (!test_bit(ASYNCB_INITIALIZED, &tport->flags))
+		if (!tty_port_initialized(tport))
 			ret = port->ops->poll_init(port);
 		mutex_unlock(&tport->mutex);
 		if (ret)
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index b67b54a800bb..bc4bc1ff775e 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -1749,13 +1749,13 @@ static irqreturn_t mgsl_interrupt(int dummy, void *dev_id)
 static int startup(struct mgsl_struct * info)
 {
 	int retval = 0;
-	
+
 	if ( debug_level >= DEBUG_LEVEL_INFO )
 		printk("%s(%d):mgsl_startup(%s)\n",__FILE__,__LINE__,info->device_name);
-		
-	if (info->port.flags & ASYNC_INITIALIZED)
+
+	if (tty_port_initialized(&info->port))
 		return 0;
-	
+
 	if (!info->xmit_buf) {
 		/* allocate a page of memory for a transmit buffer */
 		info->xmit_buf = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -1788,14 +1788,13 @@ static int startup(struct mgsl_struct * info)
 
 	/* program hardware for current parameters */
 	mgsl_change_params(info);
-	
+
 	if (info->port.tty)
 		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags |= ASYNC_INITIALIZED;
-	
+	tty_port_set_initialized(&info->port, 1);
+
 	return 0;
-	
 }	/* end of startup() */
 
 /* shutdown()
@@ -1808,8 +1807,8 @@ static int startup(struct mgsl_struct * info)
 static void shutdown(struct mgsl_struct * info)
 {
 	unsigned long flags;
-	
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+
+	if (!tty_port_initialized(&info->port))
 		return;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
@@ -1853,13 +1852,12 @@ static void shutdown(struct mgsl_struct * info)
 
 	spin_unlock_irqrestore(&info->irq_spinlock,flags);
 
-	mgsl_release_resources(info);	
-	
+	mgsl_release_resources(info);
+
 	if (info->port.tty)
 		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
-	
+	tty_port_set_initialized(&info->port, 0);
 }	/* end of shutdown() */
 
 static void mgsl_program_hw(struct mgsl_struct *info)
@@ -3084,7 +3082,7 @@ static void mgsl_close(struct tty_struct *tty, struct file * filp)
 		goto cleanup;
 
 	mutex_lock(&info->port.mutex);
- 	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
  		mgsl_wait_until_sent(tty, info->timeout);
 	mgsl_flush_buffer(tty);
 	tty_ldisc_flush(tty);
@@ -3122,15 +3120,15 @@ static void mgsl_wait_until_sent(struct tty_struct *tty, int timeout)
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):mgsl_wait_until_sent(%s) entry\n",
 			 __FILE__,__LINE__, info->device_name );
-      
+
 	if (mgsl_paranoia_check(info, tty->name, "mgsl_wait_until_sent"))
 		return;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		goto exit;
-	 
+
 	orig_jiffies = jiffies;
-      
+
 	/* Set check interval to 1/5 of estimated time to
 	 * send a character, and make it at least 1. The check
 	 * interval should also be less than the timeout.
@@ -3290,14 +3288,14 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 	port->count--;
 	spin_unlock_irqrestore(&info->irq_spinlock, flags);
 	port->blocked_open++;
-	
+
 	while (1) {
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
-		
+
 		set_current_state(TASK_INTERRUPTIBLE);
-		
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 333652a8896a..82c98b820335 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -726,7 +726,7 @@ static void close(struct tty_struct *tty, struct file *filp)
 		goto cleanup;
 
 	mutex_lock(&info->port.mutex);
- 	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
  		wait_until_sent(tty, info->timeout);
 	flush_buffer(tty);
 	tty_ldisc_flush(tty);
@@ -893,7 +893,7 @@ static void wait_until_sent(struct tty_struct *tty, int timeout)
 	if (sanity_check(info, tty->name, "wait_until_sent"))
 		return;
 	DBGINFO(("%s wait_until_sent entry\n", info->device_name));
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		goto exit;
 
 	orig_jiffies = jiffies;
@@ -2421,7 +2421,7 @@ static int startup(struct slgt_info *info)
 {
 	DBGINFO(("%s startup\n", info->device_name));
 
-	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
 		return 0;
 
 	if (!info->tx_buf) {
@@ -2442,7 +2442,7 @@ static int startup(struct slgt_info *info)
 	if (info->port.tty)
 		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 
 	return 0;
 }
@@ -2454,7 +2454,7 @@ static void shutdown(struct slgt_info *info)
 {
 	unsigned long flags;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 
 	DBGINFO(("%s shutdown\n", info->device_name));
@@ -2489,7 +2489,7 @@ static void shutdown(struct slgt_info *info)
 	if (info->port.tty)
 		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 0);
 }
 
 static void program_hw(struct slgt_info *info)
@@ -3287,12 +3287,12 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	port->blocked_open++;
 
 	while (1) {
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 17bab5f5b858..6dcfc2089373 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -812,7 +812,7 @@ static void close(struct tty_struct *tty, struct file *filp)
 		goto cleanup;
 
 	mutex_lock(&info->port.mutex);
- 	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
  		wait_until_sent(tty, info->timeout);
 
 	flush_buffer(tty);
@@ -1061,7 +1061,7 @@ static void wait_until_sent(struct tty_struct *tty, int timeout)
 	if (sanity_check(info, tty->name, "wait_until_sent"))
 		return;
 
-	if (!test_bit(ASYNCB_INITIALIZED, &info->port.flags))
+	if (!tty_port_initialized(&info->port))
 		goto exit;
 
 	orig_jiffies = jiffies;
@@ -2636,7 +2636,7 @@ static int startup(SLMP_INFO * info)
 	if ( debug_level >= DEBUG_LEVEL_INFO )
 		printk("%s(%d):%s tx_releaseup()\n",__FILE__,__LINE__,info->device_name);
 
-	if (info->port.flags & ASYNC_INITIALIZED)
+	if (tty_port_initialized(&info->port))
 		return 0;
 
 	if (!info->tx_buf) {
@@ -2662,7 +2662,7 @@ static int startup(SLMP_INFO * info)
 	if (info->port.tty)
 		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags |= ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 1);
 
 	return 0;
 }
@@ -2673,7 +2673,7 @@ static void shutdown(SLMP_INFO * info)
 {
 	unsigned long flags;
 
-	if (!(info->port.flags & ASYNC_INITIALIZED))
+	if (!tty_port_initialized(&info->port))
 		return;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
@@ -2705,7 +2705,7 @@ static void shutdown(SLMP_INFO * info)
 	if (info->port.tty)
 		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
 
-	info->port.flags &= ~ASYNC_INITIALIZED;
+	tty_port_set_initialized(&info->port, 0);
 }
 
 static void program_hw(SLMP_INFO *info)
@@ -3308,12 +3308,12 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	port->blocked_open++;
 
 	while (1) {
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 130c8cf520cb..c3f9d93ba227 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -204,7 +204,8 @@ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty)
 	if (port->console)
 		goto out;
 
-	if (test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (tty_port_initialized(port)) {
+		tty_port_set_initialized(port, 0);
 		/*
 		 * Drop DTR/RTS if HUPCL is set. This causes any attached
 		 * modem to hang up the line.
@@ -393,13 +394,13 @@ int tty_port_block_til_ready(struct tty_port *port,
 
 	while (1) {
 		/* Indicate we are open */
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
 
 		prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
 		/* Check for a hangup or uninitialised port.
 							Return accordingly */
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
@@ -480,7 +481,7 @@ int tty_port_close_start(struct tty_port *port,
 
 	tty->closing = 1;
 
-	if (test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (tty_port_initialized(port)) {
 		/* Don't block on a stalled port, just pull the chain */
 		if (tty->flow_stopped)
 			tty_driver_flush_buffer(tty);
@@ -578,7 +579,7 @@ int tty_port_open(struct tty_port *port, struct tty_struct *tty,
 
 	mutex_lock(&port->mutex);
 
-	if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+	if (!tty_port_initialized(port)) {
 		clear_bit(TTY_IO_ERROR, &tty->flags);
 		if (port->ops->activate) {
 			int retval = port->ops->activate(port, tty);
@@ -587,7 +588,7 @@ int tty_port_open(struct tty_port *port, struct tty_struct *tty,
 				return retval;
 			}
 		}
-		set_bit(ASYNCB_INITIALIZED, &port->flags);
+		tty_port_set_initialized(port, 1);
 	}
 	mutex_unlock(&port->mutex);
 	return tty_port_block_til_ready(port, tty, filp);
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index a6c4a1b895bd..94a14f5dc4d4 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1680,7 +1680,7 @@ static int acm_resume(struct usb_interface *intf)
 	if (--acm->susp_count)
 		goto out;
 
-	if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags)) {
+	if (tty_port_initialized(&acm->port)) {
 		rv = usb_submit_urb(acm->ctrlurb, GFP_ATOMIC);
 
 		for (;;) {
@@ -1710,7 +1710,7 @@ static int acm_reset_resume(struct usb_interface *intf)
 {
 	struct acm *acm = usb_get_intfdata(intf);
 
-	if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags))
+	if (tty_port_initialized(&acm->port))
 		tty_port_tty_hangup(&acm->port, false);
 
 	return acm_resume(intf);
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index a66b01bb1fa1..8967715fe6fc 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -127,7 +127,7 @@ static int usb_console_setup(struct console *co, char *options)
 	info->port = port;
 
 	++port->port.count;
-	if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) {
+	if (!tty_port_initialized(&port->port)) {
 		if (serial->type->set_termios) {
 			/*
 			 * allocate a fake tty so the driver can initialize
@@ -168,7 +168,7 @@ static int usb_console_setup(struct console *co, char *options)
 			tty_port_tty_set(&port->port, NULL);
 			tty_kref_put(tty);
 		}
-		set_bit(ASYNCB_INITIALIZED, &port->port.flags);
+		tty_port_set_initialized(&port->port, 1);
 	}
 	/* Now that any required fake tty operations are completed restore
 	 * the tty port count */
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 54e170dd3dad..ae8c0365abd6 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -473,7 +473,7 @@ static bool usb_serial_generic_msr_changed(struct tty_struct *tty,
 	 * Use tty-port initialised flag to detect all hangups including the
 	 * one generated at USB-device disconnect.
 	 */
-	if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+	if (!tty_port_initialized(&port->port))
 		return true;
 
 	spin_lock_irqsave(&port->lock, flags);
@@ -503,7 +503,7 @@ int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg)
 
 	ret = wait_event_interruptible(port->port.delta_msr_wait,
 			usb_serial_generic_msr_changed(tty, arg, &cnow));
-	if (!ret && !test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+	if (!ret && !tty_port_initialized(&port->port))
 		ret = -EIO;
 
 	return ret;
@@ -606,7 +606,7 @@ int usb_serial_generic_resume(struct usb_serial *serial)
 
 	for (i = 0; i < serial->num_ports; i++) {
 		port = serial->port[i];
-		if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+		if (!tty_port_initialized(&port->port))
 			continue;
 
 		if (port->bulk_in_size) {
diff --git a/drivers/usb/serial/mxuport.c b/drivers/usb/serial/mxuport.c
index 31a8b47f1ac6..3722d6c1ba77 100644
--- a/drivers/usb/serial/mxuport.c
+++ b/drivers/usb/serial/mxuport.c
@@ -503,7 +503,7 @@ static void mxuport_process_read_urb_demux_data(struct urb *urb)
 			return;
 		}
 
-		if (test_bit(ASYNCB_INITIALIZED, &demux_port->port.flags)) {
+		if (tty_port_initialized(&demux_port->port)) {
 			ch = data + HEADER_SIZE;
 			mxuport_process_read_urb_data(demux_port, ch, rcv_len);
 		} else {
@@ -544,7 +544,7 @@ static void mxuport_process_read_urb_demux_event(struct urb *urb)
 		}
 
 		demux_port = serial->port[rcv_port];
-		if (test_bit(ASYNCB_INITIALIZED, &demux_port->port.flags)) {
+		if (tty_port_initialized(&demux_port->port)) {
 			ch = data + HEADER_SIZE;
 			rcv_event = get_unaligned_be16(data + 2);
 			mxuport_process_read_urb_event(demux_port, ch,
@@ -1339,7 +1339,7 @@ static int mxuport_resume(struct usb_serial *serial)
 
 	for (i = 0; i < serial->num_ports; i++) {
 		port = serial->port[i];
-		if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+		if (!tty_port_initialized(&port->port))
 			continue;
 
 		r = usb_serial_generic_write_start(port, GFP_NOIO);
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 07d1ecd564f7..e1994e264cc0 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -776,7 +776,7 @@ static void sierra_close(struct usb_serial_port *port)
 
 	/*
 	 * Need to take susp_lock to make sure port is not already being
-	 * resumed, but no need to hold it due to ASYNC_INITIALIZED.
+	 * resumed, but no need to hold it due to initialized
 	 */
 	spin_lock_irq(&intfdata->susp_lock);
 	if (--intfdata->open_ports == 0)
@@ -1039,7 +1039,7 @@ static int sierra_resume(struct usb_serial *serial)
 	for (i = 0; i < serial->num_ports; i++) {
 		port = serial->port[i];
 
-		if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+		if (!tty_port_initialized(&port->port))
 			continue;
 
 		err = sierra_submit_delayed_urbs(port);
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 46f1f13b41f1..3f253aec0c16 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -254,7 +254,7 @@ static int serial_open(struct tty_struct *tty, struct file *filp)
  *
  * Shut down a USB serial port. Serialized against activate by the
  * tport mutex and kept to matching open/close pairs
- * of calls by the ASYNCB_INITIALIZED flag.
+ * of calls by the initialized flag.
  *
  * Not called if tty is console.
  */
diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index be9cb61b4d19..3dfdfc81254b 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c
@@ -464,7 +464,7 @@ void usb_wwan_close(struct usb_serial_port *port)
 
 	/*
 	 * Need to take susp_lock to make sure port is not already being
-	 * resumed, but no need to hold it due to ASYNC_INITIALIZED.
+	 * resumed, but no need to hold it due to initialized
 	 */
 	spin_lock_irq(&intfdata->susp_lock);
 	if (--intfdata->open_ports == 0)
@@ -682,7 +682,7 @@ int usb_wwan_resume(struct usb_serial *serial)
 	for (i = 0; i < serial->num_ports; i++) {
 		port = serial->port[i];
 
-		if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags))
+		if (!tty_port_initialized(&port->port))
 			continue;
 
 		portdata = usb_get_serial_port_data(port);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 7ac5add66c00..bf1bcdb01df0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -610,6 +610,19 @@ static inline void tty_port_set_suspended(struct tty_port *port, bool val)
 		clear_bit(TTY_PORT_SUSPENDED, &port->iflags);
 }
 
+static inline bool tty_port_initialized(struct tty_port *port)
+{
+	return test_bit(TTY_PORT_INITIALIZED, &port->iflags);
+}
+
+static inline void tty_port_set_initialized(struct tty_port *port, bool val)
+{
+	if (val)
+		set_bit(TTY_PORT_INITIALIZED, &port->iflags);
+	else
+		clear_bit(TTY_PORT_INITIALIZED, &port->iflags);
+}
+
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 5b7ce599c709..873c4b707d6a 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
 	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
 
 	/* Check if already open */
-	if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) {
+	if (tty_port_initialized(&self->port)) {
 		pr_debug("%s(), already open so break out!\n", __func__);
 		return 0;
 	}
+	tty_port_set_initialized(&self->port, 1);
 
 	/* Register with IrCOMM */
 	irda_notify_init(&notify);
@@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
 
 	return 0;
 err:
-	clear_bit(ASYNCB_INITIALIZED, &self->port.flags);
+	tty_port_set_initialized(&self->port, 0);
 	return ret;
 }
 
@@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	while (1) {
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) ||
-		    !test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
@@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
 	IRDA_ASSERT(self != NULL, return;);
 	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
 
-	if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags))
+	if (!tty_port_initialized(&self->port))
 		return;
+	tty_port_set_initialized(&self->port, 0);
 
 	ircomm_tty_detach_cable(self);
 
@@ -1259,7 +1260,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_CHECK_CD", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&self->port)) {
 		seq_printf(m, "%cASYNC_INITIALIZED", sep);
 		sep = '|';
 	}
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index e24724db36a2..d4fdf8f7b471 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -324,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
 
  check_and_exit:
 
-	if (self->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(self)) {
 		if (((old_state.flags & ASYNC_SPD_MASK) !=
 		     (self->flags & ASYNC_SPD_MASK)) ||
 		    (old_driver.custom_divisor != driver->custom_divisor)) {
-- 
cgit v1.2.3


From 9ed19428a51d53477e2b79be3303fa08f8575749 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 9 Apr 2016 18:56:34 -0700
Subject: serial: core: Prevent unsafe uart port access, part 2

For serial core operations not already excluded by holding port->mutex,
use reference counting to protect deferencing the state->uart_port.

Introduce helper functions, uart_port_ref() and uart_port_deref(), to
wrap uart_port access, and helper macros, uart_port_lock() and
uart_port_unlock(), to wrap combination uart_port access with uart
port lock sections.

Port removal in uart_remove_one_port() waits for reference count to
drop to zero before detaching the uart port from struct uart_state.

For functions only reading the tx circular buffer indexes (where the
uart port lock is claimed to prevent concurrent users), a NULL uart
port is simply ignored and the operation completes normally.

For functions change the tx circular buffer indexes (where the uart
port lock is claimed to prevent concurrent users), the operation is
aborted if the uart port is NULL (ie., has been detached).

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/serial_core.c | 171 +++++++++++++++++++++++++++++----------
 include/linux/serial_core.h      |   2 +
 2 files changed, 130 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index e605f0328182..1887f9c71f85 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -64,6 +64,35 @@ static int uart_dcd_enabled(struct uart_port *uport)
 	return !!(uport->status & UPSTAT_DCD_ENABLE);
 }
 
+static inline struct uart_port *uart_port_ref(struct uart_state *state)
+{
+	if (atomic_add_unless(&state->refcount, 1, 0))
+		return state->uart_port;
+	return NULL;
+}
+
+static inline void uart_port_deref(struct uart_port *uport)
+{
+	if (uport && atomic_dec_and_test(&uport->state->refcount))
+		wake_up(&uport->state->remove_wait);
+}
+
+#define uart_port_lock(state, flags)					\
+	({								\
+		struct uart_port *__uport = uart_port_ref(state);	\
+		if (__uport)						\
+			spin_lock_irqsave(&__uport->lock, flags);	\
+		__uport;						\
+	})
+
+#define uart_port_unlock(uport, flags)					\
+	({								\
+		struct uart_port *__uport = uport;			\
+		if (__uport)						\
+			spin_unlock_irqrestore(&__uport->lock, flags);	\
+		uart_port_deref(__uport);				\
+	})
+
 static inline struct uart_port *uart_port_check(struct uart_state *state)
 {
 #ifdef CONFIG_LOCKDEP
@@ -90,12 +119,13 @@ void uart_write_wakeup(struct uart_port *port)
 static void uart_stop(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	unsigned long flags;
 
-	spin_lock_irqsave(&port->lock, flags);
-	port->ops->stop_tx(port);
-	spin_unlock_irqrestore(&port->lock, flags);
+	port = uart_port_lock(state, flags);
+	if (port)
+		port->ops->stop_tx(port);
+	uart_port_unlock(port, flags);
 }
 
 static void __uart_start(struct tty_struct *tty)
@@ -103,19 +133,19 @@ static void __uart_start(struct tty_struct *tty)
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port = state->uart_port;
 
-	if (!uart_tx_stopped(port))
+	if (port && !uart_tx_stopped(port))
 		port->ops->start_tx(port);
 }
 
 static void uart_start(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	unsigned long flags;
 
-	spin_lock_irqsave(&port->lock, flags);
+	port = uart_port_lock(state, flags);
 	__uart_start(tty);
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock(port, flags);
 }
 
 static void
@@ -496,7 +526,7 @@ static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
 static int uart_put_char(struct tty_struct *tty, unsigned char c)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	struct circ_buf *circ;
 	unsigned long flags;
 	int ret = 0;
@@ -505,13 +535,13 @@ static int uart_put_char(struct tty_struct *tty, unsigned char c)
 	if (!circ->buf)
 		return 0;
 
-	spin_lock_irqsave(&port->lock, flags);
-	if (uart_circ_chars_free(circ) != 0) {
+	port = uart_port_lock(state, flags);
+	if (port && uart_circ_chars_free(circ) != 0) {
 		circ->buf[circ->head] = c;
 		circ->head = (circ->head + 1) & (UART_XMIT_SIZE - 1);
 		ret = 1;
 	}
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock(port, flags);
 	return ret;
 }
 
@@ -538,14 +568,12 @@ static int uart_write(struct tty_struct *tty,
 		return -EL3HLT;
 	}
 
-	port = state->uart_port;
 	circ = &state->xmit;
-
 	if (!circ->buf)
 		return 0;
 
-	spin_lock_irqsave(&port->lock, flags);
-	while (1) {
+	port = uart_port_lock(state, flags);
+	while (port) {
 		c = CIRC_SPACE_TO_END(circ->head, circ->tail, UART_XMIT_SIZE);
 		if (count < c)
 			c = count;
@@ -559,32 +587,33 @@ static int uart_write(struct tty_struct *tty,
 	}
 
 	__uart_start(tty);
-	spin_unlock_irqrestore(&port->lock, flags);
-
+	uart_port_unlock(port, flags);
 	return ret;
 }
 
 static int uart_write_room(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
+	struct uart_port *port;
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&state->uart_port->lock, flags);
+	port = uart_port_lock(state, flags);
 	ret = uart_circ_chars_free(&state->xmit);
-	spin_unlock_irqrestore(&state->uart_port->lock, flags);
+	uart_port_unlock(port, flags);
 	return ret;
 }
 
 static int uart_chars_in_buffer(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
+	struct uart_port *port;
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&state->uart_port->lock, flags);
+	port = uart_port_lock(state, flags);
 	ret = uart_circ_chars_pending(&state->xmit);
-	spin_unlock_irqrestore(&state->uart_port->lock, flags);
+	uart_port_unlock(port, flags);
 	return ret;
 }
 
@@ -603,14 +632,15 @@ static void uart_flush_buffer(struct tty_struct *tty)
 		return;
 	}
 
-	port = state->uart_port;
 	pr_debug("uart_flush_buffer(%d) called\n", tty->index);
 
-	spin_lock_irqsave(&port->lock, flags);
+	port = uart_port_lock(state, flags);
+	if (!port)
+		return;
 	uart_circ_clear(&state->xmit);
 	if (port->ops->flush_buffer)
 		port->ops->flush_buffer(port);
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock(port, flags);
 	tty_wakeup(tty);
 }
 
@@ -621,9 +651,13 @@ static void uart_flush_buffer(struct tty_struct *tty)
 static void uart_send_xchar(struct tty_struct *tty, char ch)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	unsigned long flags;
 
+	port = uart_port_ref(state);
+	if (!port)
+		return;
+
 	if (port->ops->send_xchar)
 		port->ops->send_xchar(port, ch);
 	else {
@@ -633,14 +667,19 @@ static void uart_send_xchar(struct tty_struct *tty, char ch)
 			port->ops->start_tx(port);
 		spin_unlock_irqrestore(&port->lock, flags);
 	}
+	uart_port_deref(port);
 }
 
 static void uart_throttle(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	upstat_t mask = 0;
 
+	port = uart_port_ref(state);
+	if (!port)
+		return;
+
 	if (I_IXOFF(tty))
 		mask |= UPSTAT_AUTOXOFF;
 	if (C_CRTSCTS(tty))
@@ -656,14 +695,20 @@ static void uart_throttle(struct tty_struct *tty)
 
 	if (mask & UPSTAT_AUTOXOFF)
 		uart_send_xchar(tty, STOP_CHAR(tty));
+
+	uart_port_deref(port);
 }
 
 static void uart_unthrottle(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	upstat_t mask = 0;
 
+	port = uart_port_ref(state);
+	if (!port)
+		return;
+
 	if (I_IXOFF(tty))
 		mask |= UPSTAT_AUTOXOFF;
 	if (C_CRTSCTS(tty))
@@ -679,6 +724,8 @@ static void uart_unthrottle(struct tty_struct *tty)
 
 	if (mask & UPSTAT_AUTOXOFF)
 		uart_send_xchar(tty, START_CHAR(tty));
+
+	uart_port_deref(port);
 }
 
 static int uart_get_info(struct tty_port *port, struct serial_struct *retinfo)
@@ -1116,10 +1163,9 @@ static void uart_enable_ms(struct uart_port *uport)
  * FIXME: This wants extracting into a common all driver implementation
  * of TIOCMWAIT using tty_port.
  */
-static int
-uart_wait_modem_status(struct uart_state *state, unsigned long arg)
+static int uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 {
-	struct uart_port *uport = state->uart_port;
+	struct uart_port *uport;
 	struct tty_port *port = &state->port;
 	DECLARE_WAITQUEUE(wait, current);
 	struct uart_icount cprev, cnow;
@@ -1128,6 +1174,9 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	/*
 	 * note the counters on entry
 	 */
+	uport = uart_port_ref(state);
+	if (!uport)
+		return -EIO;
 	spin_lock_irq(&uport->lock);
 	memcpy(&cprev, &uport->icount, sizeof(struct uart_icount));
 	uart_enable_ms(uport);
@@ -1161,6 +1210,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	}
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&port->delta_msr_wait, &wait);
+	uart_port_deref(uport);
 
 	return ret;
 }
@@ -1176,11 +1226,15 @@ static int uart_get_icount(struct tty_struct *tty,
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_icount cnow;
-	struct uart_port *uport = state->uart_port;
+	struct uart_port *uport;
 
+	uport = uart_port_ref(state);
+	if (!uport)
+		return -EIO;
 	spin_lock_irq(&uport->lock);
 	memcpy(&cnow, &uport->icount, sizeof(struct uart_icount));
 	spin_unlock_irq(&uport->lock);
+	uart_port_deref(uport);
 
 	icount->cts         = cnow.cts;
 	icount->dsr         = cnow.dsr;
@@ -1481,11 +1535,14 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *port;
 	unsigned long char_time, expire;
 
-	if (port->type == PORT_UNKNOWN || port->fifosize == 0)
+	port = uart_port_ref(state);
+	if (!port || port->type == PORT_UNKNOWN || port->fifosize == 0) {
+		uart_port_deref(port);
 		return;
+	}
 
 	/*
 	 * Set the check interval to be 1/5 of the estimated time to
@@ -1531,6 +1588,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 		if (time_after(jiffies, expire))
 			break;
 	}
+	uart_port_deref(port);
 }
 
 /*
@@ -1591,12 +1649,23 @@ static void uart_port_shutdown(struct tty_port *port)
 static int uart_carrier_raised(struct tty_port *port)
 {
 	struct uart_state *state = container_of(port, struct uart_state, port);
-	struct uart_port *uport = state->uart_port;
+	struct uart_port *uport;
 	int mctrl;
+
+	uport = uart_port_ref(state);
+	/*
+	 * Should never observe uport == NULL since checks for hangup should
+	 * abort the tty_port_block_til_ready() loop before checking for carrier
+	 * raised -- but report carrier raised if it does anyway so open will
+	 * continue and not sleep
+	 */
+	if (WARN_ON(!uport))
+		return 1;
 	spin_lock_irq(&uport->lock);
 	uart_enable_ms(uport);
 	mctrl = uport->ops->get_mctrl(uport);
 	spin_unlock_irq(&uport->lock);
+	uart_port_deref(uport);
 	if (mctrl & TIOCM_CAR)
 		return 1;
 	return 0;
@@ -1605,12 +1674,18 @@ static int uart_carrier_raised(struct tty_port *port)
 static void uart_dtr_rts(struct tty_port *port, int onoff)
 {
 	struct uart_state *state = container_of(port, struct uart_state, port);
-	struct uart_port *uport = state->uart_port;
+	struct uart_port *uport;
+
+	uport = uart_port_ref(state);
+	if (!uport)
+		return;
 
 	if (onoff)
 		uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS);
 	else
 		uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS);
+
+	uart_port_deref(uport);
 }
 
 /*
@@ -2320,12 +2395,15 @@ static int uart_poll_get_char(struct tty_driver *driver, int line)
 	struct uart_driver *drv = driver->driver_state;
 	struct uart_state *state = drv->state + line;
 	struct uart_port *port;
+	int ret = -1;
 
-	if (!state || !state->uart_port)
-		return -1;
-
-	port = state->uart_port;
-	return port->ops->poll_get_char(port);
+	if (state) {
+		port = uart_port_ref(state);
+		if (port)
+			ret = port->ops->poll_get_char(port);
+		uart_port_deref(port);
+	}
+	return ret;
 }
 
 static void uart_poll_put_char(struct tty_driver *driver, int line, char ch)
@@ -2334,14 +2412,17 @@ static void uart_poll_put_char(struct tty_driver *driver, int line, char ch)
 	struct uart_state *state = drv->state + line;
 	struct uart_port *port;
 
-	if (!state || !state->uart_port)
+	if (!state)
 		return;
 
-	port = state->uart_port;
+	port = uart_port_ref(state);
+	if (!port)
+		return;
 
 	if (ch == '\n')
 		port->ops->poll_put_char(port, '\r');
 	port->ops->poll_put_char(port, ch);
+	uart_port_deref(port);
 }
 #endif
 
@@ -2688,6 +2769,8 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
 	}
 
 	/* Link the port to the driver state table and vice versa */
+	atomic_set(&state->refcount, 1);
+	init_waitqueue_head(&state->remove_wait);
 	state->uart_port = uport;
 	uport->state = state;
 
@@ -2816,6 +2899,8 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport)
 	uport->type = PORT_UNKNOWN;
 
 	mutex_lock(&port->mutex);
+	WARN_ON(atomic_dec_return(&state->refcount) < 0);
+	wait_event(state->remove_wait, !atomic_read(&state->refcount));
 	state->uart_port = NULL;
 	mutex_unlock(&port->mutex);
 out:
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index cbfcf38e220d..fd4ad4dce11a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -281,6 +281,8 @@ struct uart_state {
 	enum uart_pm_state	pm_state;
 	struct circ_buf		xmit;
 
+	atomic_t		refcount;
+	wait_queue_head_t	remove_wait;
 	struct uart_port	*uart_port;
 };
 
-- 
cgit v1.2.3


From 8ede5cce4f0baff77ef63aa3cb3afc65d0317e0b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 31 Mar 2016 10:08:16 +0200
Subject: tty: vt, make color_table const

This means all ->con_set_palette have to have the second parameter
const too now.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/vt.c                     | 2 +-
 drivers/usb/misc/sisusbvga/sisusb_con.c | 2 +-
 drivers/video/console/fbcon.c           | 4 ++--
 drivers/video/console/mdacon.c          | 2 +-
 drivers/video/console/newport_con.c     | 2 +-
 drivers/video/console/sticon.c          | 2 +-
 drivers/video/console/vgacon.c          | 5 ++---
 include/linux/console.h                 | 2 +-
 include/linux/selection.h               | 2 +-
 9 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index d1da391febb4..2c71b3bde174 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1039,7 +1039,7 @@ struct vc_data *vc_deallocate(unsigned int currcons)
 #define VT100ID "\033[?1;2c"
 #define VT102ID "\033[?6c"
 
-unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7,
+const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7,
 				       8,12,10,14, 9,13,11,15 };
 
 /* the default colour table, for VGA+ colour systems */
diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c
index ace343088915..afa853209f1d 100644
--- a/drivers/usb/misc/sisusbvga/sisusb_con.c
+++ b/drivers/usb/misc/sisusbvga/sisusb_con.c
@@ -601,7 +601,7 @@ sisusbcon_save_screen(struct vc_data *c)
 
 /* interface routine */
 static int
-sisusbcon_set_palette(struct vc_data *c, unsigned char *table)
+sisusbcon_set_palette(struct vc_data *c, const unsigned char *table)
 {
 	struct sisusb_usb_data *sisusb;
 	int i, j;
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 6e92917ba77a..afd3301ac40c 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -170,7 +170,7 @@ static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx,
 			int height, int width);
 static int fbcon_switch(struct vc_data *vc);
 static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch);
-static int fbcon_set_palette(struct vc_data *vc, unsigned char *table);
+static int fbcon_set_palette(struct vc_data *vc, const unsigned char *table);
 static int fbcon_scrolldelta(struct vc_data *vc, int lines);
 
 /*
@@ -2652,7 +2652,7 @@ static struct fb_cmap palette_cmap = {
 	0, 16, palette_red, palette_green, palette_blue, NULL
 };
 
-static int fbcon_set_palette(struct vc_data *vc, unsigned char *table)
+static int fbcon_set_palette(struct vc_data *vc, const unsigned char *table)
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	int i, j, k, depth;
diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c
index 296e94561556..8edc062536a8 100644
--- a/drivers/video/console/mdacon.c
+++ b/drivers/video/console/mdacon.c
@@ -481,7 +481,7 @@ static int mdacon_switch(struct vc_data *c)
 	return 1;	/* redrawing needed */
 }
 
-static int mdacon_set_palette(struct vc_data *c, unsigned char *table)
+static int mdacon_set_palette(struct vc_data *c, const unsigned char *table)
 {
 	return -EINVAL;
 }
diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c
index bb4e96255974..0553dfe684ef 100644
--- a/drivers/video/console/newport_con.c
+++ b/drivers/video/console/newport_con.c
@@ -574,7 +574,7 @@ static int newport_font_set(struct vc_data *vc, struct console_font *font, unsig
 	return newport_set_font(vc->vc_num, font);
 }
 
-static int newport_set_palette(struct vc_data *vc, unsigned char *table)
+static int newport_set_palette(struct vc_data *vc, const unsigned char *table)
 {
 	return -EINVAL;
 }
diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c
index 026fd1215933..e440c2d9fe7c 100644
--- a/drivers/video/console/sticon.c
+++ b/drivers/video/console/sticon.c
@@ -79,7 +79,7 @@ static const char *sticon_startup(void)
     return "STI console";
 }
 
-static int sticon_set_palette(struct vc_data *c, unsigned char *table)
+static int sticon_set_palette(struct vc_data *c, const unsigned char *table)
 {
     return -EINVAL;
 }
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 517f565b65d7..8bf911002cba 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -80,7 +80,6 @@ static void vgacon_deinit(struct vc_data *c);
 static void vgacon_cursor(struct vc_data *c, int mode);
 static int vgacon_switch(struct vc_data *c);
 static int vgacon_blank(struct vc_data *c, int blank, int mode_switch);
-static int vgacon_set_palette(struct vc_data *vc, unsigned char *table);
 static int vgacon_scrolldelta(struct vc_data *c, int lines);
 static int vgacon_set_origin(struct vc_data *c);
 static void vgacon_save_screen(struct vc_data *c);
@@ -847,7 +846,7 @@ static int vgacon_switch(struct vc_data *c)
 	return 0;		/* Redrawing not needed */
 }
 
-static void vga_set_palette(struct vc_data *vc, unsigned char *table)
+static void vga_set_palette(struct vc_data *vc, const unsigned char *table)
 {
 	int i, j;
 
@@ -860,7 +859,7 @@ static void vga_set_palette(struct vc_data *vc, unsigned char *table)
 	}
 }
 
-static int vgacon_set_palette(struct vc_data *vc, unsigned char *table)
+static int vgacon_set_palette(struct vc_data *vc, const unsigned char *table)
 {
 #ifdef CAN_LOAD_PALETTE
 	if (vga_video_type != VIDEO_TYPE_VGAC || vga_palette_blanked
diff --git a/include/linux/console.h b/include/linux/console.h
index ea731af2451e..137ac1a1c16f 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -47,7 +47,7 @@ struct consw {
 	int	(*con_font_copy)(struct vc_data *, int);
 	int     (*con_resize)(struct vc_data *, unsigned int, unsigned int,
 			       unsigned int);
-	int	(*con_set_palette)(struct vc_data *, unsigned char *);
+	int	(*con_set_palette)(struct vc_data *, const unsigned char *);
 	int	(*con_scrolldelta)(struct vc_data *, int);
 	int	(*con_set_origin)(struct vc_data *);
 	void	(*con_save_screen)(struct vc_data *);
diff --git a/include/linux/selection.h b/include/linux/selection.h
index 85193aa8c1e3..7e6c4450b8a5 100644
--- a/include/linux/selection.h
+++ b/include/linux/selection.h
@@ -24,7 +24,7 @@ extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry);
 
 extern int console_blanked;
 
-extern unsigned char color_table[];
+extern const unsigned char color_table[];
 extern int default_red[];
 extern int default_grn[];
 extern int default_blu[];
-- 
cgit v1.2.3


From 91e74ca5e7ac4ec6c61b84d6618eb5e401f852f0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 31 Mar 2016 10:08:17 +0200
Subject: tty: vt, use proper type for default colors

Every user of default_red, default_grn, and default_blu treats them as
unsigned char. So make it really unsigned char.

And indent the initializers and module_param properly.

This saves ~ 100 bytes of data.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/vt.c       | 27 +++++++++++++++++----------
 include/linux/selection.h |  6 +++---
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 2c71b3bde174..8f9f8ed3ed09 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1043,16 +1043,23 @@ const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7,
 				       8,12,10,14, 9,13,11,15 };
 
 /* the default colour table, for VGA+ colour systems */
-int default_red[] = {0x00,0xaa,0x00,0xaa,0x00,0xaa,0x00,0xaa,
-    0x55,0xff,0x55,0xff,0x55,0xff,0x55,0xff};
-int default_grn[] = {0x00,0x00,0xaa,0x55,0x00,0x00,0xaa,0xaa,
-    0x55,0x55,0xff,0xff,0x55,0x55,0xff,0xff};
-int default_blu[] = {0x00,0x00,0x00,0x00,0xaa,0xaa,0xaa,0xaa,
-    0x55,0x55,0x55,0x55,0xff,0xff,0xff,0xff};
-
-module_param_array(default_red, int, NULL, S_IRUGO | S_IWUSR);
-module_param_array(default_grn, int, NULL, S_IRUGO | S_IWUSR);
-module_param_array(default_blu, int, NULL, S_IRUGO | S_IWUSR);
+unsigned char default_red[] = {
+	0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa,
+	0x55, 0xff, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff
+};
+module_param_array(default_red, byte, NULL, S_IRUGO | S_IWUSR);
+
+unsigned char default_grn[] = {
+	0x00, 0x00, 0xaa, 0x55, 0x00, 0x00, 0xaa, 0xaa,
+	0x55, 0x55, 0xff, 0xff, 0x55, 0x55, 0xff, 0xff
+};
+module_param_array(default_grn, byte, NULL, S_IRUGO | S_IWUSR);
+
+unsigned char default_blu[] = {
+	0x00, 0x00, 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa,
+	0x55, 0x55, 0x55, 0x55, 0xff, 0xff, 0xff, 0xff
+};
+module_param_array(default_blu, byte, NULL, S_IRUGO | S_IWUSR);
 
 /*
  * gotoxy() must verify all boundaries, because the arguments
diff --git a/include/linux/selection.h b/include/linux/selection.h
index 7e6c4450b8a5..8e4624efdb6f 100644
--- a/include/linux/selection.h
+++ b/include/linux/selection.h
@@ -25,9 +25,9 @@ extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry);
 extern int console_blanked;
 
 extern const unsigned char color_table[];
-extern int default_red[];
-extern int default_grn[];
-extern int default_blu[];
+extern unsigned char default_red[];
+extern unsigned char default_grn[];
+extern unsigned char default_blu[];
 
 extern unsigned short *screen_pos(struct vc_data *vc, int w_offset, int viewed);
 extern u16 screen_glyph(struct vc_data *vc, int offset);
-- 
cgit v1.2.3


From 144ef5c2df9b473dad7eab375adcf5b11d0b1e47 Mon Sep 17 00:00:00 2001
From: Wan Ahmad Zainie <wan.ahmad.zainie.wan.mohamad@intel.com>
Date: Wed, 6 Apr 2016 12:06:51 +0800
Subject: serial: 8250: export get_mctrl function

Exposes get_mctrl() function so that it can be overriden with platform
specific implementation.

Signed-off-by: Wan Ahmad Zainie <wan.ahmad.zainie.wan.mohamad@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c |  3 +++
 drivers/tty/serial/8250/8250_port.c | 10 +++++++++-
 include/linux/serial_8250.h         |  2 ++
 include/linux/serial_core.h         |  1 +
 4 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 2f4f5ee651db..0fbd7c033a25 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -830,6 +830,7 @@ static int serial8250_probe(struct platform_device *dev)
 		uart.port.handle_irq	= p->handle_irq;
 		uart.port.handle_break	= p->handle_break;
 		uart.port.set_termios	= p->set_termios;
+		uart.port.get_mctrl	= p->get_mctrl;
 		uart.port.pm		= p->pm;
 		uart.port.dev		= &dev->dev;
 		uart.port.irqflags	|= irqflag;
@@ -1022,6 +1023,8 @@ int serial8250_register_8250_port(struct uart_8250_port *up)
 		/*  Possibly override set_termios call */
 		if (up->port.set_termios)
 			uart->port.set_termios = up->port.set_termios;
+		if (up->port.get_mctrl)
+			uart->port.get_mctrl = up->port.get_mctrl;
 		if (up->port.set_mctrl)
 			uart->port.set_mctrl = up->port.set_mctrl;
 		if (up->port.startup)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 00ad2637b08c..a803ddfd5a59 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1882,7 +1882,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port)
 	return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
 }
 
-static unsigned int serial8250_get_mctrl(struct uart_port *port)
+unsigned int serial8250_do_get_mctrl(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned int status;
@@ -1903,6 +1903,14 @@ static unsigned int serial8250_get_mctrl(struct uart_port *port)
 		ret |= TIOCM_CTS;
 	return ret;
 }
+EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl);
+
+static unsigned int serial8250_get_mctrl(struct uart_port *port)
+{
+	if (port->get_mctrl)
+		return port->get_mctrl(port);
+	return serial8250_do_get_mctrl(port);
+}
 
 void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 434879759725..48ec7651989b 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -36,6 +36,7 @@ struct plat_serial8250_port {
 	void		(*set_termios)(struct uart_port *,
 			               struct ktermios *new,
 			               struct ktermios *old);
+	unsigned int	(*get_mctrl)(struct uart_port *);
 	int		(*handle_irq)(struct uart_port *);
 	void		(*pm)(struct uart_port *, unsigned int state,
 			      unsigned old);
@@ -148,6 +149,7 @@ extern int early_serial8250_setup(struct earlycon_device *device,
 					 const char *options);
 extern void serial8250_do_set_termios(struct uart_port *port,
 		struct ktermios *termios, struct ktermios *old);
+extern unsigned int serial8250_do_get_mctrl(struct uart_port *port);
 extern int serial8250_do_startup(struct uart_port *port);
 extern void serial8250_do_shutdown(struct uart_port *port);
 extern void serial8250_do_pm(struct uart_port *port, unsigned int state,
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index fd4ad4dce11a..a3d7c0d4a03e 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -123,6 +123,7 @@ struct uart_port {
 	void			(*set_termios)(struct uart_port *,
 				               struct ktermios *new,
 				               struct ktermios *old);
+	unsigned int		(*get_mctrl)(struct uart_port *);
 	void			(*set_mctrl)(struct uart_port *, unsigned int);
 	int			(*startup)(struct uart_port *port);
 	void			(*shutdown)(struct uart_port *port);
-- 
cgit v1.2.3


From e4234a1fc343ca35f852bc527fae56fade879d4a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 31 Mar 2016 11:45:06 +0100
Subject: kernfs: Move faulting copy_user operations outside of the mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A fault in a user provided buffer may lead anywhere, and lockdep warns
that we have a potential deadlock between the mm->mmap_sem and the
kernfs file mutex:

[   82.811702] ======================================================
[   82.811705] [ INFO: possible circular locking dependency detected ]
[   82.811709] 4.5.0-rc4-gfxbench+ #1 Not tainted
[   82.811711] -------------------------------------------------------
[   82.811714] kms_setmode/5859 is trying to acquire lock:
[   82.811717]  (&dev->struct_mutex){+.+.+.}, at: [<ffffffff8150d9c1>] drm_gem_mmap+0x1a1/0x270
[   82.811731]
but task is already holding lock:
[   82.811734]  (&mm->mmap_sem){++++++}, at: [<ffffffff8117b364>] vm_mmap_pgoff+0x44/0xa0
[   82.811745]
which lock already depends on the new lock.

[   82.811749]
the existing dependency chain (in reverse order) is:
[   82.811752]
-> #3 (&mm->mmap_sem){++++++}:
[   82.811761]        [<ffffffff810cc883>] lock_acquire+0xc3/0x1d0
[   82.811766]        [<ffffffff8118bc65>] __might_fault+0x75/0xa0
[   82.811771]        [<ffffffff8124da4a>] kernfs_fop_write+0x8a/0x180
[   82.811787]        [<ffffffff811d1023>] __vfs_write+0x23/0xe0
[   82.811792]        [<ffffffff811d1d74>] vfs_write+0xa4/0x190
[   82.811797]        [<ffffffff811d2c14>] SyS_write+0x44/0xb0
[   82.811801]        [<ffffffff817bb81b>] entry_SYSCALL_64_fastpath+0x16/0x73
[   82.811807]
-> #2 (s_active#6){++++.+}:
[   82.811814]        [<ffffffff810cc883>] lock_acquire+0xc3/0x1d0
[   82.811819]        [<ffffffff8124c070>] __kernfs_remove+0x210/0x2f0
[   82.811823]        [<ffffffff8124d040>] kernfs_remove_by_name_ns+0x40/0xa0
[   82.811828]        [<ffffffff8124e9e0>] sysfs_remove_file_ns+0x10/0x20
[   82.811832]        [<ffffffff815318d4>] device_del+0x124/0x250
[   82.811837]        [<ffffffff81531a19>] device_unregister+0x19/0x60
[   82.811841]        [<ffffffff8153c051>] cpu_cache_sysfs_exit+0x51/0xb0
[   82.811846]        [<ffffffff8153c628>] cacheinfo_cpu_callback+0x38/0x70
[   82.811851]        [<ffffffff8109ae89>] notifier_call_chain+0x39/0xa0
[   82.811856]        [<ffffffff8109aef9>] __raw_notifier_call_chain+0x9/0x10
[   82.811860]        [<ffffffff810786de>] cpu_notify+0x1e/0x40
[   82.811865]        [<ffffffff81078779>] cpu_notify_nofail+0x9/0x20
[   82.811869]        [<ffffffff81078ac3>] _cpu_down+0x233/0x340
[   82.811874]        [<ffffffff81079019>] disable_nonboot_cpus+0xc9/0x350
[   82.811878]        [<ffffffff810d2e11>] suspend_devices_and_enter+0x5a1/0xb50
[   82.811883]        [<ffffffff810d3903>] pm_suspend+0x543/0x8d0
[   82.811888]        [<ffffffff810d1b77>] state_store+0x77/0xe0
[   82.811892]        [<ffffffff813fa68f>] kobj_attr_store+0xf/0x20
[   82.811897]        [<ffffffff8124e740>] sysfs_kf_write+0x40/0x50
[   82.811902]        [<ffffffff8124dafc>] kernfs_fop_write+0x13c/0x180
[   82.811906]        [<ffffffff811d1023>] __vfs_write+0x23/0xe0
[   82.811910]        [<ffffffff811d1d74>] vfs_write+0xa4/0x190
[   82.811914]        [<ffffffff811d2c14>] SyS_write+0x44/0xb0
[   82.811918]        [<ffffffff817bb81b>] entry_SYSCALL_64_fastpath+0x16/0x73
[   82.811923]
-> #1 (cpu_hotplug.lock){+.+.+.}:
[   82.811929]        [<ffffffff810cc883>] lock_acquire+0xc3/0x1d0
[   82.811933]        [<ffffffff817b6f72>] mutex_lock_nested+0x62/0x3b0
[   82.811940]        [<ffffffff810784c1>] get_online_cpus+0x61/0x80
[   82.811944]        [<ffffffff811170eb>] stop_machine+0x1b/0xe0
[   82.811949]        [<ffffffffa0178edd>] gen8_ggtt_insert_entries__BKL+0x2d/0x30 [i915]
[   82.812009]        [<ffffffffa017d3a6>] ggtt_bind_vma+0x46/0x70 [i915]
[   82.812045]        [<ffffffffa017eb70>] i915_vma_bind+0x140/0x290 [i915]
[   82.812081]        [<ffffffffa01862b9>] i915_gem_object_do_pin+0x899/0xb00 [i915]
[   82.812117]        [<ffffffffa0186555>] i915_gem_object_pin+0x35/0x40 [i915]
[   82.812154]        [<ffffffffa019a23e>] intel_init_pipe_control+0xbe/0x210 [i915]
[   82.812192]        [<ffffffffa0197312>] intel_logical_rings_init+0xe2/0xde0 [i915]
[   82.812232]        [<ffffffffa0186fe3>] i915_gem_init+0xf3/0x130 [i915]
[   82.812278]        [<ffffffffa02097ed>] i915_driver_load+0xf2d/0x1770 [i915]
[   82.812318]        [<ffffffff81512474>] drm_dev_register+0xa4/0xb0
[   82.812323]        [<ffffffff8151467e>] drm_get_pci_dev+0xce/0x1e0
[   82.812328]        [<ffffffffa01472cf>] i915_pci_probe+0x2f/0x50 [i915]
[   82.812360]        [<ffffffff8143f907>] pci_device_probe+0x87/0xf0
[   82.812366]        [<ffffffff81535f89>] driver_probe_device+0x229/0x450
[   82.812371]        [<ffffffff81536233>] __driver_attach+0x83/0x90
[   82.812375]        [<ffffffff81533c61>] bus_for_each_dev+0x61/0xa0
[   82.812380]        [<ffffffff81535879>] driver_attach+0x19/0x20
[   82.812384]        [<ffffffff8153535f>] bus_add_driver+0x1ef/0x290
[   82.812388]        [<ffffffff81536e9b>] driver_register+0x5b/0xe0
[   82.812393]        [<ffffffff8143e83b>] __pci_register_driver+0x5b/0x60
[   82.812398]        [<ffffffff81514866>] drm_pci_init+0xd6/0x100
[   82.812402]        [<ffffffffa027c094>] 0xffffffffa027c094
[   82.812406]        [<ffffffff810003de>] do_one_initcall+0xae/0x1d0
[   82.812412]        [<ffffffff811595a0>] do_init_module+0x5b/0x1cb
[   82.812417]        [<ffffffff81106160>] load_module+0x1c20/0x2480
[   82.812422]        [<ffffffff81106bae>] SyS_finit_module+0x7e/0xa0
[   82.812428]        [<ffffffff817bb81b>] entry_SYSCALL_64_fastpath+0x16/0x73
[   82.812433]
-> #0 (&dev->struct_mutex){+.+.+.}:
[   82.812439]        [<ffffffff810cbe59>] __lock_acquire+0x1fc9/0x20f0
[   82.812443]        [<ffffffff810cc883>] lock_acquire+0xc3/0x1d0
[   82.812456]        [<ffffffff8150d9e7>] drm_gem_mmap+0x1c7/0x270
[   82.812460]        [<ffffffff81196a14>] mmap_region+0x334/0x580
[   82.812466]        [<ffffffff81196fc4>] do_mmap+0x364/0x410
[   82.812470]        [<ffffffff8117b38d>] vm_mmap_pgoff+0x6d/0xa0
[   82.812474]        [<ffffffff811950f4>] SyS_mmap_pgoff+0x184/0x220
[   82.812479]        [<ffffffff8100a0fd>] SyS_mmap+0x1d/0x20
[   82.812484]        [<ffffffff817bb81b>] entry_SYSCALL_64_fastpath+0x16/0x73
[   82.812489]
other info that might help us debug this:

[   82.812493] Chain exists of:
  &dev->struct_mutex --> s_active#6 --> &mm->mmap_sem

[   82.812502]  Possible unsafe locking scenario:

[   82.812506]        CPU0                    CPU1
[   82.812508]        ----                    ----
[   82.812510]   lock(&mm->mmap_sem);
[   82.812514]                                lock(s_active#6);
[   82.812519]                                lock(&mm->mmap_sem);
[   82.812522]   lock(&dev->struct_mutex);
[   82.812526]
 *** DEADLOCK ***

[   82.812531] 1 lock held by kms_setmode/5859:
[   82.812533]  #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff8117b364>] vm_mmap_pgoff+0x44/0xa0
[   82.812541]
stack backtrace:
[   82.812547] CPU: 0 PID: 5859 Comm: kms_setmode Not tainted 4.5.0-rc4-gfxbench+ #1
[   82.812550] Hardware name:                  /NUC5CPYB, BIOS PYBSWCEL.86A.0040.2015.0814.1353 08/14/2015
[   82.812553]  0000000000000000 ffff880079407bf0 ffffffff813f8505 ffffffff825fb270
[   82.812560]  ffffffff825c4190 ffff880079407c30 ffffffff810c84ac ffff880079407c90
[   82.812566]  ffff8800797ed328 ffff8800797ecb00 0000000000000001 ffff8800797ed350
[   82.812573] Call Trace:
[   82.812578]  [<ffffffff813f8505>] dump_stack+0x67/0x92
[   82.812582]  [<ffffffff810c84ac>] print_circular_bug+0x1fc/0x310
[   82.812586]  [<ffffffff810cbe59>] __lock_acquire+0x1fc9/0x20f0
[   82.812590]  [<ffffffff810cc883>] lock_acquire+0xc3/0x1d0
[   82.812594]  [<ffffffff8150d9c1>] ? drm_gem_mmap+0x1a1/0x270
[   82.812599]  [<ffffffff8150d9e7>] drm_gem_mmap+0x1c7/0x270
[   82.812603]  [<ffffffff8150d9c1>] ? drm_gem_mmap+0x1a1/0x270
[   82.812608]  [<ffffffff81196a14>] mmap_region+0x334/0x580
[   82.812612]  [<ffffffff81196fc4>] do_mmap+0x364/0x410
[   82.812616]  [<ffffffff8117b38d>] vm_mmap_pgoff+0x6d/0xa0
[   82.812629]  [<ffffffff811950f4>] SyS_mmap_pgoff+0x184/0x220
[   82.812633]  [<ffffffff8100a0fd>] SyS_mmap+0x1d/0x20
[   82.812637]  [<ffffffff817bb81b>] entry_SYSCALL_64_fastpath+0x16/0x73

Highly unlikely though this scenario is, we can avoid the issue entirely
by moving the copy operation from out under the kernfs_get_active()
tracking by assigning the preallocated buffer its own mutex. The
temporary buffer allocation doesn't require mutex locking as it is
entirely local.

The locked section was extended by the addition of the preallocated buf
to speed up md user operations in

commit 2b75869bba676c248d8d25ae6d2bd9221dfffdb6
Author: NeilBrown <neilb@suse.de>
Date:   Mon Oct 13 16:41:28 2014 +1100

    sysfs/kernfs: allow attributes to request write buffer be pre-allocated.

Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94350
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: NeilBrown <neilb@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c       | 51 ++++++++++++++++++++++++++++----------------------
 include/linux/kernfs.h |  1 +
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 7247252ee9b1..e1574008adc9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
 	char *buf;
 
 	buf = of->prealloc_buf;
-	if (!buf)
+	if (buf)
+		mutex_lock(&of->prealloc_mutex);
+	else
 		buf = kmalloc(len, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
 	/*
 	 * @of->mutex nests outside active ref and is used both to ensure that
-	 * the ops aren't called concurrently for the same open file, and
-	 * to provide exclusive access to ->prealloc_buf (when that exists).
+	 * the ops aren't called concurrently for the same open file.
 	 */
 	mutex_lock(&of->mutex);
 	if (!kernfs_get_active(of->kn)) {
@@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
 	else
 		len = -EINVAL;
 
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+
 	if (len < 0)
-		goto out_unlock;
+		goto out_free;
 
 	if (copy_to_user(user_buf, buf, len)) {
 		len = -EFAULT;
-		goto out_unlock;
+		goto out_free;
 	}
 
 	*ppos += len;
 
- out_unlock:
-	kernfs_put_active(of->kn);
-	mutex_unlock(&of->mutex);
  out_free:
-	if (buf != of->prealloc_buf)
+	if (buf == of->prealloc_buf)
+		mutex_unlock(&of->prealloc_mutex);
+	else
 		kfree(buf);
 	return len;
 }
@@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 	}
 
 	buf = of->prealloc_buf;
-	if (!buf)
+	if (buf)
+		mutex_lock(&of->prealloc_mutex);
+	else
 		buf = kmalloc(len + 1, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_free;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
 	/*
 	 * @of->mutex nests outside active ref and is used both to ensure that
-	 * the ops aren't called concurrently for the same open file, and
-	 * to provide exclusive access to ->prealloc_buf (when that exists).
+	 * the ops aren't called concurrently for the same open file.
 	 */
 	mutex_lock(&of->mutex);
 	if (!kernfs_get_active(of->kn)) {
@@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 		goto out_free;
 	}
 
-	if (copy_from_user(buf, user_buf, len)) {
-		len = -EFAULT;
-		goto out_unlock;
-	}
-	buf[len] = '\0';	/* guarantee string termination */
-
 	ops = kernfs_ops(of->kn);
 	if (ops->write)
 		len = ops->write(of, buf, len, *ppos);
 	else
 		len = -EINVAL;
 
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+
 	if (len > 0)
 		*ppos += len;
 
-out_unlock:
-	kernfs_put_active(of->kn);
-	mutex_unlock(&of->mutex);
 out_free:
-	if (buf != of->prealloc_buf)
+	if (buf == of->prealloc_buf)
+		mutex_unlock(&of->prealloc_mutex);
+	else
 		kfree(buf);
 	return len;
 }
@@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 		error = -ENOMEM;
 		if (!of->prealloc_buf)
 			goto err_free;
+		mutex_init(&of->prealloc_mutex);
 	}
 
 	/*
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c44242f39..d306e282bb1d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -177,6 +177,7 @@ struct kernfs_open_file {
 
 	/* private fields, do not use outside kernfs proper */
 	struct mutex		mutex;
+	struct mutex		prealloc_mutex;
 	int			event;
 	struct list_head	list;
 	char			*prealloc_buf;
-- 
cgit v1.2.3


From a6341f000024cdf1ec14dc26743a409a17378db5 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Sat, 2 Apr 2016 17:59:46 -0700
Subject: Drivers: hv: vmbus: Introduce functions for estimating room in the
 ring buffer

Introduce separate functions for estimating how much can be read from
and written to the ring buffer.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/ring_buffer.c | 25 ++++---------------------
 include/linux/hyperv.h   | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index a40a73a7b71d..544362c6a9ca 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -38,8 +38,6 @@ void hv_begin_read(struct hv_ring_buffer_info *rbi)
 
 u32 hv_end_read(struct hv_ring_buffer_info *rbi)
 {
-	u32 read;
-	u32 write;
 
 	rbi->ring_buffer->interrupt_mask = 0;
 	mb();
@@ -49,9 +47,7 @@ u32 hv_end_read(struct hv_ring_buffer_info *rbi)
 	 * If it is not, we raced and we need to process new
 	 * incoming messages.
 	 */
-	hv_get_ringbuffer_availbytes(rbi, &read, &write);
-
-	return read;
+	return hv_get_bytes_to_read(rbi);
 }
 
 /*
@@ -106,9 +102,6 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi)
 static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
 {
 	u32 cur_write_sz;
-	u32 r_size;
-	u32 write_loc;
-	u32 read_loc = rbi->ring_buffer->read_index;
 	u32 pending_sz;
 
 	/*
@@ -125,14 +118,11 @@ static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
 	mb();
 
 	pending_sz = rbi->ring_buffer->pending_send_sz;
-	write_loc = rbi->ring_buffer->write_index;
 	/* If the other end is not blocked on write don't bother. */
 	if (pending_sz == 0)
 		return false;
 
-	r_size = rbi->ring_datasize;
-	cur_write_sz = write_loc >= read_loc ? r_size - (write_loc - read_loc) :
-			read_loc - write_loc;
+	cur_write_sz = hv_get_bytes_to_write(rbi);
 
 	if (cur_write_sz >= pending_sz)
 		return true;
@@ -332,7 +322,6 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info,
 {
 	int i = 0;
 	u32 bytes_avail_towrite;
-	u32 bytes_avail_toread;
 	u32 totalbytes_towrite = 0;
 
 	u32 next_write_location;
@@ -348,9 +337,7 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info,
 	if (lock)
 		spin_lock_irqsave(&outring_info->ring_lock, flags);
 
-	hv_get_ringbuffer_availbytes(outring_info,
-				&bytes_avail_toread,
-				&bytes_avail_towrite);
+	bytes_avail_towrite = hv_get_bytes_to_write(outring_info);
 
 	/*
 	 * If there is only room for the packet, assume it is full.
@@ -401,7 +388,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
 		       void *buffer, u32 buflen, u32 *buffer_actual_len,
 		       u64 *requestid, bool *signal, bool raw)
 {
-	u32 bytes_avail_towrite;
 	u32 bytes_avail_toread;
 	u32 next_read_location = 0;
 	u64 prev_indices = 0;
@@ -417,10 +403,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
 	*buffer_actual_len = 0;
 	*requestid = 0;
 
-	hv_get_ringbuffer_availbytes(inring_info,
-				&bytes_avail_toread,
-				&bytes_avail_towrite);
-
+	bytes_avail_toread = hv_get_bytes_to_read(inring_info);
 	/* Make sure there is something to read */
 	if (bytes_avail_toread < sizeof(desc)) {
 		/*
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index aa0fadce9308..66226ceade37 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -151,6 +151,33 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi,
 	*read = dsize - *write;
 }
 
+static inline u32 hv_get_bytes_to_read(struct hv_ring_buffer_info *rbi)
+{
+	u32 read_loc, write_loc, dsize, read;
+
+	dsize = rbi->ring_datasize;
+	read_loc = rbi->ring_buffer->read_index;
+	write_loc = READ_ONCE(rbi->ring_buffer->write_index);
+
+	read = write_loc >= read_loc ? (write_loc - read_loc) :
+		(dsize - read_loc) + write_loc;
+
+	return read;
+}
+
+static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi)
+{
+	u32 read_loc, write_loc, dsize, write;
+
+	dsize = rbi->ring_datasize;
+	read_loc = READ_ONCE(rbi->ring_buffer->read_index);
+	write_loc = rbi->ring_buffer->write_index;
+
+	write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
+		read_loc - write_loc;
+	return write;
+}
+
 /*
  * VMBUS version is 32 bit entity broken up into
  * two 16 bit quantities: major_number. minor_number.
-- 
cgit v1.2.3


From 5cc472477f928fb8584eb8e08245c9cf9002d74a Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Sat, 2 Apr 2016 17:59:49 -0700
Subject: Drivers: hv: vmbus: Export the vmbus_set_event() API

In preparation for moving some ring buffer functionality out of the
vmbus driver, export the API for signaling the host.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/connection.c   | 1 +
 drivers/hv/hyperv_vmbus.h | 2 --
 include/linux/hyperv.h    | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index d02f1373dd98..fcf8a02dc0ea 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -495,3 +495,4 @@ void vmbus_set_event(struct vmbus_channel *channel)
 
 	hv_do_hypercall(HVCALL_SIGNAL_EVENT, channel->sig_event, NULL);
 }
+EXPORT_SYMBOL_GPL(vmbus_set_event);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 12321b93a756..e5c586fab0e5 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -667,8 +667,6 @@ void vmbus_disconnect(void);
 
 int vmbus_post_msg(void *buffer, size_t buflen);
 
-void vmbus_set_event(struct vmbus_channel *channel);
-
 void vmbus_on_event(unsigned long data);
 void vmbus_on_msg_dpc(unsigned long data);
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 66226ceade37..40fd608475f7 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1365,4 +1365,5 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id,
 				  const uuid_le *shv_host_servie_id);
+void vmbus_set_event(struct vmbus_channel *channel);
 #endif /* _HYPERV_H */
-- 
cgit v1.2.3


From 687f32e6d9bd1d63c5e557e877809eb446f1a6e8 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Sat, 2 Apr 2016 17:59:50 -0700
Subject: Drivers: hv: vmbus: Move some ring buffer functions to hyperv.h

In preparation for implementing APIs for in-place consumption of VMBUS
packets, movve some ring buffer functionality into hyperv.h

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/ring_buffer.c | 55 ------------------------------------------------
 include/linux/hyperv.h   | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 8f518af3074d..dd255c9b9420 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -84,52 +84,6 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi)
 	return false;
 }
 
-/*
- * To optimize the flow management on the send-side,
- * when the sender is blocked because of lack of
- * sufficient space in the ring buffer, potential the
- * consumer of the ring buffer can signal the producer.
- * This is controlled by the following parameters:
- *
- * 1. pending_send_sz: This is the size in bytes that the
- *    producer is trying to send.
- * 2. The feature bit feat_pending_send_sz set to indicate if
- *    the consumer of the ring will signal when the ring
- *    state transitions from being full to a state where
- *    there is room for the producer to send the pending packet.
- */
-
-static bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
-{
-	u32 cur_write_sz;
-	u32 pending_sz;
-
-	/*
-	 * Issue a full memory barrier before making the signaling decision.
-	 * Here is the reason for having this barrier:
-	 * If the reading of the pend_sz (in this function)
-	 * were to be reordered and read before we commit the new read
-	 * index (in the calling function)  we could
-	 * have a problem. If the host were to set the pending_sz after we
-	 * have sampled pending_sz and go to sleep before we commit the
-	 * read index, we could miss sending the interrupt. Issue a full
-	 * memory barrier to address this.
-	 */
-	virt_mb();
-
-	pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
-	/* If the other end is not blocked on write don't bother. */
-	if (pending_sz == 0)
-		return false;
-
-	cur_write_sz = hv_get_bytes_to_write(rbi);
-
-	if (cur_write_sz >= pending_sz)
-		return true;
-
-	return false;
-}
-
 /* Get the next write location for the specified ring buffer. */
 static inline u32
 hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
@@ -180,15 +134,6 @@ hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
 	ring_info->ring_buffer->read_index = next_read_location;
 }
 
-
-/* Get the start of the ring buffer. */
-static inline void *
-hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
-{
-	return (void *)ring_info->ring_buffer->buffer;
-}
-
-
 /* Get the size of the ring buffer. */
 static inline u32
 hv_get_ring_buffersize(struct hv_ring_buffer_info *ring_info)
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 40fd608475f7..eb7c0b215ba4 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1366,4 +1366,58 @@ extern __u32 vmbus_proto_version;
 int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id,
 				  const uuid_le *shv_host_servie_id);
 void vmbus_set_event(struct vmbus_channel *channel);
+
+/* Get the start of the ring buffer. */
+static inline void *
+hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
+{
+	return (void *)ring_info->ring_buffer->buffer;
+}
+
+/*
+ * To optimize the flow management on the send-side,
+ * when the sender is blocked because of lack of
+ * sufficient space in the ring buffer, potential the
+ * consumer of the ring buffer can signal the producer.
+ * This is controlled by the following parameters:
+ *
+ * 1. pending_send_sz: This is the size in bytes that the
+ *    producer is trying to send.
+ * 2. The feature bit feat_pending_send_sz set to indicate if
+ *    the consumer of the ring will signal when the ring
+ *    state transitions from being full to a state where
+ *    there is room for the producer to send the pending packet.
+ */
+
+static inline  bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
+{
+	u32 cur_write_sz;
+	u32 pending_sz;
+
+	/*
+	 * Issue a full memory barrier before making the signaling decision.
+	 * Here is the reason for having this barrier:
+	 * If the reading of the pend_sz (in this function)
+	 * were to be reordered and read before we commit the new read
+	 * index (in the calling function)  we could
+	 * have a problem. If the host were to set the pending_sz after we
+	 * have sampled pending_sz and go to sleep before we commit the
+	 * read index, we could miss sending the interrupt. Issue a full
+	 * memory barrier to address this.
+	 */
+	virt_mb();
+
+	pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
+	/* If the other end is not blocked on write don't bother. */
+	if (pending_sz == 0)
+		return false;
+
+	cur_write_sz = hv_get_bytes_to_write(rbi);
+
+	if (cur_write_sz >= pending_sz)
+		return true;
+
+	return false;
+}
+
 #endif /* _HYPERV_H */
-- 
cgit v1.2.3


From ab028db41ca9174caab7f9e3fc0a2e7f4a418410 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Sat, 2 Apr 2016 17:59:51 -0700
Subject: Drivers: hv: vmbus: Implement APIs to support "in place" consumption
 of vmbus packets

Implement APIs for in-place consumption of vmbus packets. Currently, each
packet is copied and processed one at a time and as part of processing
each packet we potentially may signal the host (if it is waiting for
room to produce a packet).

These APIs help batched in-place processing of vmbus packets.
We also optimize host signaling by having a separate API to signal
the end of in-place consumption. With netvsc using these APIs,
on an iperf run on average I see about 20X reduction in checks to
signal the host.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/ring_buffer.c |  1 +
 include/linux/hyperv.h   | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index dd255c9b9420..fe586bf74e17 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -132,6 +132,7 @@ hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
 		    u32 next_read_location)
 {
 	ring_info->ring_buffer->read_index = next_read_location;
+	ring_info->priv_read_index = next_read_location;
 }
 
 /* Get the size of the ring buffer. */
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index eb7c0b215ba4..590fee6a2e6f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -126,6 +126,8 @@ struct hv_ring_buffer_info {
 
 	u32 ring_datasize;		/* < ring_size */
 	u32 ring_data_startoffset;
+	u32 priv_write_index;
+	u32 priv_read_index;
 };
 
 /*
@@ -1420,4 +1422,88 @@ static inline  bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
 	return false;
 }
 
+/*
+ * An API to support in-place processing of incoming VMBUS packets.
+ */
+#define VMBUS_PKT_TRAILER	8
+
+static inline struct vmpacket_descriptor *
+get_next_pkt_raw(struct vmbus_channel *channel)
+{
+	struct hv_ring_buffer_info *ring_info = &channel->inbound;
+	u32 read_loc = ring_info->priv_read_index;
+	void *ring_buffer = hv_get_ring_buffer(ring_info);
+	struct vmpacket_descriptor *cur_desc;
+	u32 packetlen;
+	u32 dsize = ring_info->ring_datasize;
+	u32 delta = read_loc - ring_info->ring_buffer->read_index;
+	u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta);
+
+	if (bytes_avail_toread < sizeof(struct vmpacket_descriptor))
+		return NULL;
+
+	if ((read_loc + sizeof(*cur_desc)) > dsize)
+		return NULL;
+
+	cur_desc = ring_buffer + read_loc;
+	packetlen = cur_desc->len8 << 3;
+
+	/*
+	 * If the packet under consideration is wrapping around,
+	 * return failure.
+	 */
+	if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1))
+		return NULL;
+
+	return cur_desc;
+}
+
+/*
+ * A helper function to step through packets "in-place"
+ * This API is to be called after each successful call
+ * get_next_pkt_raw().
+ */
+static inline void put_pkt_raw(struct vmbus_channel *channel,
+				struct vmpacket_descriptor *desc)
+{
+	struct hv_ring_buffer_info *ring_info = &channel->inbound;
+	u32 read_loc = ring_info->priv_read_index;
+	u32 packetlen = desc->len8 << 3;
+	u32 dsize = ring_info->ring_datasize;
+
+	if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize)
+		BUG();
+	/*
+	 * Include the packet trailer.
+	 */
+	ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
+}
+
+/*
+ * This call commits the read index and potentially signals the host.
+ * Here is the pattern for using the "in-place" consumption APIs:
+ *
+ * while (get_next_pkt_raw() {
+ *	process the packet "in-place";
+ *	put_pkt_raw();
+ * }
+ * if (packets processed in place)
+ *	commit_rd_index();
+ */
+static inline void commit_rd_index(struct vmbus_channel *channel)
+{
+	struct hv_ring_buffer_info *ring_info = &channel->inbound;
+	/*
+	 * Make sure all reads are done before we update the read index since
+	 * the writer may start writing to the read area once the read index
+	 * is updated.
+	 */
+	virt_rmb();
+	ring_info->ring_buffer->read_index = ring_info->priv_read_index;
+
+	if (hv_need_to_signal_on_read(ring_info))
+		vmbus_set_event(channel);
+}
+
+
 #endif /* _HYPERV_H */
-- 
cgit v1.2.3


From 97fb77dc87582300fa3c141b63699f853576cab1 Mon Sep 17 00:00:00 2001
From: Jake Oshins <jakeo@microsoft.com>
Date: Tue, 5 Apr 2016 10:22:51 -0700
Subject: drivers:hv: Make a function to free mmio regions through vmbus

This patch introduces a function that reverses everything
done by vmbus_allocate_mmio().  Existing code just called
release_mem_region().  Future patches in this series
require a more complex sequence of actions, so this function
is introduced to wrap those actions.

Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/vmbus_drv.c | 15 +++++++++++++++
 include/linux/hyperv.h |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 799518b3cdc5..60553c156f90 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1187,6 +1187,21 @@ exit:
 }
 EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
 
+/**
+ * vmbus_free_mmio() - Free a memory-mapped I/O range.
+ * @start:		Base address of region to release.
+ * @size:		Size of the range to be allocated
+ *
+ * This function releases anything requested by
+ * vmbus_mmio_allocate().
+ */
+void vmbus_free_mmio(resource_size_t start, resource_size_t size)
+{
+	release_mem_region(start, size);
+
+}
+EXPORT_SYMBOL_GPL(vmbus_free_mmio);
+
 /**
  * vmbus_cpu_number_to_vp_number() - Map CPU to VP.
  * @cpu_number: CPU number in Linux terms
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 590fee6a2e6f..b10954a66939 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1120,7 +1120,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
 			resource_size_t min, resource_size_t max,
 			resource_size_t size, resource_size_t align,
 			bool fb_overlap_ok);
-
+void vmbus_free_mmio(resource_size_t start, resource_size_t size);
 int vmbus_cpu_number_to_vp_number(int cpu_number);
 u64 hv_do_hypercall(u64 control, void *input, void *output);
 
-- 
cgit v1.2.3


From 05d1a717ec0430c916a749b94eb90ab74bbfa356 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Mon, 29 Feb 2016 19:52:05 -0500
Subject: ima: add support for creating files using the mknodat syscall

Commit 3034a14 "ima: pass 'opened' flag to identify newly created files"
stopped identifying empty files as new files.  However new empty files
can be created using the mknodat syscall.  On systems with IMA-appraisal
enabled, these empty files are not labeled with security.ima extended
attributes properly, preventing them from subsequently being opened in
order to write the file data contents.  This patch defines a new hook
named ima_post_path_mknod() to mark these empty files, created using
mknodat, as new in order to allow the file data contents to be written.

In addition, files with security.ima xattrs containing a file signature
are considered "immutable" and can not be modified.  The file contents
need to be written, before signing the file.  This patch relaxes this
requirement for new files, allowing the file signature to be written
before the file contents.

Changelog:
- defer identifying files with signatures stored as security.ima
  (based on Dmitry Rozhkov's comments)
- removing tests (eg. dentry, dentry->d_inode, inode->i_size == 0)
  (based on Al's review)

Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Al Viro <<viro@zeniv.linux.org.uk>
Tested-by: Dmitry Rozhkov <dmitry.rozhkov@linux.intel.com>
---
 fs/namei.c                            |  2 ++
 include/linux/ima.h                   |  6 ++++++
 security/integrity/ima/ima_appraise.c |  5 +++++
 security/integrity/ima/ima_main.c     | 25 ++++++++++++++++++++++++-
 4 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 1d9ca2d5dff6..b4bd06839446 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3608,6 +3608,8 @@ retry:
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
 			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+			if (!error)
+				ima_post_path_mknod(dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
 			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
diff --git a/include/linux/ima.h b/include/linux/ima.h
index e6516cbbe9bf..0eb7c2e7f0d6 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -21,6 +21,7 @@ extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_read_file(struct file *file, enum kernel_read_file_id id);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
+extern void ima_post_path_mknod(struct dentry *dentry);
 
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
@@ -54,6 +55,11 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size,
 	return 0;
 }
 
+static inline void ima_post_path_mknod(struct dentry *dentry)
+{
+	return;
+}
+
 #endif /* CONFIG_IMA */
 
 #ifdef CONFIG_IMA_APPRAISE
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index d2f28a0c8614..1bcbc12e03d9 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -275,6 +275,11 @@ out:
 		     xattr_value->type != EVM_IMA_XATTR_DIGSIG)) {
 			if (!ima_fix_xattr(dentry, iint))
 				status = INTEGRITY_PASS;
+		} else if ((inode->i_size == 0) &&
+			   (iint->flags & IMA_NEW_FILE) &&
+			   (xattr_value &&
+			    xattr_value->type == EVM_IMA_XATTR_DIGSIG)) {
+			status = INTEGRITY_PASS;
 		}
 		integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename,
 				    op, cause, rc, 0);
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 391f41751021..68b26c340acd 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -246,7 +246,8 @@ static int process_measurement(struct file *file, char *buf, loff_t size,
 		ima_audit_measurement(iint, pathname);
 
 out_digsig:
-	if ((mask & MAY_WRITE) && (iint->flags & IMA_DIGSIG))
+	if ((mask & MAY_WRITE) && (iint->flags & IMA_DIGSIG) &&
+	     !(iint->flags & IMA_NEW_FILE))
 		rc = -EACCES;
 	kfree(xattr_value);
 out_free:
@@ -315,6 +316,28 @@ int ima_file_check(struct file *file, int mask, int opened)
 }
 EXPORT_SYMBOL_GPL(ima_file_check);
 
+/**
+ * ima_post_path_mknod - mark as a new inode
+ * @dentry: newly created dentry
+ *
+ * Mark files created via the mknodat syscall as new, so that the
+ * file data can be written later.
+ */
+void ima_post_path_mknod(struct dentry *dentry)
+{
+	struct integrity_iint_cache *iint;
+	struct inode *inode = dentry->d_inode;
+	int must_appraise;
+
+	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	if (!must_appraise)
+		return;
+
+	iint = integrity_inode_get(inode);
+	if (iint)
+		iint->flags |= IMA_NEW_FILE;
+}
+
 /**
  * ima_read_file - pre-measure/appraise hook decision based on policy
  * @file: pointer to the file to be measured/appraised/audit
-- 
cgit v1.2.3


From 0f40fbbcc34e093255a2b2d70b6b0fb48c3f39aa Mon Sep 17 00:00:00 2001
From: Brian Bloniarz <brian.bloniarz@gmail.com>
Date: Sun, 6 Mar 2016 13:16:30 -0800
Subject: Fix OpenSSH pty regression on close

OpenSSH expects the (non-blocking) read() of pty master to return
EAGAIN only if it has received all of the slave-side output after
it has received SIGCHLD. This used to work on pre-3.12 kernels.

This fix effectively forces non-blocking read() and poll() to
block for parallel i/o to complete for all ttys. It also unwinds
these changes:

1) f8747d4a466ab2cafe56112c51b3379f9fdb7a12
   tty: Fix pty master read() after slave closes

2) 52bce7f8d4fc633c9a9d0646eef58ba6ae9a3b73
   pty, n_tty: Simplify input processing on final close

3) 1a48632ffed61352a7810ce089dc5a8bcd505a60
   pty: Fix input race when closing

Inspired by analysis and patch from Marc Aurele La France <tsi@tuyoix.net>

Reported-by: Volth <openssh@volth.com>
Reported-by: Marc Aurele La France <tsi@tuyoix.net>
BugLink: https://bugzilla.mindrot.org/show_bug.cgi?id=52
BugLink: https://bugzilla.mindrot.org/show_bug.cgi?id=2492
Signed-off-by: Brian Bloniarz <brian.bloniarz@gmail.com>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/serial/tty.txt |  3 --
 drivers/tty/n_hdlc.c         |  4 +--
 drivers/tty/n_tty.c          | 70 +++++++++++++++++++++-----------------------
 drivers/tty/pty.c            |  4 +--
 drivers/tty/tty_buffer.c     | 34 ++++-----------------
 include/linux/tty.h          |  2 +-
 6 files changed, 43 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt
index 798cba82c762..b48780977a68 100644
--- a/Documentation/serial/tty.txt
+++ b/Documentation/serial/tty.txt
@@ -210,9 +210,6 @@ TTY_IO_ERROR		If set, causes all subsequent userspace read/write
 
 TTY_OTHER_CLOSED	Device is a pty and the other side has closed.
 
-TTY_OTHER_DONE		Device is a pty and the other side has closed and
-			all pending input processing has been completed.
-
 TTY_NO_WRITE_SPLIT	Prevent driver from splitting up writes into
 			smaller chunks.
 
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index bcaba17688f6..a7fa016f31eb 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -599,7 +599,7 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 	add_wait_queue(&tty->read_wait, &wait);
 
 	for (;;) {
-		if (test_bit(TTY_OTHER_DONE, &tty->flags)) {
+		if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) {
 			ret = -EIO;
 			break;
 		}
@@ -827,7 +827,7 @@ static unsigned int n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp,
 		/* set bits for operations that won't block */
 		if (n_hdlc->rx_buf_list.head)
 			mask |= POLLIN | POLLRDNORM;	/* readable */
-		if (test_bit(TTY_OTHER_DONE, &tty->flags))
+		if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
 			mask |= POLLHUP;
 		if (tty_hung_up_p(filp))
 			mask |= POLLHUP;
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index fb76a7d80e7e..bdf0e6e89991 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1917,18 +1917,6 @@ static inline int input_available_p(struct tty_struct *tty, int poll)
 		return ldata->commit_head - ldata->read_tail >= amt;
 }
 
-static inline int check_other_done(struct tty_struct *tty)
-{
-	int done = test_bit(TTY_OTHER_DONE, &tty->flags);
-	if (done) {
-		/* paired with cmpxchg() in check_other_closed(); ensures
-		 * read buffer head index is not stale
-		 */
-		smp_mb__after_atomic();
-	}
-	return done;
-}
-
 /**
  *	copy_from_read_buf	-	copy read data directly
  *	@tty: terminal device
@@ -2124,7 +2112,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 	struct n_tty_data *ldata = tty->disc_data;
 	unsigned char __user *b = buf;
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	int c, done;
+	int c;
 	int minimum, time;
 	ssize_t retval = 0;
 	long timeout;
@@ -2183,32 +2171,35 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 			break;
 		}
 
-		done = check_other_done(tty);
-
 		if (!input_available_p(tty, 0)) {
-			if (done) {
-				retval = -EIO;
-				break;
-			}
-			if (tty_hung_up_p(file))
-				break;
-			if (!timeout)
-				break;
-			if (file->f_flags & O_NONBLOCK) {
-				retval = -EAGAIN;
-				break;
-			}
-			if (signal_pending(current)) {
-				retval = -ERESTARTSYS;
-				break;
-			}
 			up_read(&tty->termios_rwsem);
+			tty_buffer_flush_work(tty->port);
+			down_read(&tty->termios_rwsem);
+			if (!input_available_p(tty, 0)) {
+				if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) {
+					retval = -EIO;
+					break;
+				}
+				if (tty_hung_up_p(file))
+					break;
+				if (!timeout)
+					break;
+				if (file->f_flags & O_NONBLOCK) {
+					retval = -EAGAIN;
+					break;
+				}
+				if (signal_pending(current)) {
+					retval = -ERESTARTSYS;
+					break;
+				}
+				up_read(&tty->termios_rwsem);
 
-			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE,
-					     timeout);
+				timeout = wait_woken(&wait, TASK_INTERRUPTIBLE,
+						timeout);
 
-			down_read(&tty->termios_rwsem);
-			continue;
+				down_read(&tty->termios_rwsem);
+				continue;
+			}
 		}
 
 		if (ldata->icanon && !L_EXTPROC(tty)) {
@@ -2386,12 +2377,17 @@ static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
 
 	poll_wait(file, &tty->read_wait, wait);
 	poll_wait(file, &tty->write_wait, wait);
-	if (check_other_done(tty))
-		mask |= POLLHUP;
 	if (input_available_p(tty, 1))
 		mask |= POLLIN | POLLRDNORM;
+	else {
+		tty_buffer_flush_work(tty->port);
+		if (input_available_p(tty, 1))
+			mask |= POLLIN | POLLRDNORM;
+	}
 	if (tty->packet && tty->link->ctrl_status)
 		mask |= POLLPRI | POLLIN | POLLRDNORM;
+	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
+		mask |= POLLHUP;
 	if (tty_hung_up_p(file))
 		mask |= POLLHUP;
 	if (tty->ops->write && !tty_is_writelocked(tty) &&
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index a8a292fd564f..ee0e84798399 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -59,7 +59,7 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
 	if (!tty->link)
 		return;
 	set_bit(TTY_OTHER_CLOSED, &tty->link->flags);
-	tty_flip_buffer_push(tty->link->port);
+	wake_up_interruptible(&tty->link->read_wait);
 	wake_up_interruptible(&tty->link->write_wait);
 	if (tty->driver->subtype == PTY_TYPE_MASTER) {
 		set_bit(TTY_OTHER_CLOSED, &tty->flags);
@@ -247,9 +247,7 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
 		goto out;
 
 	clear_bit(TTY_IO_ERROR, &tty->flags);
-	/* TTY_OTHER_CLOSED must be cleared before TTY_OTHER_DONE */
 	clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
-	clear_bit(TTY_OTHER_DONE, &tty->link->flags);
 	set_bit(TTY_THROTTLED, &tty->flags);
 	return 0;
 
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index a946e49a2626..aa80dc94ddc2 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -37,29 +37,6 @@
 
 #define TTY_BUFFER_PAGE	(((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~0xFF)
 
-/*
- * If all tty flip buffers have been processed by flush_to_ldisc() or
- * dropped by tty_buffer_flush(), check if the linked pty has been closed.
- * If so, wake the reader/poll to process
- */
-static inline void check_other_closed(struct tty_struct *tty)
-{
-	unsigned long flags, old;
-
-	/* transition from TTY_OTHER_CLOSED => TTY_OTHER_DONE must be atomic */
-	for (flags = ACCESS_ONCE(tty->flags);
-	     test_bit(TTY_OTHER_CLOSED, &flags);
-	     ) {
-		old = flags;
-		__set_bit(TTY_OTHER_DONE, &flags);
-		flags = cmpxchg(&tty->flags, old, flags);
-		if (old == flags) {
-			wake_up_interruptible(&tty->read_wait);
-			break;
-		}
-	}
-}
-
 /**
  *	tty_buffer_lock_exclusive	-	gain exclusive access to buffer
  *	tty_buffer_unlock_exclusive	-	release exclusive access
@@ -254,8 +231,6 @@ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld)
 	if (ld && ld->ops->flush_buffer)
 		ld->ops->flush_buffer(tty);
 
-	check_other_closed(tty);
-
 	atomic_dec(&buf->priority);
 	mutex_unlock(&buf->lock);
 }
@@ -522,10 +497,8 @@ static void flush_to_ldisc(struct work_struct *work)
 		 */
 		count = smp_load_acquire(&head->commit) - head->read;
 		if (!count) {
-			if (next == NULL) {
-				check_other_closed(tty);
+			if (next == NULL)
 				break;
-			}
 			buf->head = next;
 			tty_buffer_free(port, head);
 			continue;
@@ -614,3 +587,8 @@ bool tty_buffer_cancel_work(struct tty_port *port)
 {
 	return cancel_work_sync(&port->buf.work);
 }
+
+void tty_buffer_flush_work(struct tty_port *port)
+{
+	flush_work(&port->buf.work);
+}
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bf1bcdb01df0..d82bb9281d12 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -351,7 +351,6 @@ struct tty_file_private {
 #define TTY_OTHER_CLOSED 	2	/* Other side (if any) has closed */
 #define TTY_EXCLUSIVE 		3	/* Exclusive open mode */
 #define TTY_DO_WRITE_WAKEUP 	5	/* Call write_wakeup after queuing new */
-#define TTY_OTHER_DONE		6	/* Closed pty has completed input processing */
 #define TTY_LDISC_OPEN	 	11	/* Line discipline is open */
 #define TTY_PTY_LOCK 		16	/* pty private */
 #define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
@@ -480,6 +479,7 @@ extern void tty_buffer_init(struct tty_port *port);
 extern void tty_buffer_set_lock_subclass(struct tty_port *port);
 extern bool tty_buffer_restart_work(struct tty_port *port);
 extern bool tty_buffer_cancel_work(struct tty_port *port);
+extern void tty_buffer_flush_work(struct tty_port *port);
 extern speed_t tty_termios_baud_rate(struct ktermios *termios);
 extern speed_t tty_termios_input_baud_rate(struct ktermios *termios);
 extern void tty_termios_encode_baud_rate(struct ktermios *termios,
-- 
cgit v1.2.3


From 795ddd18d38f9762fbfefceab9aa16caef0cf431 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Sun, 24 Apr 2016 20:28:05 +0100
Subject: nvmem: core: remove regmap dependency

nvmem uses regmap_raw_read/write apis to read/write data from providers,
regmap raw apis stopped working with recent kernels which removed raw
accessors on mmio bus. This resulted in broken nvmem for providers
which are based on regmap mmio bus. This issue can be fixed temporarly
by moving to other regmap apis, but we might hit same issue in future.
Moving to interfaces based on read/write callbacks from providers would
be more robust.

This patch removes regmap dependency from nvmem and introduces
read/write callbacks from the providers.

Without this patch nvmem providers like qfprom based on regmap mmio
bus would not work.

Reported-by: Rajendra Nayak <rjendra@qti.qualcomm.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/Kconfig          |  1 -
 drivers/nvmem/core.c           | 67 +++++++++++++++++++++++++-----------------
 include/linux/nvmem-provider.h | 10 +++++++
 3 files changed, 50 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig
index 9c0c59d3b22b..15c58a5ff7ec 100644
--- a/drivers/nvmem/Kconfig
+++ b/drivers/nvmem/Kconfig
@@ -1,6 +1,5 @@
 menuconfig NVMEM
 	tristate "NVMEM Support"
-	select REGMAP
 	help
 	  Support for NVMEM(Non Volatile Memory) devices like EEPROM, EFUSES...
 
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 0de3d878c439..bb4ea123547f 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -23,12 +23,10 @@
 #include <linux/nvmem-consumer.h>
 #include <linux/nvmem-provider.h>
 #include <linux/of.h>
-#include <linux/regmap.h>
 #include <linux/slab.h>
 
 struct nvmem_device {
 	const char		*name;
-	struct regmap		*regmap;
 	struct module		*owner;
 	struct device		dev;
 	int			stride;
@@ -41,6 +39,9 @@ struct nvmem_device {
 	int			flags;
 	struct bin_attribute	eeprom;
 	struct device		*base_dev;
+	nvmem_reg_read_t	reg_read;
+	nvmem_reg_write_t	reg_write;
+	void *priv;
 };
 
 #define FLAG_COMPAT		BIT(0)
@@ -66,6 +67,23 @@ static struct lock_class_key eeprom_lock_key;
 #endif
 
 #define to_nvmem_device(d) container_of(d, struct nvmem_device, dev)
+static int nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset,
+			  void *val, size_t bytes)
+{
+	if (nvmem->reg_read)
+		return nvmem->reg_read(nvmem->priv, offset, val, bytes);
+
+	return -EINVAL;
+}
+
+static int nvmem_reg_write(struct nvmem_device *nvmem, unsigned int offset,
+			   void *val, size_t bytes)
+{
+	if (nvmem->reg_write)
+		return nvmem->reg_write(nvmem->priv, offset, val, bytes);
+
+	return -EINVAL;
+}
 
 static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
 				    struct bin_attribute *attr,
@@ -93,7 +111,7 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
 
 	count = round_down(count, nvmem->word_size);
 
-	rc = regmap_raw_read(nvmem->regmap, pos, buf, count);
+	rc = nvmem_reg_read(nvmem, pos, buf, count);
 
 	if (IS_ERR_VALUE(rc))
 		return rc;
@@ -127,7 +145,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj,
 
 	count = round_down(count, nvmem->word_size);
 
-	rc = regmap_raw_write(nvmem->regmap, pos, buf, count);
+	rc = nvmem_reg_write(nvmem, pos, buf, count);
 
 	if (IS_ERR_VALUE(rc))
 		return rc;
@@ -421,18 +439,11 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 {
 	struct nvmem_device *nvmem;
 	struct device_node *np;
-	struct regmap *rm;
 	int rval;
 
 	if (!config->dev)
 		return ERR_PTR(-EINVAL);
 
-	rm = dev_get_regmap(config->dev, NULL);
-	if (!rm) {
-		dev_err(config->dev, "Regmap not found\n");
-		return ERR_PTR(-EINVAL);
-	}
-
 	nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL);
 	if (!nvmem)
 		return ERR_PTR(-ENOMEM);
@@ -444,14 +455,16 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	}
 
 	nvmem->id = rval;
-	nvmem->regmap = rm;
 	nvmem->owner = config->owner;
-	nvmem->stride = regmap_get_reg_stride(rm);
-	nvmem->word_size = regmap_get_val_bytes(rm);
-	nvmem->size = regmap_get_max_register(rm) + nvmem->stride;
+	nvmem->stride = config->stride;
+	nvmem->word_size = config->word_size;
+	nvmem->size = config->size;
 	nvmem->dev.type = &nvmem_provider_type;
 	nvmem->dev.bus = &nvmem_bus_type;
 	nvmem->dev.parent = config->dev;
+	nvmem->priv = config->priv;
+	nvmem->reg_read = config->reg_read;
+	nvmem->reg_write = config->reg_write;
 	np = config->dev->of_node;
 	nvmem->dev.of_node = np;
 	dev_set_name(&nvmem->dev, "%s%d",
@@ -948,7 +961,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 {
 	int rc;
 
-	rc = regmap_raw_read(nvmem->regmap, cell->offset, buf, cell->bytes);
+	rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes);
 
 	if (IS_ERR_VALUE(rc))
 		return rc;
@@ -977,7 +990,7 @@ void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
 	u8 *buf;
 	int rc;
 
-	if (!nvmem || !nvmem->regmap)
+	if (!nvmem)
 		return ERR_PTR(-EINVAL);
 
 	buf = kzalloc(cell->bytes, GFP_KERNEL);
@@ -1014,7 +1027,7 @@ static inline void *nvmem_cell_prepare_write_buffer(struct nvmem_cell *cell,
 		*b <<= bit_offset;
 
 		/* setup the first byte with lsb bits from nvmem */
-		rc = regmap_raw_read(nvmem->regmap, cell->offset, &v, 1);
+		rc = nvmem_reg_read(nvmem, cell->offset, &v, 1);
 		*b++ |= GENMASK(bit_offset - 1, 0) & v;
 
 		/* setup rest of the byte if any */
@@ -1031,7 +1044,7 @@ static inline void *nvmem_cell_prepare_write_buffer(struct nvmem_cell *cell,
 	/* if it's not end on byte boundary */
 	if ((nbits + bit_offset) % BITS_PER_BYTE) {
 		/* setup the last byte with msb bits from nvmem */
-		rc = regmap_raw_read(nvmem->regmap,
+		rc = nvmem_reg_read(nvmem,
 				    cell->offset + cell->bytes - 1, &v, 1);
 		*p |= GENMASK(7, (nbits + bit_offset) % BITS_PER_BYTE) & v;
 
@@ -1054,7 +1067,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
 	struct nvmem_device *nvmem = cell->nvmem;
 	int rc;
 
-	if (!nvmem || !nvmem->regmap || nvmem->read_only ||
+	if (!nvmem || nvmem->read_only ||
 	    (cell->bit_offset == 0 && len != cell->bytes))
 		return -EINVAL;
 
@@ -1064,7 +1077,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
 			return PTR_ERR(buf);
 	}
 
-	rc = regmap_raw_write(nvmem->regmap, cell->offset, buf, cell->bytes);
+	rc = nvmem_reg_write(nvmem, cell->offset, buf, cell->bytes);
 
 	/* free the tmp buffer */
 	if (cell->bit_offset || cell->nbits)
@@ -1094,7 +1107,7 @@ ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
 	int rc;
 	ssize_t len;
 
-	if (!nvmem || !nvmem->regmap)
+	if (!nvmem)
 		return -EINVAL;
 
 	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
@@ -1124,7 +1137,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem,
 	struct nvmem_cell cell;
 	int rc;
 
-	if (!nvmem || !nvmem->regmap)
+	if (!nvmem)
 		return -EINVAL;
 
 	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
@@ -1152,10 +1165,10 @@ int nvmem_device_read(struct nvmem_device *nvmem,
 {
 	int rc;
 
-	if (!nvmem || !nvmem->regmap)
+	if (!nvmem)
 		return -EINVAL;
 
-	rc = regmap_raw_read(nvmem->regmap, offset, buf, bytes);
+	rc = nvmem_reg_read(nvmem, offset, buf, bytes);
 
 	if (IS_ERR_VALUE(rc))
 		return rc;
@@ -1180,10 +1193,10 @@ int nvmem_device_write(struct nvmem_device *nvmem,
 {
 	int rc;
 
-	if (!nvmem || !nvmem->regmap)
+	if (!nvmem)
 		return -EINVAL;
 
-	rc = regmap_raw_write(nvmem->regmap, offset, buf, bytes);
+	rc = nvmem_reg_write(nvmem, offset, buf, bytes);
 
 	if (IS_ERR_VALUE(rc))
 		return rc;
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index a4fcc90b0f20..cd93416d762e 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -14,6 +14,10 @@
 
 struct nvmem_device;
 struct nvmem_cell_info;
+typedef int (*nvmem_reg_read_t)(void *priv, unsigned int offset,
+				void *val, size_t bytes);
+typedef int (*nvmem_reg_write_t)(void *priv, unsigned int offset,
+				 void *val, size_t bytes);
 
 struct nvmem_config {
 	struct device		*dev;
@@ -24,6 +28,12 @@ struct nvmem_config {
 	int			ncells;
 	bool			read_only;
 	bool			root_only;
+	nvmem_reg_read_t	reg_read;
+	nvmem_reg_write_t	reg_write;
+	int	size;
+	int	word_size;
+	int	stride;
+	void	*priv;
 	/* To be only used by old driver/misc/eeprom drivers */
 	bool			compat;
 	struct device		*base_dev;
-- 
cgit v1.2.3


From 1f62ff34a90471d1b735bac2c79e894afc7c59bc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 24 Mar 2016 22:19:40 +0100
Subject: driver-core: use 'dev' argument in dev_dbg_ratelimited stub

dev_dbg_ratelimited() is a macro that ignores its first argument when DEBUG is
not set, which can lead to unused variable warnings:

ethernet/mellanox/mlxsw/pci.c: In function 'mlxsw_pci_cqe_sdq_handle':
ethernet/mellanox/mlxsw/pci.c:646:18: warning: unused variable 'pdev' [-Wunused-variable]
ethernet/mellanox/mlxsw/pci.c: In function 'mlxsw_pci_cqe_rdq_handle':
ethernet/mellanox/mlxsw/pci.c:671:18: warning: unused variable 'pdev' [-Wunused-variable]

The macro already ensures that all its other arguments are silently
ignored by the compiler without triggering a warning, through the
use of the no_printk() macro, but the dev argument is not passed into
that.

This changes the definition to use the same trick as no_printk() with
an if(0) that leads the compiler to not evaluate the side-effects but
still see that 'dev' might not be unused.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Fixes: 6f586e663e3b ("driver-core: Shut up dev_dbg_reatelimited() without DEBUG")
Reviewed-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 002c59728dbe..07f74c246cac 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1293,8 +1293,11 @@ do {									\
 		dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);	\
 } while (0)
 #else
-#define dev_dbg_ratelimited(dev, fmt, ...)			\
-	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#define dev_dbg_ratelimited(dev, fmt, ...)				\
+do {									\
+	if (0)								\
+		dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);	\
+} while (0)
 #endif
 
 #ifdef VERBOSE_DEBUG
-- 
cgit v1.2.3


From 1af5bb491fbb41c8dab9d728a92758dd6a28afd4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:51:56 -0700
Subject: filemap: remove the pos argument to generic_file_direct_write

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c    | 9 ++++-----
 fs/fuse/file.c     | 2 +-
 include/linux/fs.h | 2 +-
 mm/filemap.c       | 5 +++--
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8d7b5a45c005..6c376311a9d7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1703,18 +1703,17 @@ again:
 	return num_written ? num_written : ret;
 }
 
-static ssize_t __btrfs_direct_write(struct kiocb *iocb,
-				    struct iov_iter *from,
-				    loff_t pos)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
+	loff_t pos = iocb->ki_pos;
 	ssize_t written;
 	ssize_t written_buffered;
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, from, pos);
+	written = generic_file_direct_write(iocb, from);
 
 	if (written < 0 || !iov_iter_count(from))
 		return written;
@@ -1832,7 +1831,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
-		num_written = __btrfs_direct_write(iocb, from, pos);
+		num_written = __btrfs_direct_write(iocb, from);
 	} else {
 		num_written = __btrfs_buffered_write(file, from, pos);
 		if (num_written > 0)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 719924d6c706..7e8c4603d43a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1186,7 +1186,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos = iocb->ki_pos;
-		written = generic_file_direct_write(iocb, from, pos);
+		written = generic_file_direct_write(iocb, from);
 		if (written < 0 || !iov_iter_count(from))
 			goto out;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..e9eaa2074061 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2703,7 +2703,7 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
-extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t);
+extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 5885925cdb5b..e7108c31346d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2499,11 +2499,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file	*file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode	*inode = mapping->host;
+	loff_t		pos = iocb->ki_pos;
 	ssize_t		written;
 	size_t		write_len;
 	pgoff_t		end;
@@ -2717,7 +2718,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos, endbyte;
 
-		written = generic_file_direct_write(iocb, from, iocb->ki_pos);
+		written = generic_file_direct_write(iocb, from);
 		/*
 		 * If the write stopped short of completing, fall back to
 		 * buffered writes.  Some filesystems do this for writes to
-- 
cgit v1.2.3


From c8b8e32d700fe943a935e435ae251364d016c497 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:51:58 -0700
Subject: direct-io: eliminate the offset argument to ->direct_IO

Including blkdev_direct_IO and dax_do_io.  It has to be ki_pos to actually
work, so eliminate the superflous argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking          |  2 +-
 Documentation/filesystems/vfs.txt          |  2 +-
 drivers/staging/lustre/lustre/llite/rw26.c |  4 ++--
 fs/9p/vfs_addr.c                           |  3 ++-
 fs/affs/file.c                             |  5 +++--
 fs/block_dev.c                             |  6 +++---
 fs/btrfs/inode.c                           |  6 +++---
 fs/ceph/addr.c                             |  3 +--
 fs/cifs/file.c                             |  2 +-
 fs/dax.c                                   |  4 ++--
 fs/direct-io.c                             |  7 ++++---
 fs/exofs/inode.c                           |  3 +--
 fs/ext2/inode.c                            |  8 ++++----
 fs/ext4/ext4.h                             |  3 +--
 fs/ext4/indirect.c                         | 12 ++++++------
 fs/ext4/inode.c                            | 18 +++++++++---------
 fs/f2fs/data.c                             |  6 +++---
 fs/fat/inode.c                             |  6 +++---
 fs/fuse/file.c                             |  3 ++-
 fs/gfs2/aops.c                             |  6 +++---
 fs/hfs/inode.c                             |  7 +++----
 fs/hfsplus/inode.c                         |  7 +++----
 fs/jfs/inode.c                             |  7 +++----
 fs/nfs/direct.c                            | 17 +++++++----------
 fs/nfs/file.c                              |  2 +-
 fs/nilfs2/inode.c                          |  4 ++--
 fs/ocfs2/aops.c                            |  9 ++++-----
 fs/reiserfs/inode.c                        |  7 +++----
 fs/udf/file.c                              |  3 +--
 fs/udf/inode.c                             |  7 +++----
 fs/xfs/xfs_aops.c                          |  7 +++----
 fs/xfs/xfs_file.c                          |  2 +-
 include/linux/dax.h                        |  2 +-
 include/linux/fs.h                         |  9 ++++-----
 include/linux/nfs_fs.h                     |  5 ++---
 mm/filemap.c                               |  5 ++---
 mm/page_io.c                               |  2 +-
 37 files changed, 99 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 619af9bfdcb3..75eea7ce3d7c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -194,7 +194,7 @@ prototypes:
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
-	int (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4164bd6397a2..c61a223ef3ff 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -591,7 +591,7 @@ struct address_space_operations {
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
-	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
index 69aa15e8e3ef..0c3459c1a518 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -358,14 +358,14 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
  */
 #define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) *	  \
 		       PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1))
-static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t file_offset)
+static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct lu_env *env;
 	struct cl_io *io;
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ccc_object *obj = cl_inode2ccc(inode);
+	loff_t file_offset = iocb->ki_pos;
 	ssize_t count = iov_iter_count(iter);
 	ssize_t tot_bytes = 0, result = 0;
 	struct ll_inode_info *lli = ll_i2info(inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index ac9225e86bf3..c37fb9c08970 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -245,9 +245,10 @@ static int v9fs_launder_page(struct page *page)
  *
  */
 static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
+	loff_t pos = iocb->ki_pos;
 	ssize_t n;
 	int err = 0;
 	if (iov_iter_rw(iter) == WRITE) {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0cde550050e8..0deec9cc2362 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -389,12 +389,13 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 }
 
 static ssize_t
-affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (iov_iter_rw(iter) == WRITE) {
@@ -404,7 +405,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 			return 0;
 	}
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, affs_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		affs_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..9e1f3fe25753 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -162,15 +162,15 @@ static struct inode *bdev_file_inode(struct file *file)
 }
 
 static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
 
 	if (IS_DAX(inode))
-		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+		return dax_do_io(iocb, inode, iter, blkdev_get_block,
 				NULL, DIO_SKIP_DIO_COUNT);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
 				    blkdev_get_block, NULL, NULL,
 				    DIO_SKIP_DIO_COUNT);
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2aaba58b4856..352d4e1dc985 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8541,13 +8541,13 @@ out:
 	return retval;
 }
 
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_data dio_data = { 0 };
+	loff_t offset = iocb->ki_pos;
 	size_t count = 0;
 	int flags = 0;
 	bool wakeup = true;
@@ -8607,7 +8607,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	ret = __blockdev_direct_IO(iocb, inode,
 				   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-				   iter, offset, btrfs_get_blocks_direct, NULL,
+				   iter, btrfs_get_blocks_direct, NULL,
 				   btrfs_submit_direct, flags);
 	if (iov_iter_rw(iter) == WRITE) {
 		current->journal_info = NULL;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4801571f51cb..43098cd9602b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1292,8 +1292,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
  * intercept O_DIRECT reads and writes early, this function should
  * never get called.
  */
-static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t pos)
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 {
 	WARN_ON(1);
 	return -EINVAL;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c03d0744648b..cb070aa88e57 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3854,7 +3854,7 @@ void cifs_oplock_break(struct work_struct *work)
  * Direct IO is not yet supported in the cached mode. 
  */
 static ssize_t
-cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 {
         /*
          * FIXME
diff --git a/fs/dax.c b/fs/dax.c
index 75ba46d82a76..0dbe4e0f16fe 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -244,7 +244,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * @iocb: The control block for this I/O
  * @inode: The file which the I/O is directed at
  * @iter: The addresses to do I/O from or to
- * @pos: The file offset where the I/O starts
  * @get_block: The filesystem method used to translate file offsets to blocks
  * @end_io: A filesystem callback for I/O completion
  * @flags: See below
@@ -257,11 +256,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * is in progress.
  */
 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
+		  struct iov_iter *iter, get_block_t get_block,
 		  dio_iodone_t end_io, int flags)
 {
 	struct buffer_head bh;
 	ssize_t retval = -EINVAL;
+	loff_t pos = iocb->ki_pos;
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 472037732daf..8949d3e35756 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1113,7 +1113,7 @@ static inline int drop_refcount(struct dio *dio)
 static inline ssize_t
 do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		      struct block_device *bdev, struct iov_iter *iter,
-		      loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+		      get_block_t get_block, dio_iodone_t end_io,
 		      dio_submit_t submit_io, int flags)
 {
 	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
@@ -1121,6 +1121,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	loff_t end = offset + count;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
@@ -1328,7 +1329,7 @@ out:
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
-			     loff_t offset, get_block_t get_block,
+			     get_block_t get_block,
 			     dio_iodone_t end_io, dio_submit_t submit_io,
 			     int flags)
 {
@@ -1344,7 +1345,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
-	return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block,
+	return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
 				     end_io, submit_io, flags);
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 49e1bd00b4ec..9dc4c6dbf3c9 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -960,8 +960,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
 
 
  /* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	return 0;
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6bd58e6ff038..b675610391b8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -854,20 +854,20 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 }
 
 static ssize_t
-ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
 				DIO_LOCKING);
 	else
-		ret = blockdev_direct_IO(iocb, inode, iter, offset,
-					 ext2_get_block);
+		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 349afebe21ee..72f4c9e00e97 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2581,8 +2581,7 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 				struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset);
+extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de5..627b7e8f9ef3 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -659,12 +659,12 @@ out:
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			   loff_t offset)
+ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	loff_t offset = iocb->ki_pos;
 	handle_t *handle;
 	ssize_t ret;
 	int orphan = 0;
@@ -707,21 +707,21 @@ retry:
 			goto locked;
 		}
 		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
+			ret = dax_do_io(iocb, inode, iter,
 					ext4_dio_get_block, NULL, 0);
 		else
 			ret = __blockdev_direct_IO(iocb, inode,
 						   inode->i_sb->s_bdev, iter,
-						   offset, ext4_dio_get_block,
+						   ext4_dio_get_block,
 						   NULL, NULL, 0);
 		inode_dio_end(inode);
 	} else {
 locked:
 		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
+			ret = dax_do_io(iocb, inode, iter,
 					ext4_dio_get_block, NULL, DIO_LOCKING);
 		else
-			ret = blockdev_direct_IO(iocb, inode, iter, offset,
+			ret = blockdev_direct_IO(iocb, inode, iter,
 						 ext4_dio_get_block);
 
 		if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 981a1fc30eaa..79b298d397b4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3334,12 +3334,12 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * if the machine crashes during the write.
  *
  */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset)
+static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
+	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(iter);
 	int overwrite = 0;
 	get_block_t *get_block_func = NULL;
@@ -3348,7 +3348,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-		return ext4_ind_direct_IO(iocb, iter, offset);
+		return ext4_ind_direct_IO(iocb, iter);
 
 	BUG_ON(iocb->private == NULL);
 
@@ -3400,11 +3400,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
 	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
+		ret = dax_do_io(iocb, inode, iter, get_block_func,
 				ext4_end_io_dio, dio_flags);
 	else
 		ret = __blockdev_direct_IO(iocb, inode,
-					   inode->i_sb->s_bdev, iter, offset,
+					   inode->i_sb->s_bdev, iter,
 					   get_block_func,
 					   ext4_end_io_dio, NULL, dio_flags);
 
@@ -3431,12 +3431,12 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3456,9 +3456,9 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ret = ext4_ext_direct_IO(iocb, iter, offset);
+		ret = ext4_ext_direct_IO(iocb, iter);
 	else
-		ret = ext4_ind_direct_IO(iocb, iter, offset);
+		ret = ext4_ind_direct_IO(iocb, iter);
 	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
 	return ret;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 53fec0872e60..a4c5da5bfe1e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1655,12 +1655,12 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
 	return 0;
 }
 
-static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	int err;
 
 	err = check_direct_IO(inode, iter, offset);
@@ -1672,7 +1672,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 
-	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+	err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
 	if (err < 0 && iov_iter_rw(iter) == WRITE)
 		f2fs_write_failed(mapping, offset + count);
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 226281068a46..3bcf57925dca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -244,13 +244,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
 	return err;
 }
 
-static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (iov_iter_rw(iter) == WRITE) {
@@ -272,7 +272,7 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		fat_write_failed(mapping, offset + count);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e8c4603d43a..02279073bf64 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2837,7 +2837,7 @@ static inline loff_t fuse_round_up(loff_t off)
 }
 
 static ssize_t
-fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	ssize_t ret = 0;
@@ -2848,6 +2848,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 	struct inode *inode;
 	loff_t i_size;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	struct fuse_io_priv *io;
 	bool is_sync = is_sync_kiocb(iocb);
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1bbbee945f46..8524c0e322fc 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1042,13 +1042,13 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
 
 
-static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct address_space *mapping = inode->i_mapping;
 	struct gfs2_inode *ip = GFS2_I(inode);
+	loff_t offset = iocb->ki_pos;
 	struct gfs2_holder gh;
 	int rv;
 
@@ -1099,7 +1099,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				  offset, gfs2_get_block_direct, NULL, NULL, 0);
+				  gfs2_get_block_direct, NULL, NULL, 0);
 out:
 	gfs2_glock_dq(&gh);
 	gfs2_holder_uninit(&gh);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index cb1e5faa2fb7..c82331a9cf9b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -124,8 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -133,7 +132,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -141,7 +140,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			hfs_write_failed(mapping, end);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b28f39865c3a..2ad34a5eb5ad 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -122,8 +122,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				 loff_t offset)
+static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -131,7 +130,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -139,7 +138,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			hfsplus_write_failed(mapping, end);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9d9bae63ae2a..f6a2a78121b0 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -332,8 +332,7 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, jfs_get_block);
 }
 
-static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -341,7 +340,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -349,7 +348,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			jfs_write_failed(mapping, end);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c93826e4a8c6..346b5d85ce92 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -250,7 +250,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
  * shunt off direct read and write requests before the VFS gets them,
  * so this method is only ever called for swap.
  */
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 
@@ -261,7 +261,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (iov_iter_rw(iter) == READ)
-		return nfs_file_direct_read(iocb, iter, pos);
+		return nfs_file_direct_read(iocb, iter);
 	return nfs_file_direct_write(iocb, iter);
 }
 
@@ -545,7 +545,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers into which to read data
- * @pos: byte offset in file where reading starts
  *
  * We use this function for direct reads instead of calling
  * generic_file_aio_read() in order to avoid gfar's check to see if
@@ -561,8 +560,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
-				loff_t pos)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -574,7 +572,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
 	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
-		file, count, (long long) pos);
+		file, count, (long long) iocb->ki_pos);
 
 	result = 0;
 	if (!count)
@@ -594,7 +592,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 
 	dreq->inode = inode;
 	dreq->bytes_left = count;
-	dreq->io_start = pos;
+	dreq->io_start = iocb->ki_pos;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	l_ctx = nfs_get_lock_context(dreq->ctx);
 	if (IS_ERR(l_ctx)) {
@@ -606,14 +604,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 		dreq->iocb = iocb;
 
 	NFS_I(inode)->read_io += count;
-	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
+	result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
 	inode_unlock(inode);
 
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0)
-			iocb->ki_pos = pos + result;
+			iocb->ki_pos += result;
 	}
 
 	nfs_direct_req_release(dreq);
@@ -969,7 +967,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers from which to write data
- * @pos: byte offset in file where writing starts
  *
  * We use this function for direct writes instead of calling
  * generic_file_aio_write() in order to avoid taking the inode
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index be01095b97ae..717a8d6af52d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -164,7 +164,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_read(iocb, to, iocb->ki_pos);
+		return nfs_file_direct_read(iocb, to);
 
 	dprintk("NFS: read(%pD2, %zu@%lu)\n",
 		iocb->ki_filp,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 534631358b13..cfebcd2fc7f3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -305,7 +305,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 }
 
 static ssize_t
-nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 
@@ -313,7 +313,7 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 		return 0;
 
 	/* Needs synchronization with the cleaner */
-	return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
+	return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block);
 }
 
 const struct address_space_operations nilfs_aops = {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ad1577348a92..6c66c62d4a7e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2423,13 +2423,11 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
 	return 0;
 }
 
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file)->i_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	loff_t end = offset + iter->count;
 	get_block_t *get_block;
 
 	/*
@@ -2440,7 +2438,8 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		return 0;
 
 	/* Fallback to buffered I/O if we do not support append dio. */
-	if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+	if (iocb->ki_pos + iter->count > i_size_read(inode) &&
+	    !ocfs2_supports_append_dio(osb))
 		return 0;
 
 	if (iov_iter_rw(iter) == READ)
@@ -2449,7 +2448,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		get_block = ocfs2_dio_get_block;
 
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				    iter, offset, get_block,
+				    iter, get_block,
 				    ocfs2_dio_end_io, NULL, 0);
 }
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d5c2e9c865de..825455d3e4ba 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3279,15 +3279,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * We thank Mingming Cao for helping us understand in great detail what
  * to do in this section of the code.
  */
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset)
+static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset,
+	ret = blockdev_direct_IO(iocb, inode, iter,
 				 reiserfs_get_blocks_direct_io);
 
 	/*
@@ -3296,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
 			truncate_setsize(inode, isize);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 877ba1c9b461..7ab8d8196e90 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -99,8 +99,7 @@ static int udf_adinicb_write_begin(struct file *file,
 	return 0;
 }
 
-static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				     loff_t offset)
+static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	/* Fallback to buffered I/O. */
 	return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2dc461eeb415..f323aff740ef 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -214,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -223,9 +222,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block);
 	if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE))
-		udf_write_failed(mapping, offset + count);
+		udf_write_failed(mapping, iocb->ki_pos + count);
 	return ret;
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e49b2406d15d..c535887c60a8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1406,8 +1406,7 @@ xfs_end_io_direct_write(
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
-	struct iov_iter		*iter,
-	loff_t			offset)
+	struct iov_iter		*iter)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	dio_iodone_t		*endio = NULL;
@@ -1420,12 +1419,12 @@ xfs_vm_direct_IO(
 	}
 
 	if (IS_DAX(inode)) {
-		return dax_do_io(iocb, inode, iter, offset,
+		return dax_do_io(iocb, inode, iter,
 				 xfs_get_blocks_direct, endio, 0);
 	}
 
 	bdev = xfs_find_bdev_for_inode(inode);
-	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+	return  __blockdev_direct_IO(iocb, inode, bdev, iter,
 			xfs_get_blocks_direct, endio, NULL, flags);
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 92f72fb05497..5de047ab2411 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -794,7 +794,7 @@ xfs_file_dio_aio_write(
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
 
 	data = *from;
-	ret = mapping->a_ops->direct_IO(iocb, &data, iocb->ki_pos);
+	ret = mapping->a_ops->direct_IO(iocb, &data);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 636dd59ab505..982a6c4a62f3 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -5,7 +5,7 @@
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e9eaa2074061..e6b2de159736 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -394,7 +394,7 @@ struct address_space_operations {
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
-	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/*
 	 * migrate the contents of a page to the specified target. If
 	 * migrate_mode is MIGRATE_ASYNC, it must not block.
@@ -2766,18 +2766,17 @@ void dio_end_io(struct bio *bio, int error);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
-			     loff_t offset, get_block_t get_block,
+			     get_block_t get_block,
 			     dio_iodone_t end_io, dio_submit_t submit_io,
 			     int flags);
 
 static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 					 struct inode *inode,
-					 struct iov_iter *iter, loff_t offset,
+					 struct iov_iter *iter,
 					 get_block_t get_block)
 {
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				    offset, get_block, NULL, NULL,
-				    DIO_LOCKING | DIO_SKIP_HOLES);
+			get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
 }
 #endif
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 67300f8e5f2f..cede8f6a7e2d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -445,10 +445,9 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file)
 /*
  * linux/fs/nfs/direct.c
  */
-extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *, loff_t);
+extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *);
 extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-			struct iov_iter *iter,
-			loff_t pos);
+			struct iov_iter *iter);
 extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
 			struct iov_iter *iter);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index e7108c31346d..cb36db9f4107 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1853,8 +1853,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 					iocb->ki_pos + count - 1);
 		if (!retval) {
 			struct iov_iter data = *iter;
-			retval = mapping->a_ops->direct_IO(iocb, &data,
-					iocb->ki_pos);
+			retval = mapping->a_ops->direct_IO(iocb, &data);
 		}
 
 		if (retval > 0) {
@@ -2538,7 +2537,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	data = *from;
-	written = mapping->a_ops->direct_IO(iocb, &data, pos);
+	written = mapping->a_ops->direct_IO(iocb, &data);
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
diff --git a/mm/page_io.c b/mm/page_io.c
index cd92e3d67a32..89275601d399 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -279,7 +279,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 		set_page_writeback(page);
 		unlock_page(page);
-		ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
+		ret = mapping->a_ops->direct_IO(&kiocb, &from);
 		if (ret == PAGE_SIZE) {
 			count_vm_event(PSWPOUT);
 			ret = 0;
-- 
cgit v1.2.3


From dde0c2e79848298cc25621ad080d47f94dbd7cce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:52:00 -0700
Subject: fs: add IOCB_SYNC and IOCB_DSYNC

This will allow us to do per-I/O sync file writes, as required by a lot
of fileservers or storage targets.

XXX: Will need a few additional audits for O_DSYNC

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     |  2 +-
 fs/btrfs/file.c    |  2 +-
 fs/cifs/file.c     |  2 +-
 fs/direct-io.c     |  2 +-
 fs/ext4/file.c     |  2 +-
 fs/f2fs/file.c     |  2 +-
 fs/gfs2/file.c     |  5 ++++-
 fs/nfs/direct.c    |  2 +-
 fs/ntfs/file.c     |  2 +-
 fs/udf/file.c      |  2 +-
 fs/xfs/xfs_file.c  |  2 +-
 include/linux/fs.h | 14 ++++++++++----
 mm/filemap.c       |  2 +-
 13 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9e1f3fe25753..d8dc3512e927 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1662,7 +1662,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = __generic_file_write_iter(iocb, from);
 	if (ret > 0) {
 		ssize_t err;
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6c376311a9d7..35ce146cceec 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1852,7 +1852,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	spin_unlock(&BTRFS_I(inode)->lock);
 	if (num_written > 0) {
-		err = generic_write_sync(file, pos, num_written);
+		err = generic_write_sync(iocb, pos, num_written);
 		if (err < 0)
 			num_written = err;
 	}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cb070aa88e57..b22b68ccfbe5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2688,7 +2688,7 @@ out:
 	inode_unlock(inode);
 
 	if (rc > 0) {
-		ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+		ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc);
 		if (err < 0)
 			rc = err;
 	}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c61314b84b01..f7bcc0193dee 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -268,7 +268,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		if (dio->rw & WRITE) {
 			int err;
 
-			err = generic_write_sync(dio->iocb->ki_filp, offset,
+			err = generic_write_sync(dio->iocb, offset,
 						 transferred);
 			if (err < 0 && ret > 0)
 				ret = err;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index fa2208bae2e1..1417e129be51 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -172,7 +172,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 443e07705c2a..51ed8388e66c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1885,7 +1885,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 208efc70ad49..5a7d69609309 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -895,7 +895,10 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
 		mark_inode_dirty(inode);
 	}
 
-	return generic_write_sync(file, pos, count);
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+		return vfs_fsync_range(file, pos, pos + count - 1,
+			       (file->f_flags & __O_SYNC) ? 0 : 1);
+	return 0;
 
 out_trans_fail:
 	gfs2_inplace_release(ip);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 346b5d85ce92..be86de9a77d7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1054,7 +1054,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 			if (i_size_read(inode) < iocb->ki_pos)
 				i_size_write(inode, iocb->ki_pos);
 			spin_unlock(&inode->i_lock);
-			generic_write_sync(file, pos, result);
+			generic_write_sync(iocb, pos, result);
 		}
 	}
 	nfs_direct_req_release(dreq);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 91117ada8528..10dc38cc02bb 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1953,7 +1953,7 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	current->backing_dev_info = NULL;
 	inode_unlock(vi);
 	if (likely(written > 0)) {
-		err = generic_write_sync(file, iocb->ki_pos, written);
+		err = generic_write_sync(iocb, iocb->ki_pos, written);
 		if (err < 0)
 			written = 0;
 	}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7ab8d8196e90..8e3d1ae53b11 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -152,7 +152,7 @@ out:
 
 	if (retval > 0) {
 		mark_inode_dirty(inode);
-		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
+		err = generic_write_sync(iocb, iocb->ki_pos - retval, retval);
 		if (err < 0)
 			retval = err;
 	}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5de047ab2411..b5d70e77195d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -908,7 +908,7 @@ xfs_file_write_iter(
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e6b2de159736..310ca1ed9293 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -323,6 +323,8 @@ struct writeback_control;
 #define IOCB_APPEND		(1 << 1)
 #define IOCB_DIRECT		(1 << 2)
 #define IOCB_HIPRI		(1 << 3)
+#define IOCB_DSYNC		(1 << 4)
+#define IOCB_SYNC		(1 << 5)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2485,12 +2487,12 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
-static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count)
 {
-	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
+	if (!(iocb->ki_flags & IOCB_DSYNC))
 		return 0;
-	return vfs_fsync_range(file, pos, pos + count - 1,
-			       (file->f_flags & __O_SYNC) ? 0 : 1);
+	return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1,
+			       (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
 }
 extern void emergency_sync(void);
 extern void emergency_remount(void);
@@ -2942,6 +2944,10 @@ static inline int iocb_flags(struct file *file)
 		res |= IOCB_APPEND;
 	if (io_is_direct(file))
 		res |= IOCB_DIRECT;
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+		res |= IOCB_DSYNC;
+	if (file->f_flags & __O_SYNC)
+		res |= IOCB_SYNC;
 	return res;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index cb36db9f4107..8345d6d3436a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2794,7 +2794,7 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
-- 
cgit v1.2.3


From e259221763a40403d5bb232209998e8c45804ab8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:52:01 -0700
Subject: fs: simplify the generic_write_sync prototype

The kiocb already has the new position, so use that.  The only interesting
case is AIO, where we currently don't bother updating ki_pos.  We're about
to free the kiocb after we're done, so we might as well update it to make
everyone's life simpler.

While we're at it also return the bytes written argument passed in if
we were successful so that the boilerplate error switch code in the
callers can go away.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     |  8 ++------
 fs/btrfs/file.c    |  7 ++-----
 fs/cifs/file.c     |  7 ++-----
 fs/direct-io.c     | 17 +++++++++--------
 fs/ext4/file.c     |  9 ++-------
 fs/f2fs/file.c     |  9 ++-------
 fs/nfs/direct.c    |  4 +++-
 fs/ntfs/file.c     |  7 ++-----
 fs/udf/file.c      |  4 +---
 fs/xfs/xfs_file.c  |  6 +-----
 include/linux/fs.h | 24 ++++++++++++++++++------
 mm/filemap.c       |  9 ++-------
 12 files changed, 46 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d8dc3512e927..a063d4d8ac39 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1660,12 +1660,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	blk_start_plug(&plug);
 	ret = __generic_file_write_iter(iocb, from);
-	if (ret > 0) {
-		ssize_t err;
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	blk_finish_plug(&plug);
 	return ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 35ce146cceec..ea9f10bb089c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1851,11 +1851,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	spin_unlock(&BTRFS_I(inode)->lock);
-	if (num_written > 0) {
-		err = generic_write_sync(iocb, pos, num_written);
-		if (err < 0)
-			num_written = err;
-	}
+	if (num_written > 0)
+		num_written = generic_write_sync(iocb, num_written);
 
 	if (sync)
 		atomic_dec(&BTRFS_I(inode)->sync_writers);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b22b68ccfbe5..9b51d4936a29 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2687,11 +2687,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 out:
 	inode_unlock(inode);
 
-	if (rc > 0) {
-		ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc);
-		if (err < 0)
-			rc = err;
-	}
+	if (rc > 0)
+		rc = generic_write_sync(iocb, rc);
 	up_read(&cinode->lock_sem);
 	return rc;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f7bcc0193dee..3bf3f20f8ecc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -256,6 +256,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (dio->end_io) {
 		int err;
 
+		// XXX: ki_pos??
 		err = dio->end_io(dio->iocb, offset, ret, dio->private);
 		if (err)
 			ret = err;
@@ -265,15 +266,15 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		inode_dio_end(dio->inode);
 
 	if (is_async) {
-		if (dio->rw & WRITE) {
-			int err;
-
-			err = generic_write_sync(dio->iocb, offset,
-						 transferred);
-			if (err < 0 && ret > 0)
-				ret = err;
-		}
+		/*
+		 * generic_write_sync expects ki_pos to have been updated
+		 * already, but the submission path only does this for
+		 * synchronous I/O.
+		 */
+		dio->iocb->ki_pos += transferred;
 
+		if (dio->rw & WRITE)
+			ret = generic_write_sync(dio->iocb,  transferred);
 		dio->iocb->ki_complete(dio->iocb, ret, 0);
 	}
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1417e129be51..00ff6912adb3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -169,13 +169,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	if (o_direct)
 		blk_finish_plug(&plug);
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 51ed8388e66c..28f75a1fe4a7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1882,13 +1882,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	return ret;
 }
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index be86de9a77d7..0b9fca040b0c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1054,7 +1054,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 			if (i_size_read(inode) < iocb->ki_pos)
 				i_size_write(inode, iocb->ki_pos);
 			spin_unlock(&inode->i_lock);
-			generic_write_sync(iocb, pos, result);
+
+			/* XXX: should check the generic_write_sync retval */
+			generic_write_sync(iocb, result);
 		}
 	}
 	nfs_direct_req_release(dreq);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 10dc38cc02bb..5622ed5a201e 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1952,12 +1952,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = ntfs_perform_write(file, from, iocb->ki_pos);
 	current->backing_dev_info = NULL;
 	inode_unlock(vi);
-	if (likely(written > 0)) {
-		err = generic_write_sync(iocb, iocb->ki_pos, written);
-		if (err < 0)
-			written = 0;
-	}
 	iocb->ki_pos += written;
+	if (likely(written > 0))
+		written = generic_write_sync(iocb, written);
 	return written ? written : err;
 }
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 8e3d1ae53b11..632570617327 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -152,9 +152,7 @@ out:
 
 	if (retval > 0) {
 		mark_inode_dirty(inode);
-		err = generic_write_sync(iocb, iocb->ki_pos - retval, retval);
-		if (err < 0)
-			retval = err;
+		retval = generic_write_sync(iocb, retval);
 	}
 
 	return retval;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b5d70e77195d..cd3540997d65 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -903,14 +903,10 @@ xfs_file_write_iter(
 		ret = xfs_file_buffered_aio_write(iocb, from);
 
 	if (ret > 0) {
-		ssize_t err;
-
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
+		ret = generic_write_sync(iocb, ret);
 	}
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 310ca1ed9293..f6a8ed864651 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2487,13 +2487,25 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
-static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count)
-{
-	if (!(iocb->ki_flags & IOCB_DSYNC))
-		return 0;
-	return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1,
-			       (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
+
+/*
+ * Sync the bytes written if this was a synchronous write.  Expect ki_pos
+ * to already be updated for the write, and will return either the amount
+ * of bytes passed in, or an error if syncing the file failed.
+ */
+static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
+{
+	if (iocb->ki_flags & IOCB_DSYNC) {
+		int ret = vfs_fsync_range(iocb->ki_filp,
+				iocb->ki_pos - count, iocb->ki_pos - 1,
+				(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
+		if (ret)
+			return ret;
+	}
+
+	return count;
 }
+
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 #ifdef CONFIG_BLOCK
diff --git a/mm/filemap.c b/mm/filemap.c
index 8345d6d3436a..182b21825255 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2791,13 +2791,8 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_write_iter);
-- 
cgit v1.2.3


From 8ac0fba2da41620f4931a1007c71b0d4723eb02a Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Sun, 1 May 2016 17:50:29 -0400
Subject: isa: Decouple X86_32 dependency from the ISA Kconfig option

The introduction of the ISA_BUS option blocks the compilation of ISA
drivers on non-x86 platforms. The ISA_BUS configuration option should
not be necessary if the X86_32 dependency can be decoupled from the ISA
configuration option. This patch both removes the ISA_BUS configuration
option entirely and removes the X86_32 dependency from the ISA
configuration option.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/Kconfig      | 10 ++--------
 drivers/base/Makefile |  2 +-
 include/linux/isa.h   |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a5977986f38b..280e5ebae2f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2472,16 +2472,8 @@ config ISA_DMA_API
 	  Enables ISA-style DMA support for devices requiring such controllers.
 	  If unsure, say Y.
 
-config ISA_BUS
-	bool "ISA bus support"
-	help
-	  Enables ISA bus support for devices requiring such controllers.
-
-if X86_32
-
 config ISA
 	bool "ISA support"
-	depends on ISA_BUS
 	---help---
 	  Find out whether you have ISA slots on your motherboard.  ISA is the
 	  name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -2489,6 +2481,8 @@ config ISA
 	  (MCA) or VESA.  ISA is an older system, now being displaced by PCI;
 	  newer boards don't support it.  If you have ISA, say Y, otherwise N.
 
+if X86_32
+
 config EISA
 	bool "EISA support"
 	depends on ISA
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 4ebfb81cc7e9..6b2a84e7f2be 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
-obj-$(CONFIG_ISA_BUS)	+= isa.o
+obj-$(CONFIG_ISA)	+= isa.o
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
 obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
diff --git a/include/linux/isa.h b/include/linux/isa.h
index 2a02862775eb..b0270e3814c8 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -22,7 +22,7 @@ struct isa_driver {
 
 #define to_isa_driver(x) container_of((x), struct isa_driver, driver)
 
-#ifdef CONFIG_ISA_BUS
+#ifdef CONFIG_ISA
 int isa_register_driver(struct isa_driver *, unsigned int);
 void isa_unregister_driver(struct isa_driver *);
 #else
-- 
cgit v1.2.3


From 03dc76ca1ee5d02401d5a22ed7ddf15b5e9dfe76 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Thu, 28 Apr 2016 20:20:52 -0400
Subject: qed: add infrastructure for device self tests.

This patch adds the functionality and APIs needed for selftests.
It adds the ability to configure the link-mode which is required for the
implementation of loopback tests. It adds the APIs for clock test,
register test, interrupt test and memory test.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |  3 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         | 13 ++++
 drivers/net/ethernet/qlogic/qed/qed_main.c        | 28 +++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         | 42 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         | 22 +++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.c    | 76 +++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.h    | 40 ++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sp.h          | 10 +++
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 21 +++++++
 include/linux/qed/qed_if.h                        | 47 ++++++++++++++
 10 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 5c2fd57236fe..aafa6692e62f 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
-	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o
+	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
+	 qed_selftest.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 5aa78a9ae17f..c4fae71bed11 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -3857,6 +3857,7 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_PHY_CORE_WRITE             0x000e0000
 #define DRV_MSG_CODE_SET_VERSION                0x000f0000
 
+#define DRV_MSG_CODE_BIST_TEST                  0x001e0000
 #define DRV_MSG_CODE_SET_LED_MODE               0x00200000
 
 #define DRV_MSG_SEQ_NUMBER_MASK                 0x0000ffff
@@ -3914,6 +3915,18 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_SET_LED_MODE_ON            0x1
 #define DRV_MB_PARAM_SET_LED_MODE_OFF           0x2
 
+#define DRV_MB_PARAM_BIST_UNKNOWN_TEST          0
+#define DRV_MB_PARAM_BIST_REGISTER_TEST         1
+#define DRV_MB_PARAM_BIST_CLOCK_TEST            2
+
+#define DRV_MB_PARAM_BIST_RC_UNKNOWN            0
+#define DRV_MB_PARAM_BIST_RC_PASSED             1
+#define DRV_MB_PARAM_BIST_RC_FAILED             2
+#define DRV_MB_PARAM_BIST_RC_INVALID_PARAMETER          3
+
+#define DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT      0
+#define DRV_MB_PARAM_BIST_TEST_INDEX_MASK       0x000000FF
+
 	u32 fw_mb_header;
 #define FW_MSG_CODE_MASK                        0xffff0000
 #define FW_MSG_CODE_DRV_LOAD_ENGINE             0x10100000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1918b83f0a97..1b758bdec587 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -28,6 +28,7 @@
 #include "qed_dev_api.h"
 #include "qed_mcp.h"
 #include "qed_hw.h"
+#include "qed_selftest.h"
 
 static char version[] =
 	"QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n";
@@ -976,6 +977,25 @@ static int qed_set_link(struct qed_dev *cdev,
 		else
 			link_params->pause.forced_tx = false;
 	}
+	if (params->override_flags & QED_LINK_OVERRIDE_LOOPBACK_MODE) {
+		switch (params->loopback_mode) {
+		case QED_LINK_LOOPBACK_INT_PHY:
+			link_params->loopback_mode = PMM_LOOPBACK_INT_PHY;
+			break;
+		case QED_LINK_LOOPBACK_EXT_PHY:
+			link_params->loopback_mode = PMM_LOOPBACK_EXT_PHY;
+			break;
+		case QED_LINK_LOOPBACK_EXT:
+			link_params->loopback_mode = PMM_LOOPBACK_EXT;
+			break;
+		case QED_LINK_LOOPBACK_MAC:
+			link_params->loopback_mode = PMM_LOOPBACK_MAC;
+			break;
+		default:
+			link_params->loopback_mode = PMM_LOOPBACK_NONE;
+			break;
+		}
+	}
 
 	rc = qed_mcp_set_link(hwfn, ptt, params->link_up);
 
@@ -1182,7 +1202,15 @@ static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode)
 	return status;
 }
 
+struct qed_selftest_ops qed_selftest_ops_pass = {
+	.selftest_memory = &qed_selftest_memory,
+	.selftest_interrupt = &qed_selftest_interrupt,
+	.selftest_register = &qed_selftest_register,
+	.selftest_clock = &qed_selftest_clock,
+};
+
 const struct qed_common_ops qed_common_ops_pass = {
+	.selftest = &qed_selftest_ops_pass,
 	.probe = &qed_probe,
 	.remove = &qed_remove,
 	.set_power_state = &qed_set_power_state,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index cb46dbdf47dd..2f8309d772c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1017,3 +1017,45 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 
 	return rc;
 }
+
+int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 drv_mb_param = 0, rsp, param;
+	int rc = 0;
+
+	drv_mb_param = (DRV_MB_PARAM_BIST_REGISTER_TEST <<
+			DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT);
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+			 drv_mb_param, &rsp, &param);
+
+	if (rc)
+		return rc;
+
+	if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) ||
+	    (param != DRV_MB_PARAM_BIST_RC_PASSED))
+		rc = -EAGAIN;
+
+	return rc;
+}
+
+int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 drv_mb_param, rsp, param;
+	int rc = 0;
+
+	drv_mb_param = (DRV_MB_PARAM_BIST_CLOCK_TEST <<
+			DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT);
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+			 drv_mb_param, &rsp, &param);
+
+	if (rc)
+		return rc;
+
+	if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) ||
+	    (param != DRV_MB_PARAM_BIST_RC_PASSED))
+		rc = -EAGAIN;
+
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 608bcb2403cb..5f218eed0541 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -245,6 +245,28 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    enum qed_led_mode mode);
 
+/**
+ * @brief Bist register test
+ *
+ *  @param p_hwfn    - hw function
+ *  @param p_ptt     - PTT required for register access
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt);
+
+/**
+ * @brief Bist clock test
+ *
+ *  @param p_hwfn    - hw function
+ *  @param p_ptt     - PTT required for register access
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn,
+			    struct qed_ptt *p_ptt);
+
 /* Using hwfn number (and not pf_num) is required since in CMT mode,
  * same pf_num may be used by two different hwfn
  * TODO - this shouldn't really be in .h file, but until all fields
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.c b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
new file mode 100644
index 000000000000..a342bfe4280d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
@@ -0,0 +1,76 @@
+#include "qed.h"
+#include "qed_dev_api.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+
+int qed_selftest_memory(struct qed_dev *cdev)
+{
+	int rc = 0, i;
+
+	for_each_hwfn(cdev, i) {
+		rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]);
+		if (rc)
+			return rc;
+	}
+
+	return rc;
+}
+
+int qed_selftest_interrupt(struct qed_dev *cdev)
+{
+	int rc = 0, i;
+
+	for_each_hwfn(cdev, i) {
+		rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]);
+		if (rc)
+			return rc;
+	}
+
+	return rc;
+}
+
+int qed_selftest_register(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn;
+	struct qed_ptt *p_ptt;
+	int rc = 0, i;
+
+	/* although performed by MCP, this test is per engine */
+	for_each_hwfn(cdev, i) {
+		p_hwfn = &cdev->hwfns[i];
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt) {
+			DP_ERR(p_hwfn, "failed to acquire ptt\n");
+			return -EBUSY;
+		}
+		rc = qed_mcp_bist_register_test(p_hwfn, p_ptt);
+		qed_ptt_release(p_hwfn, p_ptt);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+int qed_selftest_clock(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn;
+	struct qed_ptt *p_ptt;
+	int rc = 0, i;
+
+	/* although performed by MCP, this test is per engine */
+	for_each_hwfn(cdev, i) {
+		p_hwfn = &cdev->hwfns[i];
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt) {
+			DP_ERR(p_hwfn, "failed to acquire ptt\n");
+			return -EBUSY;
+		}
+		rc = qed_mcp_bist_clock_test(p_hwfn, p_ptt);
+		qed_ptt_release(p_hwfn, p_ptt);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.h b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
new file mode 100644
index 000000000000..50eb0b49950f
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
@@ -0,0 +1,40 @@
+#ifndef _QED_SELFTEST_API_H
+#define _QED_SELFTEST_API_H
+#include <linux/types.h>
+
+/**
+ * @brief qed_selftest_memory - Perform memory test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_memory(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_interrupt - Perform interrupt test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_interrupt(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_register - Perform register test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_register(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_clock - Perform clock test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_clock(struct qed_dev *cdev);
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 4b91cb32f317..eec137f40895 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -369,4 +369,14 @@ int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_tunn_update_params *p_tunn,
 			      enum spq_mode comp_mode,
 			      struct qed_spq_comp_cb *p_comp_data);
+/**
+ * @brief qed_sp_heartbeat_ramrod - Send empty Ramrod
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+
+int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn);
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 7ccd96e5802b..9f9bc10d0f6c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -428,3 +428,24 @@ int qed_sp_pf_stop(struct qed_hwfn *p_hwfn)
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
+
+int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_EMPTY, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index e5de42b62976..d72c832a9397 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -110,6 +110,7 @@ struct qed_link_params {
 #define QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS      BIT(1)
 #define QED_LINK_OVERRIDE_SPEED_FORCED_SPEED    BIT(2)
 #define QED_LINK_OVERRIDE_PAUSE_CONFIG          BIT(3)
+#define QED_LINK_OVERRIDE_LOOPBACK_MODE         BIT(4)
 	u32	override_flags;
 	bool	autoneg;
 	u32	adv_speeds;
@@ -118,6 +119,12 @@ struct qed_link_params {
 #define QED_LINK_PAUSE_RX_ENABLE                BIT(1)
 #define QED_LINK_PAUSE_TX_ENABLE                BIT(2)
 	u32	pause_config;
+#define QED_LINK_LOOPBACK_NONE                  BIT(0)
+#define QED_LINK_LOOPBACK_INT_PHY               BIT(1)
+#define QED_LINK_LOOPBACK_EXT_PHY               BIT(2)
+#define QED_LINK_LOOPBACK_EXT                   BIT(3)
+#define QED_LINK_LOOPBACK_MAC                   BIT(4)
+	u32	loopback_mode;
 };
 
 struct qed_link_output {
@@ -158,7 +165,47 @@ struct qed_common_cb_ops {
 			       struct qed_link_output	*link);
 };
 
+struct qed_selftest_ops {
+/**
+ * @brief selftest_interrupt - Perform interrupt test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_interrupt)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_memory - Perform memory test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_memory)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_register - Perform register test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_register)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_clock - Perform clock test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_clock)(struct qed_dev *cdev);
+};
+
 struct qed_common_ops {
+	struct qed_selftest_ops *selftest;
+
 	struct qed_dev*	(*probe)(struct pci_dev *dev,
 				 enum qed_protocol protocol,
 				 u32 dp_module, u8 dp_level);
-- 
cgit v1.2.3


From 2e65060e803e046fc9b5ed0107494a452424845e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 27 Apr 2016 14:15:38 +0300
Subject: dmaengine: dw: revisit data_width property

There several changes are done here:

- Convert the property to be in bytes

  Besides that this is a common practice for such property, the use of a value
  in bytes much more convenient than handling the encoded one.

- Rename data_width to data-width in the device tree bindings

  The change leaves the support for the old format as well just in case someone
  will use a newer kernel with an old device tree blob.

- While here, replace dwc_fast_ffs() by __ffs()

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 Documentation/devicetree/bindings/dma/snps-dma.txt |  7 +++-
 arch/arc/boot/dts/abilis_tb10x.dtsi                |  2 +-
 arch/arm/boot/dts/spear13xx.dtsi                   |  4 +--
 drivers/dma/dw/core.c                              | 42 ++++++----------------
 drivers/dma/dw/platform.c                          |  5 ++-
 include/linux/platform_data/dma-dw.h               |  2 +-
 6 files changed, 24 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt
index c99c1ffac199..0f5583293c9c 100644
--- a/Documentation/devicetree/bindings/dma/snps-dma.txt
+++ b/Documentation/devicetree/bindings/dma/snps-dma.txt
@@ -13,6 +13,11 @@ Required properties:
 - chan_priority: priority of channels. 0 (default): increase from chan 0->n, 1:
   increase from chan n->0
 - block_size: Maximum block size supported by the controller
+- data-width: Maximum data width supported by hardware per AHB master
+  (in bytes, power of 2)
+
+
+Deprecated properties:
 - data_width: Maximum data width supported by hardware per AHB master
   (0 - 8bits, 1 - 16bits, ..., 5 - 256bits)
 
@@ -38,7 +43,7 @@ Example:
 		chan_allocation_order = <1>;
 		chan_priority = <1>;
 		block_size = <0xfff>;
-		data_width = <3 3>;
+		data-width = <8 8>;
 	};
 
 DMA clients connected to the Designware DMA controller must use the format
diff --git a/arch/arc/boot/dts/abilis_tb10x.dtsi b/arch/arc/boot/dts/abilis_tb10x.dtsi
index cfb5052239a1..2f53bedb0cde 100644
--- a/arch/arc/boot/dts/abilis_tb10x.dtsi
+++ b/arch/arc/boot/dts/abilis_tb10x.dtsi
@@ -112,7 +112,7 @@
 			chan_allocation_order = <0>;
 			chan_priority = <1>;
 			block_size = <0x7ff>;
-			data_width = <2>;
+			data-width = <4>;
 			clocks = <&ahb_clk>;
 			clock-names = "hclk";
 		};
diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi
index 14594ce8c18a..449acf0d8272 100644
--- a/arch/arm/boot/dts/spear13xx.dtsi
+++ b/arch/arm/boot/dts/spear13xx.dtsi
@@ -117,7 +117,7 @@
 			chan_priority = <1>;
 			block_size = <0xfff>;
 			dma-masters = <2>;
-			data_width = <3 3>;
+			data-width = <8 8>;
 		};
 
 		dma@eb000000 {
@@ -133,7 +133,7 @@
 			chan_allocation_order = <1>;
 			chan_priority = <1>;
 			block_size = <0xfff>;
-			data_width = <3 3>;
+			data-width = <8 8>;
 		};
 
 		fsmc: flash@b0000000 {
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 78522dcf3a7d..992da255b8e6 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -162,21 +162,6 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
 
 /*----------------------------------------------------------------------*/
 
-static inline unsigned int dwc_fast_ffs(unsigned long long v)
-{
-	/*
-	 * We can be a lot more clever here, but this should take care
-	 * of the most common optimization.
-	 */
-	if (!(v & 7))
-		return 3;
-	else if (!(v & 3))
-		return 2;
-	else if (!(v & 1))
-		return 1;
-	return 0;
-}
-
 static inline void dwc_dump_chan_regs(struct dw_dma_chan *dwc)
 {
 	dev_err(chan2dev(&dwc->chan),
@@ -677,11 +662,12 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	struct dw_desc		*prev;
 	size_t			xfer_count;
 	size_t			offset;
+	u8			m_master = dwc->m_master;
 	unsigned int		src_width;
 	unsigned int		dst_width;
-	unsigned int		data_width;
+	unsigned int		data_width = dw->data_width[m_master];
 	u32			ctllo;
-	u8			lms = DWC_LLP_LMS(dwc->m_master);
+	u8			lms = DWC_LLP_LMS(m_master);
 
 	dev_vdbg(chan2dev(chan),
 			"%s: d%pad s%pad l0x%zx f0x%lx\n", __func__,
@@ -694,10 +680,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
 	dwc->direction = DMA_MEM_TO_MEM;
 
-	data_width = dw->data_width[dwc->m_master];
-
-	src_width = dst_width = min_t(unsigned int, data_width,
-				      dwc_fast_ffs(src | dest | len));
+	src_width = dst_width = __ffs(data_width | src | dest | len);
 
 	ctllo = DWC_DEFAULT_CTLLO(chan)
 			| DWC_CTLL_DST_WIDTH(dst_width)
@@ -757,11 +740,12 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	struct dw_desc		*prev;
 	struct dw_desc		*first;
 	u32			ctllo;
-	u8			lms = DWC_LLP_LMS(dwc->m_master);
+	u8			m_master = dwc->m_master;
+	u8			lms = DWC_LLP_LMS(m_master);
 	dma_addr_t		reg;
 	unsigned int		reg_width;
 	unsigned int		mem_width;
-	unsigned int		data_width;
+	unsigned int		data_width = dw->data_width[m_master];
 	unsigned int		i;
 	struct scatterlist	*sg;
 	size_t			total_len = 0;
@@ -787,8 +771,6 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
 			DWC_CTLL_FC(DW_DMA_FC_D_M2P);
 
-		data_width = dw->data_width[dwc->m_master];
-
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
 			u32		len, dlen, mem;
@@ -796,8 +778,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 			mem = sg_dma_address(sg);
 			len = sg_dma_len(sg);
 
-			mem_width = min_t(unsigned int,
-					  data_width, dwc_fast_ffs(mem | len));
+			mem_width = __ffs(data_width | mem | len);
 
 slave_sg_todev_fill_desc:
 			desc = dwc_desc_get(dwc);
@@ -843,8 +824,6 @@ slave_sg_todev_fill_desc:
 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
 			DWC_CTLL_FC(DW_DMA_FC_D_P2M);
 
-		data_width = dw->data_width[dwc->m_master];
-
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
 			u32		len, dlen, mem;
@@ -852,8 +831,7 @@ slave_sg_todev_fill_desc:
 			mem = sg_dma_address(sg);
 			len = sg_dma_len(sg);
 
-			mem_width = min_t(unsigned int,
-					  data_width, dwc_fast_ffs(mem | len));
+			mem_width = __ffs(data_width | mem | len);
 
 slave_sg_fromdev_fill_desc:
 			desc = dwc_desc_get(dwc);
@@ -1500,7 +1478,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 		pdata->nr_masters = (dw_params >> DW_PARAMS_NR_MASTER & 3) + 1;
 		for (i = 0; i < pdata->nr_masters; i++) {
 			pdata->data_width[i] =
-				(dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3) + 2;
+				4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3);
 		}
 		max_blk_size = dma_readl(dw, MAX_BLK_SIZE);
 
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index e65ebe5ab88f..2420fb7267bc 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -138,9 +138,12 @@ dw_dma_parse_dt(struct platform_device *pdev)
 	if (!of_property_read_u32(np, "block_size", &tmp))
 		pdata->block_size = tmp;
 
-	if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) {
+	if (!of_property_read_u32_array(np, "data-width", arr, nr_masters)) {
 		for (tmp = 0; tmp < nr_masters; tmp++)
 			pdata->data_width[tmp] = arr[tmp];
+	} else if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) {
+		for (tmp = 0; tmp < nr_masters; tmp++)
+			pdata->data_width[tmp] = BIT(arr[tmp] & 0x07);
 	}
 
 	return pdata;
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index b881b978e486..ad768111c350 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -43,7 +43,7 @@ struct dw_dma_slave {
  * @block_size: Maximum block size supported by the controller
  * @nr_masters: Number of AHB masters supported by the controller
  * @data_width: Maximum data width supported by hardware per AHB master
- *		(0 - 8bits, 1 - 16bits, ..., 5 - 256bits)
+ *		(in bytes, power of 2)
  */
 struct dw_dma_platform_data {
 	unsigned int	nr_channels;
-- 
cgit v1.2.3


From 161c3d04aeca8a5bfffe3902786bdf0ccd8575c0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 27 Apr 2016 14:15:39 +0300
Subject: dmaengine: dw: keep entire platform data in struct dw_dma

Keep the entire platform data in the struct dw_dma.
It makes the driver a bit cleaner.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw/core.c                | 30 +++++++++++++++---------------
 drivers/dma/dw/platform.c            |  4 ++--
 drivers/dma/dw/regs.h                |  5 ++---
 include/linux/platform_data/dma-dw.h |  2 +-
 4 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 992da255b8e6..30843a17b438 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -665,7 +665,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	u8			m_master = dwc->m_master;
 	unsigned int		src_width;
 	unsigned int		dst_width;
-	unsigned int		data_width = dw->data_width[m_master];
+	unsigned int		data_width = dw->pdata->data_width[m_master];
 	u32			ctllo;
 	u8			lms = DWC_LLP_LMS(m_master);
 
@@ -745,7 +745,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	dma_addr_t		reg;
 	unsigned int		reg_width;
 	unsigned int		mem_width;
-	unsigned int		data_width = dw->data_width[m_master];
+	unsigned int		data_width = dw->pdata->data_width[m_master];
 	unsigned int		i;
 	struct scatterlist	*sg;
 	size_t			total_len = 0;
@@ -1444,7 +1444,6 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 	struct dw_dma		*dw;
 	bool			autocfg = false;
 	unsigned int		dw_params;
-	unsigned int		max_blk_size = 0;
 	unsigned int		i;
 	int			err;
 
@@ -1452,6 +1451,10 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 	if (!dw)
 		return -ENOMEM;
 
+	dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL);
+	if (!dw->pdata)
+		return -ENOMEM;
+
 	dw->regs = chip->regs;
 	chip->dw = dw;
 
@@ -1467,11 +1470,8 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 			goto err_pdata;
 		}
 
-		pdata = devm_kzalloc(chip->dev, sizeof(*pdata), GFP_KERNEL);
-		if (!pdata) {
-			err = -ENOMEM;
-			goto err_pdata;
-		}
+		/* Reassign the platform data pointer */
+		pdata = dw->pdata;
 
 		/* Get hardware configuration parameters */
 		pdata->nr_channels = (dw_params >> DW_PARAMS_NR_CHAN & 7) + 1;
@@ -1480,7 +1480,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 			pdata->data_width[i] =
 				4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3);
 		}
-		max_blk_size = dma_readl(dw, MAX_BLK_SIZE);
+		pdata->block_size = dma_readl(dw, MAX_BLK_SIZE);
 
 		/* Fill platform data with the default values */
 		pdata->is_private = true;
@@ -1490,6 +1490,11 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 	} else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
 		err = -EINVAL;
 		goto err_pdata;
+	} else {
+		memcpy(dw->pdata, pdata, sizeof(*dw->pdata));
+
+		/* Reassign the platform data pointer */
+		pdata = dw->pdata;
 	}
 
 	dw->chan = devm_kcalloc(chip->dev, pdata->nr_channels, sizeof(*dw->chan),
@@ -1499,11 +1504,6 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 		goto err_pdata;
 	}
 
-	/* Get hardware configuration parameters */
-	dw->nr_masters = pdata->nr_masters;
-	for (i = 0; i < dw->nr_masters; i++)
-		dw->data_width[i] = pdata->data_width[i];
-
 	/* Calculate all channel mask before DMA setup */
 	dw->all_chan_mask = (1 << pdata->nr_channels) - 1;
 
@@ -1570,7 +1570,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 			 * up to 0x0a for 4095.
 			 */
 			dwc->block_size =
-				(4 << ((max_blk_size >> 4 * i) & 0xf)) - 1;
+				(4 << ((pdata->block_size >> 4 * i) & 0xf)) - 1;
 			dwc->nollp =
 				(dwc_params >> DWC_PARAMS_MBLK_EN & 0x1) == 0;
 		} else {
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 2420fb7267bc..0a49011633b9 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -47,8 +47,8 @@ static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec,
 
 	if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS ||
 		    slave.dst_id >= DW_DMA_MAX_NR_REQUESTS ||
-		    slave.m_master >= dw->nr_masters ||
-		    slave.p_master >= dw->nr_masters))
+		    slave.m_master >= dw->pdata->nr_masters ||
+		    slave.p_master >= dw->pdata->nr_masters))
 		return NULL;
 
 	dma_cap_zero(cap);
diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h
index 0ab02eb23bfc..4b7bd7834046 100644
--- a/drivers/dma/dw/regs.h
+++ b/drivers/dma/dw/regs.h
@@ -281,9 +281,8 @@ struct dw_dma {
 	u8			all_chan_mask;
 	u8			in_use;
 
-	/* hardware configuration */
-	unsigned char		nr_masters;
-	unsigned char		data_width[DW_DMA_MAX_NR_MASTERS];
+	/* platform data */
+	struct dw_dma_platform_data	*pdata;
 };
 
 static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw)
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index ad768111c350..d15d8ba8cc24 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -55,7 +55,7 @@ struct dw_dma_platform_data {
 #define CHAN_PRIORITY_ASCENDING		0	/* chan0 highest */
 #define CHAN_PRIORITY_DESCENDING	1	/* chan7 highest */
 	unsigned char	chan_priority;
-	unsigned short	block_size;
+	unsigned int	block_size;
 	unsigned char	nr_masters;
 	unsigned char	data_width[DW_DMA_MAX_NR_MASTERS];
 };
-- 
cgit v1.2.3


From 3a14c66d43d018baed96ceb74f9ab548878c09b8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 27 Apr 2016 14:15:40 +0300
Subject: dmaengine: dw: pass platform data via struct dw_dma_chip

We pass struct dw_dma_chip to dw_dma_probe() anyway, thus we may use it to
pass a platform data as well.

While here, constify the source of the platform data.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/ata/sata_dwc_460ex.c          | 2 +-
 drivers/dma/dw/core.c                 | 9 +++++----
 drivers/dma/dw/pci.c                  | 5 +++--
 drivers/dma/dw/platform.c             | 5 +++--
 include/linux/dma/dw.h                | 5 ++++-
 sound/soc/intel/common/sst-firmware.c | 2 +-
 6 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 80bdcabc293f..2cb6f7e04b5c 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -1248,7 +1248,7 @@ static int sata_dwc_probe(struct platform_device *ofdev)
 	hsdev->dma->dev = &ofdev->dev;
 
 	/* Initialize AHB DMAC */
-	err = dw_dma_probe(hsdev->dma, NULL);
+	err = dw_dma_probe(hsdev->dma);
 	if (err)
 		goto error_dma_iomap;
 
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 30843a17b438..edf053f73a49 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1439,8 +1439,9 @@ EXPORT_SYMBOL(dw_dma_cyclic_free);
 
 /*----------------------------------------------------------------------*/
 
-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
+int dw_dma_probe(struct dw_dma_chip *chip)
 {
+	struct dw_dma_platform_data *pdata;
 	struct dw_dma		*dw;
 	bool			autocfg = false;
 	unsigned int		dw_params;
@@ -1460,7 +1461,7 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 
 	pm_runtime_get_sync(chip->dev);
 
-	if (!pdata) {
+	if (!chip->pdata) {
 		dw_params = dma_readl(dw, DW_PARAMS);
 		dev_dbg(chip->dev, "DW_PARAMS: 0x%08x\n", dw_params);
 
@@ -1487,11 +1488,11 @@ int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
 		pdata->is_memcpy = true;
 		pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING;
 		pdata->chan_priority = CHAN_PRIORITY_ASCENDING;
-	} else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
+	} else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
 		err = -EINVAL;
 		goto err_pdata;
 	} else {
-		memcpy(dw->pdata, pdata, sizeof(*dw->pdata));
+		memcpy(dw->pdata, chip->pdata, sizeof(*dw->pdata));
 
 		/* Reassign the platform data pointer */
 		pdata = dw->pdata;
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index 358f9689a3f5..0ae6c3b1d34e 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -17,8 +17,8 @@
 
 static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
 {
+	const struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
 	struct dw_dma_chip *chip;
-	struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
 	int ret;
 
 	ret = pcim_enable_device(pdev);
@@ -49,8 +49,9 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
 	chip->dev = &pdev->dev;
 	chip->regs = pcim_iomap_table(pdev)[0];
 	chip->irq = pdev->irq;
+	chip->pdata = pdata;
 
-	ret = dw_dma_probe(chip, pdata);
+	ret = dw_dma_probe(chip);
 	if (ret)
 		return ret;
 
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 0a49011633b9..5bda0eb9f393 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -161,7 +161,7 @@ static int dw_probe(struct platform_device *pdev)
 	struct dw_dma_chip *chip;
 	struct device *dev = &pdev->dev;
 	struct resource *mem;
-	struct dw_dma_platform_data *pdata;
+	const struct dw_dma_platform_data *pdata;
 	int err;
 
 	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
@@ -186,6 +186,7 @@ static int dw_probe(struct platform_device *pdev)
 		pdata = dw_dma_parse_dt(pdev);
 
 	chip->dev = dev;
+	chip->pdata = pdata;
 
 	chip->clk = devm_clk_get(chip->dev, "hclk");
 	if (IS_ERR(chip->clk))
@@ -196,7 +197,7 @@ static int dw_probe(struct platform_device *pdev)
 
 	pm_runtime_enable(&pdev->dev);
 
-	err = dw_dma_probe(chip, pdata);
+	err = dw_dma_probe(chip);
 	if (err)
 		goto err_dw_dma_probe;
 
diff --git a/include/linux/dma/dw.h b/include/linux/dma/dw.h
index 71456442ebe3..f2e538aaddad 100644
--- a/include/linux/dma/dw.h
+++ b/include/linux/dma/dw.h
@@ -27,6 +27,7 @@ struct dw_dma;
  * @regs:		memory mapped I/O space
  * @clk:		hclk clock
  * @dw:			struct dw_dma that is filed by dw_dma_probe()
+ * @pdata:		pointer to platform data
  */
 struct dw_dma_chip {
 	struct device	*dev;
@@ -34,10 +35,12 @@ struct dw_dma_chip {
 	void __iomem	*regs;
 	struct clk	*clk;
 	struct dw_dma	*dw;
+
+	const struct dw_dma_platform_data	*pdata;
 };
 
 /* Export to the platform drivers */
-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata);
+int dw_dma_probe(struct dw_dma_chip *chip);
 int dw_dma_remove(struct dw_dma_chip *chip);
 
 /* DMA API extensions */
diff --git a/sound/soc/intel/common/sst-firmware.c b/sound/soc/intel/common/sst-firmware.c
index ef4881e7753a..25993527370b 100644
--- a/sound/soc/intel/common/sst-firmware.c
+++ b/sound/soc/intel/common/sst-firmware.c
@@ -203,7 +203,7 @@ static struct dw_dma_chip *dw_probe(struct device *dev, struct resource *mem,
 
 	chip->dev = dev;
 
-	err = dw_dma_probe(chip, NULL);
+	err = dw_dma_probe(chip);
 	if (err)
 		return ERR_PTR(err);
 
-- 
cgit v1.2.3


From 011d6f5c3e5f38a767c8f4c7e2de73dc91959cb0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 2 May 2016 12:59:19 +0200
Subject: of: include errno.h in of_graph.h

When CONFIG_OF is disabled, we have to include linux/errno.h before
including of_graph.h, or get build errors like in the newly added
sun4i drm driver:

In file included from ../drivers/gpu/drm/sun4i/sun4i_drv.c:14:0:
include/linux/of_graph.h: In function 'of_graph_parse_endpoint':
include/linux/of_graph.h:58:10: error: 'ENOSYS' undeclared (first use in this function)

A better solution is to ensure that the header can be included
by itself, so let's include linux/errno.h here to fix the error
we just got, and any similar future error.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support")
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_graph.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index f8bcd0e21a26..bb3a5a2cd570 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -15,6 +15,7 @@
 #define __LINUX_OF_GRAPH_H
 
 #include <linux/types.h>
+#include <linux/errno.h>
 
 /**
  * struct of_endpoint - the OF graph endpoint data structure
-- 
cgit v1.2.3


From ba0263340a2aeaf5f08e4f2b0f4c29e300828b06 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Thu, 28 Apr 2016 17:18:34 +0200
Subject: fbdev: fb_defio: Export fb_deferred_io_mmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export fb_deferred_io_mmap so drivers can change vma->vm_page_prot.
When the framebuffer memory is allocated using dma_alloc_writecombine()
instead of vmalloc(), I get cache syncing problems on ARM.
This solves it:

static int drm_fbdev_cma_deferred_io_mmap(struct fb_info *info,
					  struct vm_area_struct *vma)
{
	fb_deferred_io_mmap(info, vma);
	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	return 0;
}

Could this have been done in the core?
Drivers that don't set (struct fb_ops *)->fb_mmap, gets a call to
fb_pgprotect() at the end of the default fb_mmap implementation
(drivers/video/fbdev/core/fbmem.c). This is an architecture specific
function that on many platforms uses pgprot_writecombine(), but not on
all. And looking at some of the fb_mmap implementations, some of them
sets vm_page_prot to nocache for instance, so I think the safest bet is
to do this in the driver and not in the fbdev core. And we can't call
fb_pgprotect() from fb_deferred_io_mmap() either because we don't have
access to the file pointer that powerpc needs.

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Acked-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/1461856717-6476-5-git-send-email-noralf@tronnes.org
---
 drivers/video/fbdev/core/fb_defio.c | 3 ++-
 include/linux/fb.h                  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
index 57721c73177f..74b5bcac8bf2 100644
--- a/drivers/video/fbdev/core/fb_defio.c
+++ b/drivers/video/fbdev/core/fb_defio.c
@@ -164,7 +164,7 @@ static const struct address_space_operations fb_deferred_io_aops = {
 	.set_page_dirty = fb_deferred_io_set_page_dirty,
 };
 
-static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
+int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &fb_deferred_io_vm_ops;
 	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
@@ -173,6 +173,7 @@ static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 	vma->vm_private_data = info;
 	return 0;
 }
+EXPORT_SYMBOL(fb_deferred_io_mmap);
 
 /* workqueue callback */
 static void fb_deferred_io_work(struct work_struct *work)
diff --git a/include/linux/fb.h b/include/linux/fb.h
index dfe88351341f..a964d076b4dc 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -673,6 +673,7 @@ static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch,
 }
 
 /* drivers/video/fb_defio.c */
+int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma);
 extern void fb_deferred_io_init(struct fb_info *info);
 extern void fb_deferred_io_open(struct fb_info *info,
 				struct inode *inode,
-- 
cgit v1.2.3


From a5b714ad395803a6aa91793b9e52a81b176b8ba9 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@foxmail.com>
Date: Wed, 27 Apr 2016 20:10:16 +0800
Subject: NVMe: correct comment for offset enum of controller registers in
 nvme.h

Section 3.1 gives the comment for the offset of controller registers
in the specification 1.2a.

Some are mis-copied in the header file nvme.h. Correct them.

Signed-off-by: Wang Sheng-Hui <shhuiw@foxmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/nvme.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index a55986f6fe38..7d51b2904cb7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -21,13 +21,13 @@ enum {
 	NVME_REG_CAP	= 0x0000,	/* Controller Capabilities */
 	NVME_REG_VS	= 0x0008,	/* Version */
 	NVME_REG_INTMS	= 0x000c,	/* Interrupt Mask Set */
-	NVME_REG_INTMC	= 0x0010,	/* Interrupt Mask Set */
+	NVME_REG_INTMC	= 0x0010,	/* Interrupt Mask Clear */
 	NVME_REG_CC	= 0x0014,	/* Controller Configuration */
 	NVME_REG_CSTS	= 0x001c,	/* Controller Status */
 	NVME_REG_NSSR	= 0x0020,	/* NVM Subsystem Reset */
 	NVME_REG_AQA	= 0x0024,	/* Admin Queue Attributes */
 	NVME_REG_ASQ	= 0x0028,	/* Admin SQ Base Address */
-	NVME_REG_ACQ	= 0x0030,	/* Admin SQ Base Address */
+	NVME_REG_ACQ	= 0x0030,	/* Admin CQ Base Address */
 	NVME_REG_CMBLOC = 0x0038,	/* Controller Memory Buffer Location */
 	NVME_REG_CMBSZ	= 0x003c,	/* Controller Memory Buffer Size */
 };
-- 
cgit v1.2.3


From 38f252553300ee1d3346a5273e95fe1dd60ca50a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 16 Apr 2016 14:55:28 -0400
Subject: block: add __blkdev_issue_discard

This is a version of blkdev_issue_discard which doesn't wait for
the I/O to complete, but instead allows the caller to submit
the final bio and/or chain it to others.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lin <ming.l@ssi.samsung.com>
Signed-off-by: Sagi Grimberg <sagig@grimberg.me>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c        | 63 +++++++++++++++++++++++++++++---------------------
 include/linux/blkdev.h |  2 ++
 2 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 700d248cbde5..ccbce2b2ea05 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -22,45 +22,25 @@ static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages,
 	return new;
 }
 
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:	blockdev to issue discard for
- * @sector:	start sector
- * @nr_sects:	number of sectors to discard
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	BLKDEV_IFL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-	int type = REQ_WRITE | REQ_DISCARD;
+	struct bio *bio = *biop;
 	unsigned int granularity;
 	int alignment;
-	struct bio *bio = NULL;
-	int ret = 0;
-	struct blk_plug plug;
 
 	if (!q)
 		return -ENXIO;
-
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
+	if ((type & REQ_SECURE) && !blk_queue_secdiscard(q))
+		return -EOPNOTSUPP;
 
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
 	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
 
-	if (flags & BLKDEV_DISCARD_SECURE) {
-		if (!blk_queue_secdiscard(q))
-			return -EOPNOTSUPP;
-		type |= REQ_SECURE;
-	}
-
-	blk_start_plug(&plug);
 	while (nr_sects) {
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
@@ -98,7 +78,38 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 */
 		cond_resched();
 	}
-	if (bio)
+
+	*biop = bio;
+	return 0;
+}
+EXPORT_SYMBOL(__blkdev_issue_discard);
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+	int type = REQ_WRITE | REQ_DISCARD;
+	struct bio *bio = NULL;
+	struct blk_plug plug;
+	int ret;
+
+	if (flags & BLKDEV_DISCARD_SECURE)
+		type |= REQ_SECURE;
+
+	blk_start_plug(&plug);
+	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type,
+			&bio);
+	if (!ret && bio)
 		ret = submit_bio_wait(type, bio);
 	blk_finish_plug(&plug);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba72687c5654..b79131acf6c0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1131,6 +1131,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int type, struct bio **biop);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 339e6e31d2bfb40354cbfe672b357b88a88223f2 Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Sun, 1 May 2016 18:42:47 -0400
Subject: isa: Implement the module_isa_driver macro

The module_isa_driver macro is a helper macro for ISA drivers which do
not do anything special in module init/exit. This eliminates a lot of
boilerplate code. Each module may only use this macro once, and calling
it replaces module_init and module_exit.

Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/isa.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/isa.h b/include/linux/isa.h
index b0270e3814c8..e394917d18c2 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -36,4 +36,25 @@ static inline void isa_unregister_driver(struct isa_driver *d)
 }
 #endif
 
+/**
+ * module_isa_driver() - Helper macro for registering a ISA driver
+ * @__isa_driver: isa_driver struct
+ * @__num_isa_dev: number of devices to register
+ *
+ * Helper macro for ISA drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate code. Each module may only
+ * use this macro once, and calling it replaces module_init and module_exit.
+ */
+#define module_isa_driver(__isa_driver, __num_isa_dev) \
+static int __init __isa_driver##_init(void) \
+{ \
+	return isa_register_driver(&(__isa_driver), __num_isa_dev); \
+} \
+module_init(__isa_driver##_init); \
+static void __exit __isa_driver##_exit(void) \
+{ \
+	isa_unregister_driver(&(__isa_driver)); \
+} \
+module_exit(__isa_driver##_exit);
+
 #endif /* __LINUX_ISA_H */
-- 
cgit v1.2.3


From d9a9c6172d1cec51851e9015b6c4379635c31f1a Mon Sep 17 00:00:00 2001
From: William Breathitt Gray <vilhelm.gray@gmail.com>
Date: Sun, 1 May 2016 18:43:10 -0400
Subject: isa: Implement the max_num_isa_dev macro

max_num_isa_dev is a macro to determine the maximum possible number of
ISA devices which may be registered in the I/O port address space given
the address extent of the ISA devices.

The highest base address possible for an ISA device is 0x3FF; this
results in 1024 possible base addresses. Dividing the number of possible
base addresses by the address extent taken by each device results in the
maximum number of devices on a system.

Signed-off-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/isa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/isa.h b/include/linux/isa.h
index e394917d18c2..5ab85281230b 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -57,4 +57,15 @@ static void __exit __isa_driver##_exit(void) \
 } \
 module_exit(__isa_driver##_exit);
 
+/**
+ * max_num_isa_dev() - Maximum possible number registered of an ISA device
+ * @__ida_dev_ext: ISA device address extent
+ *
+ * The highest base address possible for an ISA device is 0x3FF; this results in
+ * 1024 possible base addresses. Dividing the number of possible base addresses
+ * by the address extent taken by each device results in the maximum number of
+ * devices on a system.
+ */
+#define max_num_isa_dev(__isa_dev_ext) (1024 / __isa_dev_ext)
+
 #endif /* __LINUX_ISA_H */
-- 
cgit v1.2.3


From 4d31c6109a24892df461b6a98842935e80159a5e Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Mon, 2 May 2016 14:09:10 -0500
Subject: PCI: imx6: Implement reset sequence for i.MX6+

I.MX6+ has a dedicated bit for resetting PCIe core, which should be used
instead of a regular reset sequence since using the latter will hang the
SoC.

This commit is based on c34068d48273e24d392d9a49a38be807954420ed from
http://git.freescale.com/git/cgit.cgi/imx/linux-2.6-imx.git

Tested-by: Gary Bisson <gary.bisson@boundarydevices.com>
Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
---
 .../devicetree/bindings/pci/fsl,imx6q-pcie.txt     |  2 +-
 drivers/pci/host/pci-imx6.c                        | 23 ++++++++++++++++++++--
 include/linux/mfd/syscon/imx6q-iomuxc-gpr.h        |  1 +
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
index d742f917505a..83aeb1f5a645 100644
--- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
@@ -4,7 +4,7 @@ This PCIe host controller is based on the Synopsis Designware PCIe IP
 and thus inherits all the common properties defined in designware-pcie.txt.
 
 Required properties:
-- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie"
+- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie", "fsl,imx6qp-pcie"
 - reg: base address and length of the PCIe controller
 - interrupts: A list of interrupt outputs of the controller. Must contain an
   entry for each entry in the interrupt-names property.
diff --git a/drivers/pci/host/pci-imx6.c b/drivers/pci/host/pci-imx6.c
index 6b5d8c2fe2f2..b741a36a67f3 100644
--- a/drivers/pci/host/pci-imx6.c
+++ b/drivers/pci/host/pci-imx6.c
@@ -34,7 +34,8 @@
 
 enum imx6_pcie_variants {
 	IMX6Q,
-	IMX6SX
+	IMX6SX,
+	IMX6QP,
 };
 
 struct imx6_pcie {
@@ -256,6 +257,11 @@ static int imx6_pcie_assert_core_reset(struct pcie_port *pp)
 				   IMX6SX_GPR5_PCIE_BTNRST_RESET,
 				   IMX6SX_GPR5_PCIE_BTNRST_RESET);
 		break;
+	case IMX6QP:
+		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1,
+				   IMX6Q_GPR1_PCIE_SW_RST,
+				   IMX6Q_GPR1_PCIE_SW_RST);
+		break;
 	case IMX6Q:
 		/*
 		 * If the bootloader already enabled the link we need some
@@ -310,6 +316,7 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie)
 		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 				   IMX6SX_GPR12_PCIE_TEST_POWERDOWN, 0);
 		break;
+	case IMX6QP: 		/* FALLTHROUGH */
 	case IMX6Q:
 		/* power up core phy and enable ref clock */
 		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1,
@@ -370,9 +377,20 @@ static int imx6_pcie_deassert_core_reset(struct pcie_port *pp)
 					!imx6_pcie->gpio_active_high);
 	}
 
-	if (imx6_pcie->variant == IMX6SX)
+	switch (imx6_pcie->variant) {
+	case IMX6SX:
 		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR5,
 				   IMX6SX_GPR5_PCIE_BTNRST_RESET, 0);
+		break;
+	case IMX6QP:
+		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1,
+				   IMX6Q_GPR1_PCIE_SW_RST, 0);
+
+		usleep_range(200, 500);
+		break;
+	case IMX6Q:		/* Nothing to do */
+		break;
+	}
 
 	return 0;
 
@@ -718,6 +736,7 @@ static void imx6_pcie_shutdown(struct platform_device *pdev)
 static const struct of_device_id imx6_pcie_of_match[] = {
 	{ .compatible = "fsl,imx6q-pcie",  .data = (void *)IMX6Q,  },
 	{ .compatible = "fsl,imx6sx-pcie", .data = (void *)IMX6SX, },
+	{ .compatible = "fsl,imx6qp-pcie", .data = (void *)IMX6QP, },
 	{},
 };
 MODULE_DEVICE_TABLE(of, imx6_pcie_of_match);
diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
index 238c8db953eb..5b08e3c5325f 100644
--- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
+++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
@@ -95,6 +95,7 @@
 #define IMX6Q_GPR0_DMAREQ_MUX_SEL0_IOMUX	BIT(0)
 
 #define IMX6Q_GPR1_PCIE_REQ_MASK		(0x3 << 30)
+#define IMX6Q_GPR1_PCIE_SW_RST			BIT(29)
 #define IMX6Q_GPR1_PCIE_EXIT_L1			BIT(28)
 #define IMX6Q_GPR1_PCIE_RDY_L23			BIT(27)
 #define IMX6Q_GPR1_PCIE_ENTER_L1		BIT(26)
-- 
cgit v1.2.3


From 85c7f81041d57cfe9dc97f4680d5586b54534a39 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 14 Apr 2016 19:52:13 -0400
Subject: beginning of transition to parallel lookups - marking in-lookup
 dentries

marked as such when (would be) parallel lookup is about to pass them
to actual ->lookup(); unmarked when
	* __d_add() is about to make it hashed, positive or not.
	* __d_move() (from d_splice_alias(), directly or via
__d_unalias()) puts a preexisting dentry in its place
	* in caller of ->lookup() if it has escaped all of the
above.  Bug (WARN_ON, actually) if it reaches the final dput()
or d_instantiate() while still marked such.

As the result, we are guaranteed that for as long as the flag is
set, dentry will
	* remain negative unhashed with positive refcount
	* never have its ->d_alias looked at
	* never have its ->d_lru looked at
	* never have its ->d_parent and ->d_name changed

Right now we have at most one such for any given parent directory.
With parallel lookups that restriction will weaken to
	* only exist when parent is locked shared
	* at most one with given (parent,name) pair (comparison of
names is according to ->d_compare())
	* only exist when there's no hashed dentry with the same
(parent,name)

Transition will take the next several commits; unfortunately, we'll
only be able to switch to rwsem at the end of this series.  The
reason for not making it a single patch is to simplify review.

New primitives: d_in_lookup() (a predicate checking if dentry is in
the in-lookup state) and d_lookup_done() (tells the system that
we are done with lookup and if it's still marked as in-lookup, it
should cease to be such).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 13 +++++++++++++
 fs/namei.c             |  4 ++++
 include/linux/dcache.h | 18 ++++++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 20394fb6f967..0f1d93866e69 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -761,6 +761,8 @@ repeat:
 	/* Slow case: now with the dentry lock held */
 	rcu_read_unlock();
 
+	WARN_ON(d_in_lookup(dentry));
+
 	/* Unreachable? Get rid of it */
 	if (unlikely(d_unhashed(dentry)))
 		goto kill_it;
@@ -1746,6 +1748,7 @@ type_determined:
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	unsigned add_flags = d_flags_for_inode(inode);
+	WARN_ON(d_in_lookup(dentry));
 
 	spin_lock(&dentry->d_lock);
 	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2361,12 +2364,20 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
 
+void __d_lookup_done(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
+	/* more stuff will land here */
+}
+EXPORT_SYMBOL(__d_lookup_done);
 
 /* inode->i_lock held if inode is non-NULL */
 
 static inline void __d_add(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
+	if (unlikely(d_in_lookup(dentry)))
+		__d_lookup_done(dentry);
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2612,6 +2623,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	BUG_ON(d_ancestor(target, dentry));
 
 	dentry_lock_for_move(dentry, target);
+	if (unlikely(d_in_lookup(target)))
+		__d_lookup_done(target);
 
 	write_seqcount_begin(&dentry->d_seq);
 	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
diff --git a/fs/namei.c b/fs/namei.c
index c275635c4b9e..26e5f84e0c36 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1634,7 +1634,11 @@ static struct dentry *lookup_slow(const struct qstr *name,
 		inode_unlock(inode);
 		return ERR_PTR(-ENOMEM);
 	}
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_PAR_LOOKUP;
+	spin_unlock(&dentry->d_lock);
 	old = inode->i_op->lookup(inode, dentry, flags);
+	d_lookup_done(dentry);
 	if (unlikely(old)) {
 		dput(dentry);
 		dentry = old;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4bb4de8d95ea..9a7aa890b642 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -232,6 +232,8 @@ struct dentry_operations {
 #define DCACHE_ENCRYPTED_WITH_KEY	0x04000000 /* dir is encrypted with a valid key */
 #define DCACHE_OP_REAL			0x08000000
 
+#define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
+
 extern seqlock_t rename_lock;
 
 /*
@@ -367,6 +369,22 @@ static inline void dont_mount(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 }
 
+extern void __d_lookup_done(struct dentry *);
+
+static inline int d_in_lookup(struct dentry *dentry)
+{
+	return dentry->d_flags & DCACHE_PAR_LOOKUP;
+}
+
+static inline void d_lookup_done(struct dentry *dentry)
+{
+	if (unlikely(d_in_lookup(dentry))) {
+		spin_lock(&dentry->d_lock);
+		__d_lookup_done(dentry);
+		spin_unlock(&dentry->d_lock);
+	}
+}
+
 extern void dput(struct dentry *);
 
 static inline bool d_managed(const struct dentry *dentry)
-- 
cgit v1.2.3


From 84e710da2a1dfacfc87f604869a4d22df91ce6cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 00:58:55 -0400
Subject: parallel lookups machinery, part 2

We'll need to verify that there's neither a hashed nor in-lookup
dentry with desired parent/name before adding to in-lookup set.

One possible solution would be to hold the parent's ->d_lock through
both checks, but while the in-lookup set is relatively small at any
time, dcache is not.  And holding the parent's ->d_lock through
something like __d_lookup_rcu() would suck too badly.

So we leave the parent's ->d_lock alone, which means that we watch
out for the following scenario:
	* we verify that there's no hashed match
	* existing in-lookup match gets hashed by another process
	* we verify that there's no in-lookup matches and decide
that everything's fine.

Solution: per-directory kinda-sorta seqlock, bumped around the times
we hash something that used to be in-lookup or move (and hash)
something in place of in-lookup.  Then the above would turn into
	* read the counter
	* do dcache lookup
	* if no matches found, check for in-lookup matches
	* if there had been none of those either, check if the
counter has changed; repeat if it has.

The "kinda-sorta" part is due to the fact that we don't have much spare
space in inode.  There is a spare word (shared with i_bdev/i_cdev/i_pipe),
so the counter part is not a problem, but spinlock is a different story.

We could use the parent's ->d_lock, and it would be less painful in
terms of contention, for __d_add() it would be rather inconvenient to
grab; we could do that (using lock_parent()), but...

Fortunately, we can get serialization on the counter itself, and it
might be a good idea in general; we can use cmpxchg() in a loop to
get from even to odd and smp_store_release() from odd to even.

This commit adds the counter and updating logics; the readers will be
added in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  8 ++++++++
 fs/dcache.c                       | 34 ++++++++++++++++++++++++++++++++--
 fs/inode.c                        |  1 +
 include/linux/fs.h                |  1 +
 mm/shmem.c                        |  3 ++-
 5 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 57bb3754a027..8810e2367fe6 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -531,3 +531,11 @@ in your dentry operations instead.
 	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
 	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
 	called before we attach dentry to inode.
+--
+[mandatory]
+	symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/
+	i_pipe/i_link union zeroed out at inode eviction.  As the result, you can't
+	assume that non-NULL value in ->i_nlink at ->destroy_inode() implies that
+	it's a symlink.  Checking ->i_mode is really needed now.  In-tree we had
+	to fix shmem_destroy_callback() that used to take that kind of shortcut;
+	watch out, since that shortcut is no longer valid.
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f1d93866e69..10988f7e5a23 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2364,6 +2364,22 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
 
+static inline unsigned start_dir_add(struct inode *dir)
+{
+
+	for (;;) {
+		unsigned n = dir->i_dir_seq;
+		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+			return n;
+		cpu_relax();
+	}
+}
+
+static inline void end_dir_add(struct inode *dir, unsigned n)
+{
+	smp_store_release(&dir->i_dir_seq, n + 2);
+}
+
 void __d_lookup_done(struct dentry *dentry)
 {
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
@@ -2375,9 +2391,14 @@ EXPORT_SYMBOL(__d_lookup_done);
 
 static inline void __d_add(struct dentry *dentry, struct inode *inode)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	spin_lock(&dentry->d_lock);
-	if (unlikely(d_in_lookup(dentry)))
+	if (unlikely(d_in_lookup(dentry))) {
+		dir = dentry->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(dentry);
+	}
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2387,6 +2408,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		__fsnotify_d_instantiate(dentry);
 	}
 	_d_rehash(dentry);
+	if (dir)
+		end_dir_add(dir, n);
 	spin_unlock(&dentry->d_lock);
 	if (inode)
 		spin_unlock(&inode->i_lock);
@@ -2616,6 +2639,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
 static void __d_move(struct dentry *dentry, struct dentry *target,
 		     bool exchange)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
@@ -2623,8 +2648,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	BUG_ON(d_ancestor(target, dentry));
 
 	dentry_lock_for_move(dentry, target);
-	if (unlikely(d_in_lookup(target)))
+	if (unlikely(d_in_lookup(target))) {
+		dir = target->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(target);
+	}
 
 	write_seqcount_begin(&dentry->d_seq);
 	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
@@ -2674,6 +2702,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	write_seqcount_end(&target->d_seq);
 	write_seqcount_end(&dentry->d_seq);
 
+	if (dir)
+		end_dir_add(dir, n);
 	dentry_unlock_for_move(dentry, target);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 4202aac99464..4b884f73214e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
 	inode->i_link = NULL;
+	inode->i_dir_seq = 0;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6d0fa9174a24..00cecc5a2f75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -684,6 +684,7 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
 		char			*i_link;
+		unsigned		i_dir_seq;
 	};
 
 	__u32			i_generation;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4640699b209b..e684a9140228 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3123,7 +3123,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 static void shmem_destroy_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kfree(inode->i_link);
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_link);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-- 
cgit v1.2.3


From 94bdd655caba2080ae81d83d756d325abdffcb9f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 02:42:04 -0400
Subject: parallel lookups machinery, part 3

We will need to be able to check if there is an in-lookup
dentry with matching parent/name.  Right now it's impossible,
but as soon as start locking directories shared such beasts
will appear.

Add a secondary hash for locating those.  Hash chains go through
the same space where d_alias will be once it's not in-lookup anymore.
Search is done under the same bitlock we use for modifications -
with the primary hash we can rely on d_rehash() into the wrong
chain being the worst that could happen, but here the pointers are
buggered once it's removed from the chain.  On the other hand,
the chains are not going to be long and normally we'll end up
adding to the chain anyway.  That allows us to avoid bothering with
->d_lock when doing the comparisons - everything is stable until
removed from chain.

New helper: d_alloc_parallel().  Right now it allocates, verifies
that no hashed and in-lookup matches exist and adds to in-lookup
hash.

Returns ERR_PTR() for error, hashed match (in the unlikely case it's
been found) or new dentry.  In-lookup matches trigger BUG() for
now; that will change in the next commit when we introduce waiting
for ongoing lookup to finish.  Note that in-lookup matches won't be
possible until we actually go for shared locking.

lookup_slow() switched to use of d_alloc_parallel().

Again, these commits are separated only for making it easier to
review.  All this machinery will start doing something useful only
when we go for shared locking; it's just that the combination is
too large for my taste.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/namei.c             |  44 +++++++++------------
 include/linux/dcache.h |   2 +
 3 files changed, 125 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 10988f7e5a23..ea2de7c19b08 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -111,6 +111,17 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
 	return dentry_hashtable + hash_32(hash, d_hash_shift);
 }
 
+#define IN_LOOKUP_SHIFT 10
+static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];
+
+static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
+					unsigned int hash)
+{
+	hash += (unsigned long) parent / L1_CACHE_BYTES;
+	return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
+}
+
+
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
@@ -2380,9 +2391,102 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
 	smp_store_release(&dir->i_dir_seq, n + 2);
 }
 
+struct dentry *d_alloc_parallel(struct dentry *parent,
+				const struct qstr *name)
+{
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct hlist_bl_head *b = in_lookup_hash(parent, hash);
+	struct hlist_bl_node *node;
+	struct dentry *new = d_alloc(parent, name);
+	struct dentry *dentry;
+	unsigned seq, r_seq, d_seq;
+
+	if (unlikely(!new))
+		return ERR_PTR(-ENOMEM);
+
+retry:
+	rcu_read_lock();
+	seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1;
+	r_seq = read_seqbegin(&rename_lock);
+	dentry = __d_lookup_rcu(parent, name, &d_seq);
+	if (unlikely(dentry)) {
+		if (!lockref_get_not_dead(&dentry->d_lockref)) {
+			rcu_read_unlock();
+			goto retry;
+		}
+		if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
+			rcu_read_unlock();
+			dput(dentry);
+			goto retry;
+		}
+		rcu_read_unlock();
+		dput(new);
+		return dentry;
+	}
+	if (unlikely(read_seqretry(&rename_lock, r_seq))) {
+		rcu_read_unlock();
+		goto retry;
+	}
+	hlist_bl_lock(b);
+	if (unlikely(parent->d_inode->i_dir_seq != seq)) {
+		hlist_bl_unlock(b);
+		rcu_read_unlock();
+		goto retry;
+	}
+	rcu_read_unlock();
+	/*
+	 * No changes for the parent since the beginning of d_lookup().
+	 * Since all removals from the chain happen with hlist_bl_lock(),
+	 * any potential in-lookup matches are going to stay here until
+	 * we unlock the chain.  All fields are stable in everything
+	 * we encounter.
+	 */
+	hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
+		if (dentry->d_name.hash != hash)
+			continue;
+		if (dentry->d_parent != parent)
+			continue;
+		if (d_unhashed(dentry))
+			continue;
+		if (parent->d_flags & DCACHE_OP_COMPARE) {
+			int tlen = dentry->d_name.len;
+			const char *tname = dentry->d_name.name;
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
+				continue;
+		} else {
+			if (dentry->d_name.len != len)
+				continue;
+			if (dentry_cmp(dentry, str, len))
+				continue;
+		}
+		dget(dentry);
+		hlist_bl_unlock(b);
+		/* impossible until we actually enable parallel lookups */
+		BUG();
+		/* and this will be "wait for it to stop being in-lookup" */
+		/* this one will be handled in the next commit */
+		dput(new);
+		return dentry;
+	}
+	/* we can't take ->d_lock here; it's OK, though. */
+	new->d_flags |= DCACHE_PAR_LOOKUP;
+	hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
+	hlist_bl_unlock(b);
+	return new;
+}
+EXPORT_SYMBOL(d_alloc_parallel);
+
 void __d_lookup_done(struct dentry *dentry)
 {
+	struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent,
+						 dentry->d_name.hash);
+	hlist_bl_lock(b);
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
+	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
+	hlist_bl_unlock(b);
+	INIT_HLIST_NODE(&dentry->d_u.d_alias);
 	/* more stuff will land here */
 }
 EXPORT_SYMBOL(__d_lookup_done);
diff --git a/fs/namei.c b/fs/namei.c
index 26e5f84e0c36..aa04320e1f37 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1603,46 +1603,40 @@ static struct dentry *lookup_slow(const struct qstr *name,
 				  struct dentry *dir,
 				  unsigned int flags)
 {
-	struct dentry *dentry, *old;
+	struct dentry *dentry = ERR_PTR(-ENOENT), *old;
 	struct inode *inode = dir->d_inode;
 
 	inode_lock(inode);
 	/* Don't go there if it's already dead */
-	if (unlikely(IS_DEADDIR(inode))) {
-		inode_unlock(inode);
-		return ERR_PTR(-ENOENT);
-	}
-	dentry = d_lookup(dir, name);
-	if (unlikely(dentry)) {
+	if (unlikely(IS_DEADDIR(inode)))
+		goto out;
+again:
+	dentry = d_alloc_parallel(dir, name);
+	if (IS_ERR(dentry))
+		goto out;
+	if (unlikely(!d_in_lookup(dentry))) {
 		if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
 		    !(flags & LOOKUP_NO_REVAL)) {
 			int error = d_revalidate(dentry, flags);
 			if (unlikely(error <= 0)) {
-				if (!error)
+				if (!error) {
 					d_invalidate(dentry);
+					dput(dentry);
+					goto again;
+				}
 				dput(dentry);
 				dentry = ERR_PTR(error);
 			}
 		}
-		if (dentry) {
-			inode_unlock(inode);
-			return dentry;
+	} else {
+		old = inode->i_op->lookup(inode, dentry, flags);
+		d_lookup_done(dentry);
+		if (unlikely(old)) {
+			dput(dentry);
+			dentry = old;
 		}
 	}
-	dentry = d_alloc(dir, name);
-	if (unlikely(!dentry)) {
-		inode_unlock(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_PAR_LOOKUP;
-	spin_unlock(&dentry->d_lock);
-	old = inode->i_op->lookup(inode, dentry, flags);
-	d_lookup_done(dentry);
-	if (unlikely(old)) {
-		dput(dentry);
-		dentry = old;
-	}
+out:
 	inode_unlock(inode);
 	return dentry;
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9a7aa890b642..3eea562f5f27 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -131,6 +131,7 @@ struct dentry {
 	 */
 	union {
 		struct hlist_node d_alias;	/* inode alias list */
+		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
 	 	struct rcu_head d_rcu;
 	} d_u;
 };
@@ -250,6 +251,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
 /* allocate/de-allocate */
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
+extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
-- 
cgit v1.2.3


From d9171b9345261e0d941d92fdda5672b5db67f968 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 03:33:13 -0400
Subject: parallel lookups machinery, part 4 (and last)

If we *do* run into an in-lookup match, we need to wait for it to
cease being in-lookup.  Fortunately, we do have unused space in
in-lookup dentries - d_lru is never looked at until it stops being
in-lookup.

So we can stash a pointer to wait_queue_head from stack frame of
the caller of ->lookup().  Some precautions are needed while
waiting, but it's not that hard - we do hold a reference to dentry
we are waiting for, so it can't go away.  If it's found to be
in-lookup the wait_queue_head is still alive and will remain so
at least while ->d_lock is held.  Moreover, the condition we
are waiting for becomes true at the same point where everything
on that wq gets woken up, so we can just add ourselves to the
queue once.

d_alloc_parallel() gets a pointer to wait_queue_head_t from its
caller; lookup_slow() adjusted, d_add_ci() taught to use
d_alloc_parallel() if the dentry passed to it happens to be
in-lookup one (i.e. if it's been called from the parallel lookup).

That's pretty much it - all that remains is to switch ->i_mutex
to rwsem and have lookup_slow() take it shared.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 94 +++++++++++++++++++++++++++++++++++++++-----------
 fs/namei.c             |  3 +-
 include/linux/dcache.h |  8 +++--
 3 files changed, 82 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index ea2de7c19b08..59fcffcbf096 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1987,28 +1987,36 @@ EXPORT_SYMBOL(d_obtain_root);
 struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 			struct qstr *name)
 {
-	struct dentry *found;
-	struct dentry *new;
+	struct dentry *found, *res;
 
 	/*
 	 * First check if a dentry matching the name already exists,
 	 * if not go ahead and create it now.
 	 */
 	found = d_hash_and_lookup(dentry->d_parent, name);
-	if (!found) {
-		new = d_alloc(dentry->d_parent, name);
-		if (!new) {
-			found = ERR_PTR(-ENOMEM);
-		} else {
-			found = d_splice_alias(inode, new);
-			if (found) {
-				dput(new);
-				return found;
-			}
-			return new;
+	if (found) {
+		iput(inode);
+		return found;
+	}
+	if (d_in_lookup(dentry)) {
+		found = d_alloc_parallel(dentry->d_parent, name,
+					dentry->d_wait);
+		if (IS_ERR(found) || !d_in_lookup(found)) {
+			iput(inode);
+			return found;
 		}
+	} else {
+		found = d_alloc(dentry->d_parent, name);
+		if (!found) {
+			iput(inode);
+			return ERR_PTR(-ENOMEM);
+		} 
+	}
+	res = d_splice_alias(inode, found);
+	if (res) {
+		dput(found);
+		return res;
 	}
-	iput(inode);
 	return found;
 }
 EXPORT_SYMBOL(d_add_ci);
@@ -2391,8 +2399,23 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
 	smp_store_release(&dir->i_dir_seq, n + 2);
 }
 
+static void d_wait_lookup(struct dentry *dentry)
+{
+	if (d_in_lookup(dentry)) {
+		DECLARE_WAITQUEUE(wait, current);
+		add_wait_queue(dentry->d_wait, &wait);
+		do {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock(&dentry->d_lock);
+			schedule();
+			spin_lock(&dentry->d_lock);
+		} while (d_in_lookup(dentry));
+	}
+}
+
 struct dentry *d_alloc_parallel(struct dentry *parent,
-				const struct qstr *name)
+				const struct qstr *name,
+				wait_queue_head_t *wq)
 {
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
@@ -2463,18 +2486,47 @@ retry:
 		}
 		dget(dentry);
 		hlist_bl_unlock(b);
-		/* impossible until we actually enable parallel lookups */
-		BUG();
-		/* and this will be "wait for it to stop being in-lookup" */
-		/* this one will be handled in the next commit */
+		/* somebody is doing lookup for it right now; wait for it */
+		spin_lock(&dentry->d_lock);
+		d_wait_lookup(dentry);
+		/*
+		 * it's not in-lookup anymore; in principle we should repeat
+		 * everything from dcache lookup, but it's likely to be what
+		 * d_lookup() would've found anyway.  If it is, just return it;
+		 * otherwise we really have to repeat the whole thing.
+		 */
+		if (unlikely(dentry->d_name.hash != hash))
+			goto mismatch;
+		if (unlikely(dentry->d_parent != parent))
+			goto mismatch;
+		if (unlikely(d_unhashed(dentry)))
+			goto mismatch;
+		if (parent->d_flags & DCACHE_OP_COMPARE) {
+			int tlen = dentry->d_name.len;
+			const char *tname = dentry->d_name.name;
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
+				goto mismatch;
+		} else {
+			if (unlikely(dentry->d_name.len != len))
+				goto mismatch;
+			if (unlikely(dentry_cmp(dentry, str, len)))
+				goto mismatch;
+		}
+		/* OK, it *is* a hashed match; return it */
+		spin_unlock(&dentry->d_lock);
 		dput(new);
 		return dentry;
 	}
 	/* we can't take ->d_lock here; it's OK, though. */
 	new->d_flags |= DCACHE_PAR_LOOKUP;
+	new->d_wait = wq;
 	hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
 	hlist_bl_unlock(b);
 	return new;
+mismatch:
+	spin_unlock(&dentry->d_lock);
+	dput(dentry);
+	goto retry;
 }
 EXPORT_SYMBOL(d_alloc_parallel);
 
@@ -2485,9 +2537,11 @@ void __d_lookup_done(struct dentry *dentry)
 	hlist_bl_lock(b);
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
 	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
+	wake_up_all(dentry->d_wait);
+	dentry->d_wait = NULL;
 	hlist_bl_unlock(b);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
-	/* more stuff will land here */
+	INIT_LIST_HEAD(&dentry->d_lru);
 }
 EXPORT_SYMBOL(__d_lookup_done);
 
diff --git a/fs/namei.c b/fs/namei.c
index aa04320e1f37..7babb5e5f276 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,13 +1605,14 @@ static struct dentry *lookup_slow(const struct qstr *name,
 {
 	struct dentry *dentry = ERR_PTR(-ENOENT), *old;
 	struct inode *inode = dir->d_inode;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	inode_lock(inode);
 	/* Don't go there if it's already dead */
 	if (unlikely(IS_DEADDIR(inode)))
 		goto out;
 again:
-	dentry = d_alloc_parallel(dir, name);
+	dentry = d_alloc_parallel(dir, name, &wq);
 	if (IS_ERR(dentry))
 		goto out;
 	if (unlikely(!d_in_lookup(dentry))) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3eea562f5f27..6b1d8bbb3496 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -123,7 +123,10 @@ struct dentry {
 	unsigned long d_time;		/* used by d_revalidate */
 	void *d_fsdata;			/* fs-specific data */
 
-	struct list_head d_lru;		/* LRU list */
+	union {
+		struct list_head d_lru;		/* LRU list */
+		wait_queue_head_t *d_wait;	/* in-lookup ones only */
+	};
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
 	/*
@@ -251,7 +254,8 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
 /* allocate/de-allocate */
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
-extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *);
+extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
+					wait_queue_head_t *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
-- 
cgit v1.2.3


From 9902af79c01a8e39bb99b922fa3eef6d4ea23d69 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 15:08:36 -0400
Subject: parallel lookups: actual switch to rwsem

ta-da!

The main issue is the lack of down_write_killable(), so the places
like readdir.c switched to plain inode_lock(); once killable
variants of rwsem primitives appear, that'll be dealt with.

lockdep side also might need more work

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 18 ++++++++++++++++++
 fs/btrfs/ioctl.c                  | 18 +++++++++++-------
 fs/configfs/inode.c               |  2 +-
 fs/dcache.c                       |  9 +++++----
 fs/gfs2/ops_fstype.c              |  2 +-
 fs/inode.c                        | 12 ++++++------
 fs/namei.c                        |  4 ++--
 fs/ocfs2/inode.c                  |  2 +-
 fs/overlayfs/readdir.c            |  4 +++-
 fs/readdir.c                      |  7 ++++---
 include/linux/fs.h                | 27 +++++++++++++++++++++------
 11 files changed, 73 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 8810e2367fe6..1567a53857bd 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -539,3 +539,21 @@ in your dentry operations instead.
 	it's a symlink.  Checking ->i_mode is really needed now.  In-tree we had
 	to fix shmem_destroy_callback() that used to take that kind of shortcut;
 	watch out, since that shortcut is no longer valid.
+--
+[mandatory]
+	->i_mutex is replaced with ->i_rwsem now.  inode_lock() et.al. work as
+	they used to - they just take it exclusive.  However, ->lookup() may be
+	called with parent locked shared.  Its instances must not
+		* use d_instantiate) and d_rehash() separately - use d_add() or
+		  d_splice_alias() instead.
+		* use d_rehash() alone - call d_add(new_dentry, NULL) instead.
+		* in the unlikely case when (read-only) access to filesystem
+		  data structures needs exclusion for some reason, arrange it
+		  yourself.  None of the in-tree filesystems needed that.
+		* rely on ->d_parent and ->d_name not changing after dentry has
+		  been fed to d_add() or d_splice_alias().  Again, none of the
+		  in-tree instances relied upon that.
+	We are guaranteed that lookups of the same name in the same directory
+	will not happen in parallel ("same" in the sense of your ->d_compare()).
+	Lookups on different names in the same directory can and do happen in
+	parallel now.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5a23806ae418..0b8ba717175b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -837,9 +837,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 	struct dentry *dentry;
 	int error;
 
-	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	if (error == -EINTR)
-		return error;
+	inode_lock_nested(dir, I_MUTEX_PARENT);
+	// XXX: should've been
+	// mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	// if (error == -EINTR)
+	//	return error;
 
 	dentry = lookup_one_len(name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
@@ -2366,9 +2368,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out;
 
 
-	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	if (err == -EINTR)
-		goto out_drop_write;
+	inode_lock_nested(dir, I_MUTEX_PARENT);
+	// XXX: should've been
+	// err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	// if (err == -EINTR)
+	//	goto out_drop_write;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -2558,7 +2562,7 @@ out_dput:
 	dput(dentry);
 out_unlock_dir:
 	inode_unlock(dir);
-out_drop_write:
+//out_drop_write:
 	mnt_drop_write_file(file);
 out:
 	kfree(vol_args);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 03d124ae27d7..0387968e6f47 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -156,7 +156,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
 
 	if (depth > 0) {
 		if (depth <= ARRAY_SIZE(default_group_class)) {
-			lockdep_set_class(&inode->i_mutex,
+			lockdep_set_class(&inode->i_rwsem,
 					  &default_group_class[depth - 1]);
 		} else {
 			/*
diff --git a/fs/dcache.c b/fs/dcache.c
index 59fcffcbf096..e49ba7d1b957 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2932,7 +2932,8 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 static int __d_unalias(struct inode *inode,
 		struct dentry *dentry, struct dentry *alias)
 {
-	struct mutex *m1 = NULL, *m2 = NULL;
+	struct mutex *m1 = NULL;
+	struct rw_semaphore *m2 = NULL;
 	int ret = -ESTALE;
 
 	/* If alias and dentry share a parent, then no extra locks required */
@@ -2943,15 +2944,15 @@ static int __d_unalias(struct inode *inode,
 	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
 		goto out_err;
 	m1 = &dentry->d_sb->s_vfs_rename_mutex;
-	if (!inode_trylock(alias->d_parent->d_inode))
+	if (!inode_trylock_shared(alias->d_parent->d_inode))
 		goto out_err;
-	m2 = &alias->d_parent->d_inode->i_mutex;
+	m2 = &alias->d_parent->d_inode->i_rwsem;
 out_unalias:
 	__d_move(alias, dentry, false);
 	ret = 0;
 out_err:
 	if (m2)
-		mutex_unlock(m2);
+		up_read(m2);
 	if (m1)
 		mutex_unlock(m1);
 	return ret;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c09c63dcd7a2..45463600fb81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -824,7 +824,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	 * i_mutex on quota files is special. Since this inode is hidden system
 	 * file, we are safe to define locking ourselves.
 	 */
-	lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
+	lockdep_set_class(&sdp->sd_quota_inode->i_rwsem,
 			  &gfs2_quota_imutex_key);
 
 	error = gfs2_rindex_update(sdp);
diff --git a/fs/inode.c b/fs/inode.c
index 4b884f73214e..4ccbc21b30ce 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,8 +166,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
-	mutex_init(&inode->i_mutex);
-	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+	init_rwsem(&inode->i_rwsem);
+	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
 
 	atomic_set(&inode->i_dio_count, 0);
 
@@ -925,13 +925,13 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
 		struct file_system_type *type = inode->i_sb->s_type;
 
 		/* Set new key only if filesystem hasn't already changed it */
-		if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
+		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
 			/*
 			 * ensure nobody is actually holding i_mutex
 			 */
-			mutex_destroy(&inode->i_mutex);
-			mutex_init(&inode->i_mutex);
-			lockdep_set_class(&inode->i_mutex,
+			// mutex_destroy(&inode->i_mutex);
+			init_rwsem(&inode->i_rwsem);
+			lockdep_set_class(&inode->i_rwsem,
 					  &type->i_mutex_dir_key);
 		}
 	}
diff --git a/fs/namei.c b/fs/namei.c
index 7babb5e5f276..8249852b5fc6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1607,7 +1607,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	struct inode *inode = dir->d_inode;
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
-	inode_lock(inode);
+	inode_lock_shared(inode);
 	/* Don't go there if it's already dead */
 	if (unlikely(IS_DEADDIR(inode)))
 		goto out;
@@ -1638,7 +1638,7 @@ again:
 		}
 	}
 out:
-	inode_unlock(inode);
+	inode_unlock_shared(inode);
 	return dentry;
 }
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 12f4a9e9800f..0748777f2e2a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -262,7 +262,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
 	if (args->fi_sysfile_type != 0)
-		lockdep_set_class(&inode->i_mutex,
+		lockdep_set_class(&inode->i_rwsem,
 			&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
 	if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
 	    args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 6ec1e43a9a54..da186ee4f846 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -218,7 +218,9 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
 	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
 	old_cred = override_creds(override_cred);
 
-	err = mutex_lock_killable(&dir->d_inode->i_mutex);
+	inode_lock(dir->d_inode);
+	err = 0;
+	// XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex);
 	if (!err) {
 		while (rdd->first_maybe_whiteout) {
 			p = rdd->first_maybe_whiteout;
diff --git a/fs/readdir.c b/fs/readdir.c
index e69ef3b79787..bf583e848a1a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -32,9 +32,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 	if (res)
 		goto out;
 
-	res = mutex_lock_killable(&inode->i_mutex);
-	if (res)
-		goto out;
+	inode_lock(inode);
+	// res = mutex_lock_killable(&inode->i_mutex);
+	// if (res)
+	//	goto out;
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00cecc5a2f75..3018f31f7aa0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -647,7 +647,7 @@ struct inode {
 
 	/* Misc */
 	unsigned long		i_state;
-	struct mutex		i_mutex;
+	struct rw_semaphore	i_rwsem;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 	unsigned long		dirtied_time_when;
@@ -734,27 +734,42 @@ enum inode_i_mutex_lock_class
 
 static inline void inode_lock(struct inode *inode)
 {
-	mutex_lock(&inode->i_mutex);
+	down_write(&inode->i_rwsem);
 }
 
 static inline void inode_unlock(struct inode *inode)
 {
-	mutex_unlock(&inode->i_mutex);
+	up_write(&inode->i_rwsem);
+}
+
+static inline void inode_lock_shared(struct inode *inode)
+{
+	down_read(&inode->i_rwsem);
+}
+
+static inline void inode_unlock_shared(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
 }
 
 static inline int inode_trylock(struct inode *inode)
 {
-	return mutex_trylock(&inode->i_mutex);
+	return down_write_trylock(&inode->i_rwsem);
+}
+
+static inline int inode_trylock_shared(struct inode *inode)
+{
+	return down_read_trylock(&inode->i_rwsem);
 }
 
 static inline int inode_is_locked(struct inode *inode)
 {
-	return mutex_is_locked(&inode->i_mutex);
+	return rwsem_is_locked(&inode->i_rwsem);
 }
 
 static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
 {
-	mutex_lock_nested(&inode->i_mutex, subclass);
+	down_write_nested(&inode->i_rwsem, subclass);
 }
 
 void lock_two_nondirectories(struct inode *, struct inode*);
-- 
cgit v1.2.3


From 63b6df14134ddd048984c8afadb46e721815bfc6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 20 Apr 2016 17:08:21 -0400
Subject: give readdir(2)/getdents(2)/etc. uniform exclusion with lseek()

same as read() on regular files has, and for the same reason.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c |  4 ++--
 fs/compat.c                 | 12 ++++++------
 fs/file.c                   |  5 +++++
 fs/open.c                   |  2 +-
 fs/read_write.c             | 12 ------------
 fs/readdir.c                | 12 ++++++------
 include/linux/file.h        | 13 +++++++++++++
 7 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 6cc08166ff00..ffb93f499c83 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -147,7 +147,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 		long __user *, basep)
 {
 	int error;
-	struct fd arg = fdget(fd);
+	struct fd arg = fdget_pos(fd);
 	struct osf_dirent_callback buf = {
 		.ctx.actor = osf_filldir,
 		.dirent = dirent,
@@ -164,7 +164,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 	if (count != buf.count)
 		error = count - buf.count;
 
-	fdput(arg);
+	fdput_pos(arg);
 	return error;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index a71936a3f4cb..8754e9aa14ad 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -884,7 +884,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	struct compat_readdir_callback buf = {
 		.ctx.actor = compat_fillonedir,
 		.dirent = dirent
@@ -897,7 +897,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 	if (buf.result)
 		error = buf.result;
 
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
 
@@ -975,7 +975,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	f = fdget(fd);
+	f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -989,7 +989,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
 
@@ -1062,7 +1062,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	f = fdget(fd);
+	f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -1077,7 +1077,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
diff --git a/fs/file.c b/fs/file.c
index 1fbc5c0555a9..6b1acdfe59da 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -784,6 +784,11 @@ unsigned long __fdget_pos(unsigned int fd)
 	return v;
 }
 
+void __f_unlock_pos(struct file *f)
+{
+	mutex_unlock(&f->f_pos_lock);
+}
+
 /*
  * We only lock f_pos if we have threads or if the file might be
  * shared with another process. In both cases we'll have an elevated
diff --git a/fs/open.c b/fs/open.c
index 17cb6b1dab75..938a658a5c6d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -713,7 +713,7 @@ static int do_dentry_open(struct file *f,
 	}
 
 	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
-	if (S_ISREG(inode->i_mode))
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
 		f->f_mode |= FMODE_ATOMIC_POS;
 
 	f->f_op = fops_get(inode->i_fop);
diff --git a/fs/read_write.c b/fs/read_write.c
index cf377cf9dfe3..69c7c3c2955c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -302,18 +302,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-static inline struct fd fdget_pos(int fd)
-{
-	return __to_fd(__fdget_pos(fd));
-}
-
-static inline void fdput_pos(struct fd f)
-{
-	if (f.flags & FDPUT_POS_UNLOCK)
-		mutex_unlock(&f.file->f_pos_lock);
-	fdput(f);
-}
-
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
diff --git a/fs/readdir.c b/fs/readdir.c
index bf583e848a1a..d7308b8f6cf7 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -112,7 +112,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	struct readdir_callback buf = {
 		.ctx.actor = fillonedir,
 		.dirent = dirent
@@ -125,7 +125,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 	if (buf.result)
 		error = buf.result;
 
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
 
@@ -209,7 +209,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	f = fdget(fd);
+	f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -223,7 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
 
@@ -290,7 +290,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	f = fdget(fd);
+	f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -305,6 +305,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
-	fdput(f);
+	fdput_pos(f);
 	return error;
 }
diff --git a/include/linux/file.h b/include/linux/file.h
index f87d30882a24..7444f5feda12 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -44,6 +44,7 @@ extern struct file *fget_raw(unsigned int fd);
 extern unsigned long __fdget(unsigned int fd);
 extern unsigned long __fdget_raw(unsigned int fd);
 extern unsigned long __fdget_pos(unsigned int fd);
+extern void __f_unlock_pos(struct file *);
 
 static inline struct fd __to_fd(unsigned long v)
 {
@@ -60,6 +61,18 @@ static inline struct fd fdget_raw(unsigned int fd)
 	return __to_fd(__fdget_raw(fd));
 }
 
+static inline struct fd fdget_pos(int fd)
+{
+	return __to_fd(__fdget_pos(fd));
+}
+
+static inline void fdput_pos(struct fd f)
+{
+	if (f.flags & FDPUT_POS_UNLOCK)
+		__f_unlock_pos(f.file);
+	fdput(f);
+}
+
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
-- 
cgit v1.2.3


From 6192269444ebfbfb42e23c7a6a93c76ffe4b5e51 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 20 Apr 2016 23:08:32 -0400
Subject: introduce a parallel variant of ->iterate()

New method: ->iterate_shared().  Same arguments as in ->iterate(),
called with the directory locked only shared.  Once all filesystems
switch, the old one will be gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 18 ++++++++++++++++++
 fs/coda/dir.c                     | 18 ++++++++++++------
 fs/exportfs/expfs.c               |  2 +-
 fs/readdir.c                      | 20 ++++++++++++++++----
 include/linux/fs.h                |  1 +
 5 files changed, 48 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 1567a53857bd..12c57abdaac9 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -557,3 +557,21 @@ in your dentry operations instead.
 	will not happen in parallel ("same" in the sense of your ->d_compare()).
 	Lookups on different names in the same directory can and do happen in
 	parallel now.
+--
+[recommended]
+	->iterate_shared() is added; it's a parallel variant of ->iterate().
+	Exclusion on struct file level is still provided (as well as that
+	between it and lseek on the same struct file), but if your directory
+	has been opened several times, you can get these called in parallel.
+	Exclusion between that method and all directory-modifying ones is
+	still provided, of course.
+
+	Often enough ->iterate() can serve as ->iterate_shared() without any
+	changes - it is a read-only operation, after all.  If you have any
+	per-inode or per-dentry in-core data structures modified by ->iterate(),
+	you might need something to serialize the access to them.  If you
+	do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for
+	that; look for in-tree examples.
+
+	Old method is only used if the new one is absent; eventually it will
+	be removed.  Switch while you still can; the old one won't stay.
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 42e731b8c80a..6fb8672c0892 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -424,16 +424,22 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (host_file->f_op->iterate) {
+	if (host_file->f_op->iterate || host_file->f_op->iterate_shared) {
 		struct inode *host_inode = file_inode(host_file);
-
-		inode_lock(host_inode);
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
-			ret = host_file->f_op->iterate(host_file, ctx);
-			file_accessed(host_file);
+			if (host_file->f_op->iterate_shared) {
+				inode_lock_shared(host_inode);
+				ret = host_file->f_op->iterate_shared(host_file, ctx);
+				file_accessed(host_file);
+				inode_unlock_shared(host_inode);
+			} else {
+				inode_lock(host_inode);
+				ret = host_file->f_op->iterate(host_file, ctx);
+				file_accessed(host_file);
+				inode_unlock(host_inode);
+			}
 		}
-		inode_unlock(host_inode);
 		return ret;
 	}
 	/* Venus: we must read Venus dirents from a file */
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 402c5caab5ca..207ba8d627ca 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -312,7 +312,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->iterate)
+	if (!file->f_op->iterate && !file->f_op->iterate_shared)
 		goto out_close;
 
 	buffer.sequence = 0;
diff --git a/fs/readdir.c b/fs/readdir.c
index d7308b8f6cf7..a86c6c04b9bc 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -24,15 +24,21 @@
 int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
+	bool shared = false;
 	int res = -ENOTDIR;
-	if (!file->f_op->iterate)
+	if (file->f_op->iterate_shared)
+		shared = true;
+	else if (!file->f_op->iterate)
 		goto out;
 
 	res = security_file_permission(file, MAY_READ);
 	if (res)
 		goto out;
 
-	inode_lock(inode);
+	if (shared)
+		inode_lock_shared(inode);
+	else
+		inode_lock(inode);
 	// res = mutex_lock_killable(&inode->i_mutex);
 	// if (res)
 	//	goto out;
@@ -40,12 +46,18 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
 		ctx->pos = file->f_pos;
-		res = file->f_op->iterate(file, ctx);
+		if (shared)
+			res = file->f_op->iterate_shared(file, ctx);
+		else
+			res = file->f_op->iterate(file, ctx);
 		file->f_pos = ctx->pos;
 		fsnotify_access(file);
 		file_accessed(file);
 	}
-	inode_unlock(inode);
+	if (shared)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
 out:
 	return res;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3018f31f7aa0..3dc0258a2b64 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1674,6 +1674,7 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
+	int (*iterate_shared) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
-- 
cgit v1.2.3


From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 2 May 2016 21:30:04 -0400
Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic

Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h       |  3 --
 kernel/trace/trace_events_filter.c | 71 ++++++--------------------------------
 2 files changed, 10 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3111a1efdad6..ba6456302e34 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -214,7 +214,6 @@ enum {
 	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
 	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
-	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
 	TRACE_EVENT_FL_KPROBE_BIT,
 	TRACE_EVENT_FL_UPROBE_BIT,
@@ -229,7 +228,6 @@ enum {
  *  WAS_ENABLED   - Set and stays set when an event was ever enabled
  *                    (used for module unloading, if a module event is enabled,
  *                     it is best to clear the buffers that used it).
- *  USE_CALL_FILTER - For trace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  *  KPROBE        - Event is a kprobe
  *  UPROBE        - Event is a uprobe
@@ -240,7 +238,6 @@ enum {
 	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
 	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
-	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
 	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 	TRACE_EVENT_FL_UPROBE		= (1 << TRACE_EVENT_FL_UPROBE_BIT),
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b3f5051cd4e9..d1d27bf37a19 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps,
 
 static inline struct event_filter *event_filter(struct trace_event_file *file)
 {
-	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		return file->event_call->filter;
-	else
-		return file->filter;
+	return file->filter;
 }
 
 /* caller must hold event_mutex */
@@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags &= ~EVENT_FILE_FL_FILTERED;
+	file->flags &= ~EVENT_FILE_FL_FILTERED;
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 
 static inline void __remove_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	filter_disable(file);
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		remove_filter_string(call->filter);
-	else
-		remove_filter_string(file->filter);
+	remove_filter_string(file->filter);
 }
 
 static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
 
 static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
-		__free_filter(call->filter);
-		call->filter = NULL;
-	} else {
-		__free_filter(file->filter);
-		file->filter = NULL;
-	}
+	__free_filter(file->filter);
+	file->filter = NULL;
 }
 
 static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
@@ -1718,69 +1698,38 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags |= EVENT_FILE_FL_FILTERED;
+	file->flags |= EVENT_FILE_FL_FILTERED;
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		rcu_assign_pointer(call->filter, filter);
-	else
-		rcu_assign_pointer(file->filter, filter);
+	rcu_assign_pointer(file->filter, filter);
 }
 
 static inline void event_clear_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		RCU_INIT_POINTER(call->filter, NULL);
-	else
-		RCU_INIT_POINTER(file->filter, NULL);
+	RCU_INIT_POINTER(file->filter, NULL);
 }
 
 static inline void
 event_set_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline void
 event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline bool
 event_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
 		return true;
 
-	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
-	    (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
-		return true;
-
 	return false;
 }
 
-- 
cgit v1.2.3


From c0ef079ca791ef9e057ac748051425a768c9e192 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 03:29:09 +0200
Subject: netdevice: shrink size of struct netdev_queue

- trans_timeout is incremented when tx queue timed out (tx watchdog).
- tx_maxrate is set via sysfs

Moving tx_maxrate to read-mostly part shrinks the struct by 64 bytes.
While at it, also move trans_timeout (it is out-of-place in the
'write-mostly' part).

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 52914a854386..f2182594160e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -569,6 +569,12 @@ struct netdev_queue {
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
 	int			numa_node;
 #endif
+	unsigned long		tx_maxrate;
+	/*
+	 * Number of TX timeouts for this queue
+	 * (/sys/class/net/DEV/Q/trans_timeout)
+	 */
+	unsigned long		trans_timeout;
 /*
  * write-mostly part
  */
@@ -579,18 +585,11 @@ struct netdev_queue {
 	 */
 	unsigned long		trans_start;
 
-	/*
-	 * Number of TX timeouts for this queue
-	 * (/sys/class/net/DEV/Q/trans_timeout)
-	 */
-	unsigned long		trans_timeout;
-
 	unsigned long		state;
 
 #ifdef CONFIG_BQL
 	struct dql		dql;
 #endif
-	unsigned long		tx_maxrate;
 } ____cacheline_aligned_in_smp;
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
-- 
cgit v1.2.3


From 9580bf2edb402b3afaf9c5a4efb6953f993ef52e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 30 Apr 2016 10:19:29 -0700
Subject: net: relax expensive skb_unclone() in iptunnel_handle_offloads()

Locally generated TCP GSO packets having to go through a GRE/SIT/IPIP
tunnel have to go through an expensive skb_unclone()

Reallocating skb->head is a lot of work.

Test should really check if a 'real clone' of the packet was done.

TCP does not care if the original gso_type is changed while the packet
travels in the stack.

This adds skb_header_unclone() which is a variant of skb_clone()
using skb_header_cloned() check instead of skb_cloned().

This variant can probably be used from other points.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 10 ++++++++++
 net/ipv4/ip_tunnel_core.c |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c84a5a1078c5..c413c588a24f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1325,6 +1325,16 @@ static inline int skb_header_cloned(const struct sk_buff *skb)
 	return dataref != 1;
 }
 
+static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
+{
+	might_sleep_if(gfpflags_allow_blocking(pri));
+
+	if (skb_header_cloned(skb))
+		return pskb_expand_head(skb, 0, 0, pri);
+
+	return 0;
+}
+
 /**
  *	skb_header_release - release reference to header
  *	@skb: buffer to operate on
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 786fa7ca28e0..9118b0e640ba 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -157,7 +157,7 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 	}
 
 	if (skb_is_gso(skb)) {
-		err = skb_unclone(skb, GFP_ATOMIC);
+		err = skb_header_unclone(skb, GFP_ATOMIC);
 		if (unlikely(err))
 			return err;
 		skb_shinfo(skb)->gso_type |= gso_type_mask;
-- 
cgit v1.2.3


From 502d6df11ae394301470703fa6e485a0dc133401 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Mon, 11 Apr 2016 16:32:54 +0100
Subject: irqchip/gic-v2: Parse and export virtual GIC information

For now, the firmware tables are parsed 2 times: once in the GIC
drivers, the other timer when initializing the vGIC. It means code
duplication and make more tedious to add the support for another
firmware table (like ACPI).

Introduce a new structure and set of helpers to get/set the virtual GIC
information. Also fill up the structure for GICv2.

Signed-off-by: Julien Grall <julien.grall@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 drivers/irqchip/irq-gic-common.c       | 13 ++++++
 drivers/irqchip/irq-gic-common.h       |  3 ++
 drivers/irqchip/irq-gic.c              | 76 +++++++++++++++++++++++++++++++++-
 include/linux/irqchip/arm-gic-common.h | 33 +++++++++++++++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/irqchip/arm-gic-common.h

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c
index f174ce0ca361..2e9443be2b14 100644
--- a/drivers/irqchip/irq-gic-common.c
+++ b/drivers/irqchip/irq-gic-common.c
@@ -21,6 +21,19 @@
 
 #include "irq-gic-common.h"
 
+static const struct gic_kvm_info *gic_kvm_info;
+
+const struct gic_kvm_info *gic_get_kvm_info(void)
+{
+	return gic_kvm_info;
+}
+
+void gic_set_kvm_info(const struct gic_kvm_info *info)
+{
+	BUG_ON(gic_kvm_info != NULL);
+	gic_kvm_info = info;
+}
+
 void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
 		void *data)
 {
diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h
index fff697db8e22..205e5fddf6da 100644
--- a/drivers/irqchip/irq-gic-common.h
+++ b/drivers/irqchip/irq-gic-common.h
@@ -19,6 +19,7 @@
 
 #include <linux/of.h>
 #include <linux/irqdomain.h>
+#include <linux/irqchip/arm-gic-common.h>
 
 struct gic_quirk {
 	const char *desc;
@@ -35,4 +36,6 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void));
 void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
 		void *data);
 
+void gic_set_kvm_info(const struct gic_kvm_info *info);
+
 #endif /* _IRQ_GIC_COMMON_H */
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 7a73786596cd..3f1d9fd3a462 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -102,6 +102,8 @@ static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
 
 static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;
 
+static struct gic_kvm_info gic_v2_kvm_info;
+
 #ifdef CONFIG_GIC_NON_BANKED
 static void __iomem *gic_get_percpu_base(union gic_base *base)
 {
@@ -1189,6 +1191,29 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base)
 	return true;
 }
 
+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	int ret;
+	struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+	struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+	gic_v2_kvm_info.type = GIC_V2;
+
+	gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+	if (!gic_v2_kvm_info.maint_irq)
+		return;
+
+	ret = of_address_to_resource(node, 2, vctrl_res);
+	if (ret)
+		return;
+
+	ret = of_address_to_resource(node, 3, vcpu_res);
+	if (ret)
+		return;
+
+	gic_set_kvm_info(&gic_v2_kvm_info);
+}
+
 int __init
 gic_of_init(struct device_node *node, struct device_node *parent)
 {
@@ -1218,8 +1243,10 @@ gic_of_init(struct device_node *node, struct device_node *parent)
 
 	__gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset,
 			 &node->fwnode);
-	if (!gic_cnt)
+	if (!gic_cnt) {
 		gic_init_physaddr(node);
+		gic_of_setup_kvm_info(node);
+	}
 
 	if (parent) {
 		irq = irq_of_parse_and_map(node, 0);
@@ -1248,6 +1275,10 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init);
 static struct
 {
 	phys_addr_t cpu_phys_base;
+	u32 maint_irq;
+	int maint_irq_mode;
+	phys_addr_t vctrl_base;
+	phys_addr_t vcpu_base;
 } acpi_data __initdata;
 
 static int __init
@@ -1272,6 +1303,12 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
 		return -EINVAL;
 
 	acpi_data.cpu_phys_base = gic_cpu_base;
+	acpi_data.maint_irq = processor->vgic_interrupt;
+	acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+				    ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+	acpi_data.vctrl_base = processor->gich_base_address;
+	acpi_data.vcpu_base = processor->gicv_base_address;
+
 	cpu_base_assigned = 1;
 	return 0;
 }
@@ -1302,6 +1339,41 @@ static bool __init gic_validate_dist(struct acpi_subtable_header *header,
 
 #define ACPI_GICV2_DIST_MEM_SIZE	(SZ_4K)
 #define ACPI_GIC_CPU_IF_MEM_SIZE	(SZ_8K)
+#define ACPI_GICV2_VCTRL_MEM_SIZE	(SZ_4K)
+#define ACPI_GICV2_VCPU_MEM_SIZE	(SZ_8K)
+
+static void __init gic_acpi_setup_kvm_info(void)
+{
+	int irq;
+	struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+	struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+	gic_v2_kvm_info.type = GIC_V2;
+
+	if (!acpi_data.vctrl_base)
+		return;
+
+	vctrl_res->flags = IORESOURCE_MEM;
+	vctrl_res->start = acpi_data.vctrl_base;
+	vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1;
+
+	if (!acpi_data.vcpu_base)
+		return;
+
+	vcpu_res->flags = IORESOURCE_MEM;
+	vcpu_res->start = acpi_data.vcpu_base;
+	vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+
+	irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+				acpi_data.maint_irq_mode,
+				ACPI_ACTIVE_HIGH);
+	if (irq <= 0)
+		return;
+
+	gic_v2_kvm_info.maint_irq = irq;
+
+	gic_set_kvm_info(&gic_v2_kvm_info);
+}
 
 static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 				   const unsigned long end)
@@ -1359,6 +1431,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 	if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
 		gicv2m_init(NULL, gic_data[0].domain);
 
+	gic_acpi_setup_kvm_info();
+
 	return 0;
 }
 IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h
new file mode 100644
index 000000000000..ef34f6f35e91
--- /dev/null
+++ b/include/linux/irqchip/arm-gic-common.h
@@ -0,0 +1,33 @@
+/*
+ * include/linux/irqchip/arm-gic-common.h
+ *
+ * Copyright (C) 2016 ARM Limited, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H
+#define __LINUX_IRQCHIP_ARM_GIC_COMMON_H
+
+#include <linux/types.h>
+#include <linux/ioport.h>
+
+enum gic_type {
+	GIC_V2,
+};
+
+struct gic_kvm_info {
+	/* GIC type */
+	enum gic_type	type;
+	/* Virtual CPU interface */
+	struct resource vcpu;
+	/* Interrupt number */
+	unsigned int	maint_irq;
+	/* Virtual control interface */
+	struct resource vctrl;
+};
+
+const struct gic_kvm_info *gic_get_kvm_info(void);
+
+#endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */
-- 
cgit v1.2.3


From 1839e576968f34b9a31da9f0033f8de12a1c9de6 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Mon, 11 Apr 2016 16:32:57 +0100
Subject: irqchip/gic-v3: Parse and export virtual GIC information

Fill up the recently introduced gic_kvm_info with the hardware
information used for virtualization.

Signed-off-by: Julien Grall <julien.grall@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 drivers/irqchip/irq-gic-v3.c           | 114 ++++++++++++++++++++++++++++++++-
 include/linux/irqchip/arm-gic-common.h |   1 +
 2 files changed, 114 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 6a9f5ff161a4..05a856073714 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 
 #include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-common.h>
 #include <linux/irqchip/arm-gic-v3.h>
 
 #include <asm/cputype.h>
@@ -58,6 +59,8 @@ struct gic_chip_data {
 static struct gic_chip_data gic_data __read_mostly;
 static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
 
+static struct gic_kvm_info gic_v3_kvm_info;
+
 #define gic_data_rdist()		(this_cpu_ptr(gic_data.rdists.rdist))
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
 #define gic_data_rdist_sgi_base()	(gic_data_rdist_rd_base() + SZ_64K)
@@ -903,6 +906,30 @@ static int __init gic_validate_dist_version(void __iomem *dist_base)
 	return 0;
 }
 
+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	int ret;
+	struct resource r;
+	u32 gicv_idx;
+
+	gic_v3_kvm_info.type = GIC_V3;
+
+	gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+	if (!gic_v3_kvm_info.maint_irq)
+		return;
+
+	if (of_property_read_u32(node, "#redistributor-regions",
+				 &gicv_idx))
+		gicv_idx = 1;
+
+	gicv_idx += 3;	/* Also skip GICD, GICC, GICH */
+	ret = of_address_to_resource(node, gicv_idx, &r);
+	if (!ret)
+		gic_v3_kvm_info.vcpu = r;
+
+	gic_set_kvm_info(&gic_v3_kvm_info);
+}
+
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
 {
 	void __iomem *dist_base;
@@ -954,8 +981,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 
 	err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
 			     redist_stride, &node->fwnode);
-	if (!err)
+	if (!err) {
+		gic_of_setup_kvm_info(node);
 		return 0;
+	}
 
 out_unmap_rdist:
 	for (i = 0; i < nr_redist_regions; i++)
@@ -976,6 +1005,9 @@ static struct
 	struct redist_region *redist_regs;
 	u32 nr_redist_regions;
 	bool single_redist;
+	u32 maint_irq;
+	int maint_irq_mode;
+	phys_addr_t vcpu_base;
 } acpi_data __initdata;
 
 static void __init
@@ -1112,7 +1144,85 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
 	return true;
 }
 
+static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+						const unsigned long end)
+{
+	struct acpi_madt_generic_interrupt *gicc =
+		(struct acpi_madt_generic_interrupt *)header;
+	int maint_irq_mode;
+	static int first_madt = true;
+
+	/* Skip unusable CPUs */
+	if (!(gicc->flags & ACPI_MADT_ENABLED))
+		return 0;
+
+	maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+		ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+
+	if (first_madt) {
+		first_madt = false;
+
+		acpi_data.maint_irq = gicc->vgic_interrupt;
+		acpi_data.maint_irq_mode = maint_irq_mode;
+		acpi_data.vcpu_base = gicc->gicv_base_address;
+
+		return 0;
+	}
+
+	/*
+	 * The maintenance interrupt and GICV should be the same for every CPU
+	 */
+	if ((acpi_data.maint_irq != gicc->vgic_interrupt) ||
+	    (acpi_data.maint_irq_mode != maint_irq_mode) ||
+	    (acpi_data.vcpu_base != gicc->gicv_base_address))
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool __init gic_acpi_collect_virt_info(void)
+{
+	int count;
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+				      gic_acpi_parse_virt_madt_gicc, 0);
+
+	return (count > 0);
+}
+
 #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+#define ACPI_GICV2_VCTRL_MEM_SIZE	(SZ_4K)
+#define ACPI_GICV2_VCPU_MEM_SIZE	(SZ_8K)
+
+static void __init gic_acpi_setup_kvm_info(void)
+{
+	int irq;
+
+	if (!gic_acpi_collect_virt_info()) {
+		pr_warn("Unable to get hardware information used for virtualization\n");
+		return;
+	}
+
+	gic_v3_kvm_info.type = GIC_V3;
+
+	irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+				acpi_data.maint_irq_mode,
+				ACPI_ACTIVE_HIGH);
+	if (irq <= 0)
+		return;
+
+	gic_v3_kvm_info.maint_irq = irq;
+
+	if (acpi_data.vcpu_base) {
+		struct resource *vcpu = &gic_v3_kvm_info.vcpu;
+
+		vcpu->flags = IORESOURCE_MEM;
+		vcpu->start = acpi_data.vcpu_base;
+		vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+	}
+
+	gic_set_kvm_info(&gic_v3_kvm_info);
+}
 
 static int __init
 gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
@@ -1161,6 +1271,8 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
 		goto out_fwhandle_free;
 
 	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+	gic_acpi_setup_kvm_info();
+
 	return 0;
 
 out_fwhandle_free:
diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h
index ef34f6f35e91..c647b0547bcd 100644
--- a/include/linux/irqchip/arm-gic-common.h
+++ b/include/linux/irqchip/arm-gic-common.h
@@ -15,6 +15,7 @@
 
 enum gic_type {
 	GIC_V2,
+	GIC_V3,
 };
 
 struct gic_kvm_info {
-- 
cgit v1.2.3


From 10126ac14d36e74b2705802dc915b0b18463a51f Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 2 May 2016 15:10:31 -0500
Subject: PCI: Add Downstream Port Containment portdrv service type

Add the Downstream Port Containment (PCIE_PORT_SERVICE_DPC) portdrv service
type, available if the device has the DPC extended capability.

[bhelgaas: split to separate patch, changelog]
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/portdrv.h      | 2 +-
 drivers/pci/pcie/portdrv_acpi.c | 2 +-
 drivers/pci/pcie/portdrv_core.c | 4 +++-
 include/linux/pcieport_if.h     | 2 ++
 include/uapi/linux/pci_regs.h   | 3 ++-
 5 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 63cb2ef9c5ae..7d82f6d47e68 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -11,7 +11,7 @@
 
 #include <linux/compiler.h>
 
-#define PCIE_PORT_DEVICE_MAXSERVICES   4
+#define PCIE_PORT_DEVICE_MAXSERVICES   5
 /*
  * According to the PCI Express Base Specification 2.0, the indices of
  * the MSI-X table entries used by port services must not exceed 31
diff --git a/drivers/pci/pcie/portdrv_acpi.c b/drivers/pci/pcie/portdrv_acpi.c
index b4d2894ee3fc..44296eb729d3 100644
--- a/drivers/pci/pcie/portdrv_acpi.c
+++ b/drivers/pci/pcie/portdrv_acpi.c
@@ -51,7 +51,7 @@ int pcie_port_acpi_setup(struct pci_dev *port, int *srv_mask)
 
 	flags = root->osc_control_set;
 
-	*srv_mask = PCIE_PORT_SERVICE_VC;
+	*srv_mask = PCIE_PORT_SERVICE_VC | PCIE_PORT_SERVICE_DPC;
 	if (flags & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL)
 		*srv_mask |= PCIE_PORT_SERVICE_HP;
 	if (flags & OSC_PCI_EXPRESS_PME_CONTROL)
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 94d3b82415c1..2ab0f424a378 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -262,7 +262,7 @@ static int get_port_device_capability(struct pci_dev *dev)
 		return 0;
 
 	cap_mask = PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP
-			| PCIE_PORT_SERVICE_VC;
+			| PCIE_PORT_SERVICE_VC | PCIE_PORT_SERVICE_DPC;
 	if (pci_aer_available())
 		cap_mask |= PCIE_PORT_SERVICE_AER;
 
@@ -311,6 +311,8 @@ static int get_port_device_capability(struct pci_dev *dev)
 		 */
 		pcie_pme_interrupt_enable(dev, false);
 	}
+	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC))
+		services |= PCIE_PORT_SERVICE_DPC;
 
 	return services;
 }
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 4f1089f2cc98..afcd130ab3a9 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -21,6 +21,8 @@
 #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
 #define PCIE_PORT_SERVICE_VC_SHIFT	3	/* Virtual Channel */
 #define PCIE_PORT_SERVICE_VC		(1 << PCIE_PORT_SERVICE_VC_SHIFT)
+#define PCIE_PORT_SERVICE_DPC_SHIFT	4	/* Downstream Port Containment */
+#define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
 
 struct pcie_device {
 	int		irq;	    /* Service IRQ/MSI/MSI-X Vector */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 1becea86c73c..61e95c142547 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -670,7 +670,8 @@
 #define PCI_EXT_CAP_ID_SECPCI	0x19	/* Secondary PCIe Capability */
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PASID
+#define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DPC
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
-- 
cgit v1.2.3


From e511267bc25e18926826e7cccdf7872bcbb4776a Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 26 Apr 2016 11:38:20 +0100
Subject: io-64-nonatomic: Add relaxed accessor variants

Whilst commit 9439eb3ab9d1 ("asm-generic: io: implement relaxed
accessor macros as conditional wrappers") makes the *_relaxed forms of
I/O accessors universally available to drivers, in cases where writeq()
is implemented via the io-64-nonatomic helpers, writeq_relaxed() will
end up falling back to writel() regardless of whether writel_relaxed()
is available (identically for s/write/read/).

Add corresponding relaxed forms of the nonatomic helpers to delegate
to the equivalent 32-bit accessors as appropriate. We also need to fix
io.h to avoid defining default relaxed variants if the basic accessors
themselves don't exist.

CC: Christoph Hellwig <hch@lst.de>
CC: Darren Hart <dvhart@linux.intel.com>
CC: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/asm-generic/io.h              |  4 ++--
 include/linux/io-64-nonatomic-hi-lo.h | 25 +++++++++++++++++++++++++
 include/linux/io-64-nonatomic-lo-hi.h | 25 +++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index eed3bbe88c8a..002b81f6f2bc 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -191,7 +191,7 @@ static inline void writeq(u64 value, volatile void __iomem *addr)
 #define readl_relaxed readl
 #endif
 
-#ifndef readq_relaxed
+#if defined(readq) && !defined(readq_relaxed)
 #define readq_relaxed readq
 #endif
 
@@ -207,7 +207,7 @@ static inline void writeq(u64 value, volatile void __iomem *addr)
 #define writel_relaxed writel
 #endif
 
-#ifndef writeq_relaxed
+#if defined(writeq) && !defined(writeq_relaxed)
 #define writeq_relaxed writeq
 #endif
 
diff --git a/include/linux/io-64-nonatomic-hi-lo.h b/include/linux/io-64-nonatomic-hi-lo.h
index 11d7e840d913..defcc4644ce3 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -21,6 +21,23 @@ static inline void hi_lo_writeq(__u64 val, volatile void __iomem *addr)
 	writel(val, addr);
 }
 
+static inline __u64 hi_lo_readq_relaxed(const volatile void __iomem *addr)
+{
+	const volatile u32 __iomem *p = addr;
+	u32 low, high;
+
+	high = readl_relaxed(p + 1);
+	low = readl_relaxed(p);
+
+	return low + ((u64)high << 32);
+}
+
+static inline void hi_lo_writeq_relaxed(__u64 val, volatile void __iomem *addr)
+{
+	writel_relaxed(val >> 32, addr + 4);
+	writel_relaxed(val, addr);
+}
+
 #ifndef readq
 #define readq hi_lo_readq
 #endif
@@ -29,4 +46,12 @@ static inline void hi_lo_writeq(__u64 val, volatile void __iomem *addr)
 #define writeq hi_lo_writeq
 #endif
 
+#ifndef readq_relaxed
+#define readq_relaxed hi_lo_readq_relaxed
+#endif
+
+#ifndef writeq_relaxed
+#define writeq_relaxed hi_lo_writeq_relaxed
+#endif
+
 #endif	/* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h b/include/linux/io-64-nonatomic-lo-hi.h
index 1a4315f97360..084461a4e5ab 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -21,6 +21,23 @@ static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr)
 	writel(val >> 32, addr + 4);
 }
 
+static inline __u64 lo_hi_readq_relaxed(const volatile void __iomem *addr)
+{
+	const volatile u32 __iomem *p = addr;
+	u32 low, high;
+
+	low = readl_relaxed(p);
+	high = readl_relaxed(p + 1);
+
+	return low + ((u64)high << 32);
+}
+
+static inline void lo_hi_writeq_relaxed(__u64 val, volatile void __iomem *addr)
+{
+	writel_relaxed(val, addr);
+	writel_relaxed(val >> 32, addr + 4);
+}
+
 #ifndef readq
 #define readq lo_hi_readq
 #endif
@@ -29,4 +46,12 @@ static inline void lo_hi_writeq(__u64 val, volatile void __iomem *addr)
 #define writeq lo_hi_writeq
 #endif
 
+#ifndef readq_relaxed
+#define readq_relaxed lo_hi_readq_relaxed
+#endif
+
+#ifndef writeq_relaxed
+#define writeq_relaxed lo_hi_writeq_relaxed
+#endif
+
 #endif	/* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
-- 
cgit v1.2.3


From 6fb650d43da3e7054984dc548eaa88765a94d49f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 29 Apr 2016 15:25:17 -0400
Subject: USB: leave LPM alone if possible when binding/unbinding interface
 drivers

When a USB driver is bound to an interface (either through probing or
by claiming it) or is unbound from an interface, the USB core always
disables Link Power Management during the transition and then
re-enables it afterward.  The reason is because the driver might want
to prevent hub-initiated link power transitions, in which case the HCD
would have to recalculate the various LPM parameters.  This
recalculation takes place when LPM is re-enabled and the new
parameters are sent to the device and its parent hub.

However, if the driver does not want to prevent hub-initiated link
power transitions then none of this work is necessary.  The parameters
don't need to be recalculated, and LPM doesn't need to be disabled and
re-enabled.

It turns out that disabling and enabling LPM can be time-consuming,
enough so that it interferes with user programs that want to claim and
release interfaces rapidly via usbfs.  Since the usbfs kernel driver
doesn't set the disable_hub_initiated_lpm flag, we can speed things up
and get the user programs to work by leaving LPM alone whenever the
flag isn't set.

And while we're improving the way disable_hub_initiated_lpm gets used,
let's also fix its kerneldoc.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Matthew Giassa <matthew@giassa.net>
CC: Mathias Nyman <mathias.nyman@intel.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/driver.c | 40 +++++++++++++++++++++++-----------------
 include/linux/usb.h       |  2 +-
 2 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 2057d91d8336..dadd1e8dfe09 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -284,7 +284,7 @@ static int usb_probe_interface(struct device *dev)
 	struct usb_device *udev = interface_to_usbdev(intf);
 	const struct usb_device_id *id;
 	int error = -ENODEV;
-	int lpm_disable_error;
+	int lpm_disable_error = -ENODEV;
 
 	dev_dbg(dev, "%s\n", __func__);
 
@@ -336,12 +336,14 @@ static int usb_probe_interface(struct device *dev)
 	 * setting during probe, that should also be fine.  usb_set_interface()
 	 * will attempt to disable LPM, and fail if it can't disable it.
 	 */
-	lpm_disable_error = usb_unlocked_disable_lpm(udev);
-	if (lpm_disable_error && driver->disable_hub_initiated_lpm) {
-		dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n.",
-				__func__, driver->name);
-		error = lpm_disable_error;
-		goto err;
+	if (driver->disable_hub_initiated_lpm) {
+		lpm_disable_error = usb_unlocked_disable_lpm(udev);
+		if (lpm_disable_error) {
+			dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n.",
+					__func__, driver->name);
+			error = lpm_disable_error;
+			goto err;
+		}
 	}
 
 	/* Carry out a deferred switch to altsetting 0 */
@@ -391,7 +393,8 @@ static int usb_unbind_interface(struct device *dev)
 	struct usb_interface *intf = to_usb_interface(dev);
 	struct usb_host_endpoint *ep, **eps = NULL;
 	struct usb_device *udev;
-	int i, j, error, r, lpm_disable_error;
+	int i, j, error, r;
+	int lpm_disable_error = -ENODEV;
 
 	intf->condition = USB_INTERFACE_UNBINDING;
 
@@ -399,12 +402,13 @@ static int usb_unbind_interface(struct device *dev)
 	udev = interface_to_usbdev(intf);
 	error = usb_autoresume_device(udev);
 
-	/* Hub-initiated LPM policy may change, so attempt to disable LPM until
+	/* If hub-initiated LPM policy may change, attempt to disable LPM until
 	 * the driver is unbound.  If LPM isn't disabled, that's fine because it
 	 * wouldn't be enabled unless all the bound interfaces supported
 	 * hub-initiated LPM.
 	 */
-	lpm_disable_error = usb_unlocked_disable_lpm(udev);
+	if (driver->disable_hub_initiated_lpm)
+		lpm_disable_error = usb_unlocked_disable_lpm(udev);
 
 	/*
 	 * Terminate all URBs for this interface unless the driver
@@ -505,7 +509,7 @@ int usb_driver_claim_interface(struct usb_driver *driver,
 	struct device *dev;
 	struct usb_device *udev;
 	int retval = 0;
-	int lpm_disable_error;
+	int lpm_disable_error = -ENODEV;
 
 	if (!iface)
 		return -ENODEV;
@@ -526,12 +530,14 @@ int usb_driver_claim_interface(struct usb_driver *driver,
 
 	iface->condition = USB_INTERFACE_BOUND;
 
-	/* Disable LPM until this driver is bound. */
-	lpm_disable_error = usb_unlocked_disable_lpm(udev);
-	if (lpm_disable_error && driver->disable_hub_initiated_lpm) {
-		dev_err(&iface->dev, "%s Failed to disable LPM for driver %s\n.",
-				__func__, driver->name);
-		return -ENOMEM;
+	/* See the comment about disabling LPM in usb_probe_interface(). */
+	if (driver->disable_hub_initiated_lpm) {
+		lpm_disable_error = usb_unlocked_disable_lpm(udev);
+		if (lpm_disable_error) {
+			dev_err(&iface->dev, "%s Failed to disable LPM for driver %s\n.",
+					__func__, driver->name);
+			return -ENOMEM;
+		}
 	}
 
 	/* Claimed interfaces are initially inactive (suspended) and
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 01b6c61cf9bb..eba1f10e8cfd 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1068,7 +1068,7 @@ struct usbdrv_wrap {
  *	for interfaces bound to this driver.
  * @soft_unbind: if set to 1, the USB core will not kill URBs and disable
  *	endpoints before calling the driver's disconnect method.
- * @disable_hub_initiated_lpm: if set to 0, the USB core will not allow hubs
+ * @disable_hub_initiated_lpm: if set to 1, the USB core will not allow hubs
  *	to initiate lower power link state transitions when an idle timeout
  *	occurs.  Device-initiated USB 3.0 link PM will still be allowed.
  *
-- 
cgit v1.2.3


From 8e996a2874bbbed30e8dfe881453825fc6b7654e Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 3 May 2016 11:33:37 -0600
Subject: stm class: Support devices that override software assigned masters

Some STM devices adjust software assigned master numbers depending on
the trace source and its runtime state and whatnot. This patch adds
a sysfs attribute to inform the trace-side software that master numbers
assigned to software sources will not match those in the STP stream,
so that, for example, master/channel allocation policy can be adjusted
accordingly.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-stm | 10 ++++++++++
 drivers/hwtracing/stm/core.c              | 15 +++++++++++++++
 include/linux/stm.h                       |  3 +++
 3 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-stm b/Documentation/ABI/testing/sysfs-class-stm
index c9aa4f3fc9a7..77ed3da0f68e 100644
--- a/Documentation/ABI/testing/sysfs-class-stm
+++ b/Documentation/ABI/testing/sysfs-class-stm
@@ -12,3 +12,13 @@ KernelVersion:	4.3
 Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
 Description:
 		Shows the number of channels per master on this STM device.
+
+What:		/sys/class/stm/<stm>/hw_override
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+		Reads as 0 if master numbers in the STP stream produced by
+		this stm device will match the master numbers assigned by
+		the software or 1 if the stm hardware overrides software
+		assigned masters.
diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c
index 2591442e2c5b..ff31108b066f 100644
--- a/drivers/hwtracing/stm/core.c
+++ b/drivers/hwtracing/stm/core.c
@@ -67,9 +67,24 @@ static ssize_t channels_show(struct device *dev,
 
 static DEVICE_ATTR_RO(channels);
 
+static ssize_t hw_override_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct stm_device *stm = to_stm_device(dev);
+	int ret;
+
+	ret = sprintf(buf, "%u\n", stm->data->hw_override);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(hw_override);
+
 static struct attribute *stm_attrs[] = {
 	&dev_attr_masters.attr,
 	&dev_attr_channels.attr,
+	&dev_attr_hw_override.attr,
 	NULL,
 };
 
diff --git a/include/linux/stm.h b/include/linux/stm.h
index 1a79ed8e43da..8369d8a8cabd 100644
--- a/include/linux/stm.h
+++ b/include/linux/stm.h
@@ -50,6 +50,8 @@ struct stm_device;
  * @sw_end:		last STP master available to software
  * @sw_nchannels:	number of STP channels per master
  * @sw_mmiosz:		size of one channel's IO space, for mmap, optional
+ * @hw_override:	masters in the STP stream will not match the ones
+ *			assigned by software, but are up to the STM hardware
  * @packet:		callback that sends an STP packet
  * @mmio_addr:		mmap callback, optional
  * @link:		called when a new stm_source gets linked to us, optional
@@ -85,6 +87,7 @@ struct stm_data {
 	unsigned int		sw_end;
 	unsigned int		sw_nchannels;
 	unsigned int		sw_mmiosz;
+	unsigned int		hw_override;
 	ssize_t			(*packet)(struct stm_data *, unsigned int,
 					  unsigned int, unsigned int,
 					  unsigned int, unsigned int,
-- 
cgit v1.2.3


From 237483aa5cf43105d148d3f03b29eed47c3e6cf9 Mon Sep 17 00:00:00 2001
From: Pratik Patel <pratikp@codeaurora.org>
Date: Tue, 3 May 2016 11:33:40 -0600
Subject: coresight: stm: adding driver for CoreSight STM component

This driver adds support for the STM CoreSight IP block, allowing any
system compoment (HW or SW) to log and aggregate messages via a
single entity.

The CoreSight STM exposes an application defined number of channels
called stimulus port.  Configuration is done using entries in sysfs
and channels made available to userspace via configfs.

Signed-off-by: Pratik Patel <pratikp@codeaurora.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Michael Williams <michael.williams@arm.com>
Signed-off-by: Chunyan Zhang <zhang.chunyan@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-bus-coresight-devices-stm    |  53 ++
 Documentation/trace/coresight.txt                  |  37 +-
 drivers/hwtracing/coresight/Kconfig                |  11 +
 drivers/hwtracing/coresight/Makefile               |   1 +
 drivers/hwtracing/coresight/coresight-stm.c        | 920 +++++++++++++++++++++
 include/linux/coresight-stm.h                      |   6 +
 include/uapi/linux/coresight-stm.h                 |  21 +
 7 files changed, 1047 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
 create mode 100644 drivers/hwtracing/coresight/coresight-stm.c
 create mode 100644 include/linux/coresight-stm.h
 create mode 100644 include/uapi/linux/coresight-stm.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
new file mode 100644
index 000000000000..1dffabe7f48d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
@@ -0,0 +1,53 @@
+What:		/sys/bus/coresight/devices/<memory_map>.stm/enable_source
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Enable/disable tracing on this specific trace macrocell.
+		Enabling the trace macrocell implies it has been configured
+		properly and a sink has been identified for it.  The path
+		of coresight components linking the source to the sink is
+		configured and managed automatically by the coresight framework.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/hwevent_enable
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Provides access to the HW event enable register, used in
+		conjunction with HW event bank select register.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/hwevent_select
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Gives access to the HW event block select register
+		(STMHEBSR) in order to configure up to 256 channels.  Used in
+		conjunction with "hwevent_enable" register as described above.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/port_enable
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Provides access to the stimulus port enable register
+		(STMSPER).  Used in conjunction with "port_select" described
+		below.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/port_select
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Used to determine which bank of stimulus port bit in
+		register STMSPER (see above) apply to.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/status
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(R) List various control and status registers.  The specific
+		layout and content is driver specific.
+
+What:		/sys/bus/coresight/devices/<memory_map>.stm/traceid
+Date:		April 2016
+KernelVersion:	4.7
+Contact:	Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:	(RW) Holds the trace ID that will appear in the trace stream
+		coming from this trace entity.
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt
index 0a5c3290e732..a33c88cd5d1d 100644
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -190,8 +190,8 @@ expected to be accessed and controlled using those entries.
 Last but not least, "struct module *owner" is expected to be set to reflect
 the information carried in "THIS_MODULE".
 
-How to use
-----------
+How to use the tracer modules
+-----------------------------
 
 Before trace collection can start, a coresight sink needs to be identify.
 There is no limit on the amount of sinks (nor sources) that can be enabled at
@@ -297,3 +297,36 @@ Info                                    Tracing enabled
 Instruction     13570831        0x8026B584      E28DD00C        false   ADD      sp,sp,#0xc
 Instruction     0       0x8026B588      E8BD8000        true    LDM      sp!,{pc}
 Timestamp                                       Timestamp: 17107041535
+
+How to use the STM module
+-------------------------
+
+Using the System Trace Macrocell module is the same as the tracers - the only
+difference is that clients are driving the trace capture rather
+than the program flow through the code.
+
+As with any other CoreSight component, specifics about the STM tracer can be
+found in sysfs with more information on each entry being found in [1]:
+
+root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
+enable_source   hwevent_select  port_enable     subsystem       uevent
+hwevent_enable  mgmt            port_select     traceid
+root@genericarmv8:~#
+
+Like any other source a sink needs to be identified and the STM enabled before
+being used:
+
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+
+From there user space applications can request and use channels using the devfs
+interface provided for that purpose by the generic STM API:
+
+root@genericarmv8:~# ls -l /dev/20100000.stm
+crw-------    1 root     root       10,  61 Jan  3 18:11 /dev/20100000.stm
+root@genericarmv8:~#
+
+Details on how to use the generic STM API can be found here [2].
+
+[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
+[2]. Documentation/trace/stm.txt
diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig
index db0541031c72..130cb2114059 100644
--- a/drivers/hwtracing/coresight/Kconfig
+++ b/drivers/hwtracing/coresight/Kconfig
@@ -78,4 +78,15 @@ config CORESIGHT_QCOM_REPLICATOR
 	  programmable ATB replicator sends the ATB trace stream from the
 	  ETB/ETF to the TPIUi and ETR.
 
+config CORESIGHT_STM
+	bool "CoreSight System Trace Macrocell driver"
+	depends on (ARM && !(CPU_32v3 || CPU_32v4 || CPU_32v4T)) || ARM64
+	select CORESIGHT_LINKS_AND_SINKS
+	select STM
+	help
+	  This driver provides support for hardware assisted software
+	  instrumentation based tracing. This is primarily used for
+	  logging useful software events or data coming from various entities
+	  in the system, possibly running different OSs
+
 endif
diff --git a/drivers/hwtracing/coresight/Makefile b/drivers/hwtracing/coresight/Makefile
index 1d0e32c7dbe4..c6f84b57f52a 100644
--- a/drivers/hwtracing/coresight/Makefile
+++ b/drivers/hwtracing/coresight/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_CORESIGHT_SOURCE_ETM3X) += coresight-etm3x.o coresight-etm-cp14.o \
 obj-$(CONFIG_CORESIGHT_SOURCE_ETM4X) += coresight-etm4x.o \
 					coresight-etm4x-sysfs.o
 obj-$(CONFIG_CORESIGHT_QCOM_REPLICATOR) += coresight-replicator-qcom.o
+obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
new file mode 100644
index 000000000000..73be58a11e4f
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -0,0 +1,920 @@
+/* Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight System Trace Macrocell driver
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Initial implementation by Pratik Patel
+ * (C) 2014-2015 Pratik Patel <pratikp@codeaurora.org>
+ *
+ * Serious refactoring, code cleanup and upgrading to the Coresight upstream
+ * framework by Mathieu Poirier
+ * (C) 2015-2016 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * Guaranteed timing and support for various packet type coming from the
+ * generic STM API by Chunyan Zhang
+ * (C) 2015-2016 Chunyan Zhang <zhang.chunyan@linaro.org>
+ */
+#include <asm/local.h>
+#include <linux/amba/bus.h>
+#include <linux/bitmap.h>
+#include <linux/clk.h>
+#include <linux/coresight.h>
+#include <linux/coresight-stm.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/of_address.h>
+#include <linux/perf_event.h>
+#include <linux/pm_runtime.h>
+#include <linux/stm.h>
+
+#include "coresight-priv.h"
+
+#define STMDMASTARTR			0xc04
+#define STMDMASTOPR			0xc08
+#define STMDMASTATR			0xc0c
+#define STMDMACTLR			0xc10
+#define STMDMAIDR			0xcfc
+#define STMHEER				0xd00
+#define STMHETER			0xd20
+#define STMHEBSR			0xd60
+#define STMHEMCR			0xd64
+#define STMHEMASTR			0xdf4
+#define STMHEFEAT1R			0xdf8
+#define STMHEIDR			0xdfc
+#define STMSPER				0xe00
+#define STMSPTER			0xe20
+#define STMPRIVMASKR			0xe40
+#define STMSPSCR			0xe60
+#define STMSPMSCR			0xe64
+#define STMSPOVERRIDER			0xe68
+#define STMSPMOVERRIDER			0xe6c
+#define STMSPTRIGCSR			0xe70
+#define STMTCSR				0xe80
+#define STMTSSTIMR			0xe84
+#define STMTSFREQR			0xe8c
+#define STMSYNCR			0xe90
+#define STMAUXCR			0xe94
+#define STMSPFEAT1R			0xea0
+#define STMSPFEAT2R			0xea4
+#define STMSPFEAT3R			0xea8
+#define STMITTRIGGER			0xee8
+#define STMITATBDATA0			0xeec
+#define STMITATBCTR2			0xef0
+#define STMITATBID			0xef4
+#define STMITATBCTR0			0xef8
+
+#define STM_32_CHANNEL			32
+#define BYTES_PER_CHANNEL		256
+#define STM_TRACE_BUF_SIZE		4096
+#define STM_SW_MASTER_END		127
+
+/* Register bit definition */
+#define STMTCSR_BUSY_BIT		23
+/* Reserve the first 10 channels for kernel usage */
+#define STM_CHANNEL_OFFSET		0
+
+enum stm_pkt_type {
+	STM_PKT_TYPE_DATA	= 0x98,
+	STM_PKT_TYPE_FLAG	= 0xE8,
+	STM_PKT_TYPE_TRIG	= 0xF8,
+};
+
+#define stm_channel_addr(drvdata, ch)	(drvdata->chs.base +	\
+					(ch * BYTES_PER_CHANNEL))
+#define stm_channel_off(type, opts)	(type & ~opts)
+
+static int boot_nr_channel;
+
+/*
+ * Not really modular but using module_param is the easiest way to
+ * remain consistent with existing use cases for now.
+ */
+module_param_named(
+	boot_nr_channel, boot_nr_channel, int, S_IRUGO
+);
+
+/**
+ * struct channel_space - central management entity for extended ports
+ * @base:		memory mapped base address where channels start.
+ * @guaraneed:		is the channel delivery guaranteed.
+ */
+struct channel_space {
+	void __iomem		*base;
+	unsigned long		*guaranteed;
+};
+
+/**
+ * struct stm_drvdata - specifics associated to an STM component
+ * @base:		memory mapped base address for this component.
+ * @dev:		the device entity associated to this component.
+ * @atclk:		optional clock for the core parts of the STM.
+ * @csdev:		component vitals needed by the framework.
+ * @spinlock:		only one at a time pls.
+ * @chs:		the channels accociated to this STM.
+ * @stm:		structure associated to the generic STM interface.
+ * @mode:		this tracer's mode, i.e sysFS, or disabled.
+ * @traceid:		value of the current ID for this component.
+ * @write_bytes:	Maximus bytes this STM can write at a time.
+ * @stmsper:		settings for register STMSPER.
+ * @stmspscr:		settings for register STMSPSCR.
+ * @numsp:		the total number of stimulus port support by this STM.
+ * @stmheer:		settings for register STMHEER.
+ * @stmheter:		settings for register STMHETER.
+ * @stmhebsr:		settings for register STMHEBSR.
+ */
+struct stm_drvdata {
+	void __iomem		*base;
+	struct device		*dev;
+	struct clk		*atclk;
+	struct coresight_device	*csdev;
+	spinlock_t		spinlock;
+	struct channel_space	chs;
+	struct stm_data		stm;
+	local_t			mode;
+	u8			traceid;
+	u32			write_bytes;
+	u32			stmsper;
+	u32			stmspscr;
+	u32			numsp;
+	u32			stmheer;
+	u32			stmheter;
+	u32			stmhebsr;
+};
+
+static void stm_hwevent_enable_hw(struct stm_drvdata *drvdata)
+{
+	CS_UNLOCK(drvdata->base);
+
+	writel_relaxed(drvdata->stmhebsr, drvdata->base + STMHEBSR);
+	writel_relaxed(drvdata->stmheter, drvdata->base + STMHETER);
+	writel_relaxed(drvdata->stmheer, drvdata->base + STMHEER);
+	writel_relaxed(0x01 |	/* Enable HW event tracing */
+		       0x04,	/* Error detection on event tracing */
+		       drvdata->base + STMHEMCR);
+
+	CS_LOCK(drvdata->base);
+}
+
+static void stm_port_enable_hw(struct stm_drvdata *drvdata)
+{
+	CS_UNLOCK(drvdata->base);
+	/* ATB trigger enable on direct writes to TRIG locations */
+	writel_relaxed(0x10,
+		       drvdata->base + STMSPTRIGCSR);
+	writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR);
+	writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER);
+
+	CS_LOCK(drvdata->base);
+}
+
+static void stm_enable_hw(struct stm_drvdata *drvdata)
+{
+	if (drvdata->stmheer)
+		stm_hwevent_enable_hw(drvdata);
+
+	stm_port_enable_hw(drvdata);
+
+	CS_UNLOCK(drvdata->base);
+
+	/* 4096 byte between synchronisation packets */
+	writel_relaxed(0xFFF, drvdata->base + STMSYNCR);
+	writel_relaxed((drvdata->traceid << 16 | /* trace id */
+			0x02 |			 /* timestamp enable */
+			0x01),			 /* global STM enable */
+			drvdata->base + STMTCSR);
+
+	CS_LOCK(drvdata->base);
+}
+
+static int stm_enable(struct coresight_device *csdev,
+		      struct perf_event_attr *attr, u32 mode)
+{
+	u32 val;
+	struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+	if (mode != CS_MODE_SYSFS)
+		return -EINVAL;
+
+	val = local_cmpxchg(&drvdata->mode, CS_MODE_DISABLED, mode);
+
+	/* Someone is already using the tracer */
+	if (val)
+		return -EBUSY;
+
+	pm_runtime_get_sync(drvdata->dev);
+
+	spin_lock(&drvdata->spinlock);
+	stm_enable_hw(drvdata);
+	spin_unlock(&drvdata->spinlock);
+
+	dev_info(drvdata->dev, "STM tracing enabled\n");
+	return 0;
+}
+
+static void stm_hwevent_disable_hw(struct stm_drvdata *drvdata)
+{
+	CS_UNLOCK(drvdata->base);
+
+	writel_relaxed(0x0, drvdata->base + STMHEMCR);
+	writel_relaxed(0x0, drvdata->base + STMHEER);
+	writel_relaxed(0x0, drvdata->base + STMHETER);
+
+	CS_LOCK(drvdata->base);
+}
+
+static void stm_port_disable_hw(struct stm_drvdata *drvdata)
+{
+	CS_UNLOCK(drvdata->base);
+
+	writel_relaxed(0x0, drvdata->base + STMSPER);
+	writel_relaxed(0x0, drvdata->base + STMSPTRIGCSR);
+
+	CS_LOCK(drvdata->base);
+}
+
+static void stm_disable_hw(struct stm_drvdata *drvdata)
+{
+	u32 val;
+
+	CS_UNLOCK(drvdata->base);
+
+	val = readl_relaxed(drvdata->base + STMTCSR);
+	val &= ~0x1; /* clear global STM enable [0] */
+	writel_relaxed(val, drvdata->base + STMTCSR);
+
+	CS_LOCK(drvdata->base);
+
+	stm_port_disable_hw(drvdata);
+	if (drvdata->stmheer)
+		stm_hwevent_disable_hw(drvdata);
+}
+
+static void stm_disable(struct coresight_device *csdev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+	/*
+	 * For as long as the tracer isn't disabled another entity can't
+	 * change its status.  As such we can read the status here without
+	 * fearing it will change under us.
+	 */
+	if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
+		spin_lock(&drvdata->spinlock);
+		stm_disable_hw(drvdata);
+		spin_unlock(&drvdata->spinlock);
+
+		/* Wait until the engine has completely stopped */
+		coresight_timeout(drvdata, STMTCSR, STMTCSR_BUSY_BIT, 0);
+
+		pm_runtime_put(drvdata->dev);
+
+		local_set(&drvdata->mode, CS_MODE_DISABLED);
+		dev_info(drvdata->dev, "STM tracing disabled\n");
+	}
+}
+
+static int stm_trace_id(struct coresight_device *csdev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+	return drvdata->traceid;
+}
+
+static const struct coresight_ops_source stm_source_ops = {
+	.trace_id	= stm_trace_id,
+	.enable		= stm_enable,
+	.disable	= stm_disable,
+};
+
+static const struct coresight_ops stm_cs_ops = {
+	.source_ops	= &stm_source_ops,
+};
+
+static inline bool stm_addr_unaligned(const void *addr, u8 write_bytes)
+{
+	return ((unsigned long)addr & (write_bytes - 1));
+}
+
+static void stm_send(void *addr, const void *data, u32 size, u8 write_bytes)
+{
+	u8 paload[8];
+
+	if (stm_addr_unaligned(data, write_bytes)) {
+		memcpy(paload, data, size);
+		data = paload;
+	}
+
+	/* now we are 64bit/32bit aligned */
+	switch (size) {
+#ifdef CONFIG_64BIT
+	case 8:
+		writeq_relaxed(*(u64 *)data, addr);
+		break;
+#endif
+	case 4:
+		writel_relaxed(*(u32 *)data, addr);
+		break;
+	case 2:
+		writew_relaxed(*(u16 *)data, addr);
+		break;
+	case 1:
+		writeb_relaxed(*(u8 *)data, addr);
+		break;
+	default:
+		break;
+	}
+}
+
+static int stm_generic_link(struct stm_data *stm_data,
+			    unsigned int master,  unsigned int channel)
+{
+	struct stm_drvdata *drvdata = container_of(stm_data,
+						   struct stm_drvdata, stm);
+	if (!drvdata || !drvdata->csdev)
+		return -EINVAL;
+
+	return coresight_enable(drvdata->csdev);
+}
+
+static void stm_generic_unlink(struct stm_data *stm_data,
+			       unsigned int master,  unsigned int channel)
+{
+	struct stm_drvdata *drvdata = container_of(stm_data,
+						   struct stm_drvdata, stm);
+	if (!drvdata || !drvdata->csdev)
+		return;
+
+	stm_disable(drvdata->csdev);
+}
+
+static long stm_generic_set_options(struct stm_data *stm_data,
+				    unsigned int master,
+				    unsigned int channel,
+				    unsigned int nr_chans,
+				    unsigned long options)
+{
+	struct stm_drvdata *drvdata = container_of(stm_data,
+						   struct stm_drvdata, stm);
+	if (!(drvdata && local_read(&drvdata->mode)))
+		return -EINVAL;
+
+	if (channel >= drvdata->numsp)
+		return -EINVAL;
+
+	switch (options) {
+	case STM_OPTION_GUARANTEED:
+		set_bit(channel, drvdata->chs.guaranteed);
+		break;
+
+	case STM_OPTION_INVARIANT:
+		clear_bit(channel, drvdata->chs.guaranteed);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static ssize_t stm_generic_packet(struct stm_data *stm_data,
+				  unsigned int master,
+				  unsigned int channel,
+				  unsigned int packet,
+				  unsigned int flags,
+				  unsigned int size,
+				  const unsigned char *payload)
+{
+	unsigned long ch_addr;
+	struct stm_drvdata *drvdata = container_of(stm_data,
+						   struct stm_drvdata, stm);
+
+	if (!(drvdata && local_read(&drvdata->mode)))
+		return 0;
+
+	if (channel >= drvdata->numsp)
+		return 0;
+
+	ch_addr = (unsigned long)stm_channel_addr(drvdata, channel);
+
+	flags = (flags == STP_PACKET_TIMESTAMPED) ? STM_FLAG_TIMESTAMPED : 0;
+	flags |= test_bit(channel, drvdata->chs.guaranteed) ?
+			   STM_FLAG_GUARANTEED : 0;
+
+	if (size > drvdata->write_bytes)
+		size = drvdata->write_bytes;
+	else
+		size = rounddown_pow_of_two(size);
+
+	switch (packet) {
+	case STP_PACKET_FLAG:
+		ch_addr |= stm_channel_off(STM_PKT_TYPE_FLAG, flags);
+
+		/*
+		 * The generic STM core sets a size of '0' on flag packets.
+		 * As such send a flag packet of size '1' and tell the
+		 * core we did so.
+		 */
+		stm_send((void *)ch_addr, payload, 1, drvdata->write_bytes);
+		size = 1;
+		break;
+
+	case STP_PACKET_DATA:
+		ch_addr |= stm_channel_off(STM_PKT_TYPE_DATA, flags);
+		stm_send((void *)ch_addr, payload, size,
+				drvdata->write_bytes);
+		break;
+
+	default:
+		return -ENOTSUPP;
+	}
+
+	return size;
+}
+
+static ssize_t hwevent_enable_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val = drvdata->stmheer;
+
+	return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+static ssize_t hwevent_enable_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t size)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val;
+	int ret = 0;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return -EINVAL;
+
+	drvdata->stmheer = val;
+	/* HW event enable and trigger go hand in hand */
+	drvdata->stmheter = val;
+
+	return size;
+}
+static DEVICE_ATTR_RW(hwevent_enable);
+
+static ssize_t hwevent_select_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val = drvdata->stmhebsr;
+
+	return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+static ssize_t hwevent_select_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t size)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val;
+	int ret = 0;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return -EINVAL;
+
+	drvdata->stmhebsr = val;
+
+	return size;
+}
+static DEVICE_ATTR_RW(hwevent_select);
+
+static ssize_t port_select_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val;
+
+	if (!local_read(&drvdata->mode)) {
+		val = drvdata->stmspscr;
+	} else {
+		spin_lock(&drvdata->spinlock);
+		val = readl_relaxed(drvdata->base + STMSPSCR);
+		spin_unlock(&drvdata->spinlock);
+	}
+
+	return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+static ssize_t port_select_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t size)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val, stmsper;
+	int ret = 0;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return ret;
+
+	spin_lock(&drvdata->spinlock);
+	drvdata->stmspscr = val;
+
+	if (local_read(&drvdata->mode)) {
+		CS_UNLOCK(drvdata->base);
+		/* Process as per ARM's TRM recommendation */
+		stmsper = readl_relaxed(drvdata->base + STMSPER);
+		writel_relaxed(0x0, drvdata->base + STMSPER);
+		writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR);
+		writel_relaxed(stmsper, drvdata->base + STMSPER);
+		CS_LOCK(drvdata->base);
+	}
+	spin_unlock(&drvdata->spinlock);
+
+	return size;
+}
+static DEVICE_ATTR_RW(port_select);
+
+static ssize_t port_enable_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val;
+
+	if (!local_read(&drvdata->mode)) {
+		val = drvdata->stmsper;
+	} else {
+		spin_lock(&drvdata->spinlock);
+		val = readl_relaxed(drvdata->base + STMSPER);
+		spin_unlock(&drvdata->spinlock);
+	}
+
+	return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+static ssize_t port_enable_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t size)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+	unsigned long val;
+	int ret = 0;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return ret;
+
+	spin_lock(&drvdata->spinlock);
+	drvdata->stmsper = val;
+
+	if (local_read(&drvdata->mode)) {
+		CS_UNLOCK(drvdata->base);
+		writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER);
+		CS_LOCK(drvdata->base);
+	}
+	spin_unlock(&drvdata->spinlock);
+
+	return size;
+}
+static DEVICE_ATTR_RW(port_enable);
+
+static ssize_t traceid_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	unsigned long val;
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+	val = drvdata->traceid;
+	return sprintf(buf, "%#lx\n", val);
+}
+
+static ssize_t traceid_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t size)
+{
+	int ret;
+	unsigned long val;
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return ret;
+
+	/* traceid field is 7bit wide on STM32 */
+	drvdata->traceid = val & 0x7f;
+	return size;
+}
+static DEVICE_ATTR_RW(traceid);
+
+#define coresight_stm_simple_func(name, offset)	\
+	coresight_simple_func(struct stm_drvdata, name, offset)
+
+coresight_stm_simple_func(tcsr, STMTCSR);
+coresight_stm_simple_func(tsfreqr, STMTSFREQR);
+coresight_stm_simple_func(syncr, STMSYNCR);
+coresight_stm_simple_func(sper, STMSPER);
+coresight_stm_simple_func(spter, STMSPTER);
+coresight_stm_simple_func(privmaskr, STMPRIVMASKR);
+coresight_stm_simple_func(spscr, STMSPSCR);
+coresight_stm_simple_func(spmscr, STMSPMSCR);
+coresight_stm_simple_func(spfeat1r, STMSPFEAT1R);
+coresight_stm_simple_func(spfeat2r, STMSPFEAT2R);
+coresight_stm_simple_func(spfeat3r, STMSPFEAT3R);
+coresight_stm_simple_func(devid, CORESIGHT_DEVID);
+
+static struct attribute *coresight_stm_attrs[] = {
+	&dev_attr_hwevent_enable.attr,
+	&dev_attr_hwevent_select.attr,
+	&dev_attr_port_enable.attr,
+	&dev_attr_port_select.attr,
+	&dev_attr_traceid.attr,
+	NULL,
+};
+
+static struct attribute *coresight_stm_mgmt_attrs[] = {
+	&dev_attr_tcsr.attr,
+	&dev_attr_tsfreqr.attr,
+	&dev_attr_syncr.attr,
+	&dev_attr_sper.attr,
+	&dev_attr_spter.attr,
+	&dev_attr_privmaskr.attr,
+	&dev_attr_spscr.attr,
+	&dev_attr_spmscr.attr,
+	&dev_attr_spfeat1r.attr,
+	&dev_attr_spfeat2r.attr,
+	&dev_attr_spfeat3r.attr,
+	&dev_attr_devid.attr,
+	NULL,
+};
+
+static const struct attribute_group coresight_stm_group = {
+	.attrs = coresight_stm_attrs,
+};
+
+static const struct attribute_group coresight_stm_mgmt_group = {
+	.attrs = coresight_stm_mgmt_attrs,
+	.name = "mgmt",
+};
+
+static const struct attribute_group *coresight_stm_groups[] = {
+	&coresight_stm_group,
+	&coresight_stm_mgmt_group,
+	NULL,
+};
+
+static int stm_get_resource_byname(struct device_node *np,
+				   char *ch_base, struct resource *res)
+{
+	const char *name = NULL;
+	int index = 0, found = 0;
+
+	while (!of_property_read_string_index(np, "reg-names", index, &name)) {
+		if (strcmp(ch_base, name)) {
+			index++;
+			continue;
+		}
+
+		/* We have a match and @index is where it's at */
+		found = 1;
+		break;
+	}
+
+	if (!found)
+		return -EINVAL;
+
+	return of_address_to_resource(np, index, res);
+}
+
+static u32 stm_fundamental_data_size(struct stm_drvdata *drvdata)
+{
+	u32 stmspfeat2r;
+
+	if (!IS_ENABLED(CONFIG_64BIT))
+		return 4;
+
+	stmspfeat2r = readl_relaxed(drvdata->base + STMSPFEAT2R);
+
+	/*
+	 * bit[15:12] represents the fundamental data size
+	 * 0 - 32-bit data
+	 * 1 - 64-bit data
+	 */
+	return BMVAL(stmspfeat2r, 12, 15) ? 8 : 4;
+}
+
+static u32 stm_num_stimulus_port(struct stm_drvdata *drvdata)
+{
+	u32 numsp;
+
+	numsp = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
+	/*
+	 * NUMPS in STMDEVID is 17 bit long and if equal to 0x0,
+	 * 32 stimulus ports are supported.
+	 */
+	numsp &= 0x1ffff;
+	if (!numsp)
+		numsp = STM_32_CHANNEL;
+	return numsp;
+}
+
+static void stm_init_default_data(struct stm_drvdata *drvdata)
+{
+	/* Don't use port selection */
+	drvdata->stmspscr = 0x0;
+	/*
+	 * Enable all channel regardless of their number.  When port
+	 * selection isn't used (see above) STMSPER applies to all
+	 * 32 channel group available, hence setting all 32 bits to 1
+	 */
+	drvdata->stmsper = ~0x0;
+
+	/*
+	 * The trace ID value for *ETM* tracers start at CPU_ID * 2 + 0x10 and
+	 * anything equal to or higher than 0x70 is reserved.  Since 0x00 is
+	 * also reserved the STM trace ID needs to be higher than 0x00 and
+	 * lowner than 0x10.
+	 */
+	drvdata->traceid = 0x1;
+
+	/* Set invariant transaction timing on all channels */
+	bitmap_clear(drvdata->chs.guaranteed, 0, drvdata->numsp);
+}
+
+static void stm_init_generic_data(struct stm_drvdata *drvdata)
+{
+	drvdata->stm.name = dev_name(drvdata->dev);
+
+	/*
+	 * MasterIDs are assigned at HW design phase. As such the core is
+	 * using a single master for interaction with this device.
+	 */
+	drvdata->stm.sw_start = 1;
+	drvdata->stm.sw_end = 1;
+	drvdata->stm.hw_override = true;
+	drvdata->stm.sw_nchannels = drvdata->numsp;
+	drvdata->stm.packet = stm_generic_packet;
+	drvdata->stm.link = stm_generic_link;
+	drvdata->stm.unlink = stm_generic_unlink;
+	drvdata->stm.set_options = stm_generic_set_options;
+}
+
+static int stm_probe(struct amba_device *adev, const struct amba_id *id)
+{
+	int ret;
+	void __iomem *base;
+	unsigned long *guaranteed;
+	struct device *dev = &adev->dev;
+	struct coresight_platform_data *pdata = NULL;
+	struct stm_drvdata *drvdata;
+	struct resource *res = &adev->res;
+	struct resource ch_res;
+	size_t res_size, bitmap_size;
+	struct coresight_desc *desc;
+	struct device_node *np = adev->dev.of_node;
+
+	if (np) {
+		pdata = of_get_coresight_platform_data(dev, np);
+		if (IS_ERR(pdata))
+			return PTR_ERR(pdata);
+		adev->dev.platform_data = pdata;
+	}
+	drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+	if (!drvdata)
+		return -ENOMEM;
+
+	drvdata->dev = &adev->dev;
+	drvdata->atclk = devm_clk_get(&adev->dev, "atclk"); /* optional */
+	if (!IS_ERR(drvdata->atclk)) {
+		ret = clk_prepare_enable(drvdata->atclk);
+		if (ret)
+			return ret;
+	}
+	dev_set_drvdata(dev, drvdata);
+
+	base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+	drvdata->base = base;
+
+	ret = stm_get_resource_byname(np, "stm-stimulus-base", &ch_res);
+	if (ret)
+		return ret;
+
+	base = devm_ioremap_resource(dev, &ch_res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+	drvdata->chs.base = base;
+
+	drvdata->write_bytes = stm_fundamental_data_size(drvdata);
+
+	if (boot_nr_channel) {
+		drvdata->numsp = boot_nr_channel;
+		res_size = min((resource_size_t)(boot_nr_channel *
+				  BYTES_PER_CHANNEL), resource_size(res));
+	} else {
+		drvdata->numsp = stm_num_stimulus_port(drvdata);
+		res_size = min((resource_size_t)(drvdata->numsp *
+				 BYTES_PER_CHANNEL), resource_size(res));
+	}
+	bitmap_size = BITS_TO_LONGS(drvdata->numsp) * sizeof(long);
+
+	guaranteed = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
+	if (!guaranteed)
+		return -ENOMEM;
+	drvdata->chs.guaranteed = guaranteed;
+
+	spin_lock_init(&drvdata->spinlock);
+
+	stm_init_default_data(drvdata);
+	stm_init_generic_data(drvdata);
+
+	if (stm_register_device(dev, &drvdata->stm, THIS_MODULE)) {
+		dev_info(dev,
+			 "stm_register_device failed, probing deffered\n");
+		return -EPROBE_DEFER;
+	}
+
+	desc = devm_kzalloc(dev, sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto stm_unregister;
+	}
+
+	desc->type = CORESIGHT_DEV_TYPE_SOURCE;
+	desc->subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE;
+	desc->ops = &stm_cs_ops;
+	desc->pdata = pdata;
+	desc->dev = dev;
+	desc->groups = coresight_stm_groups;
+	drvdata->csdev = coresight_register(desc);
+	if (IS_ERR(drvdata->csdev)) {
+		ret = PTR_ERR(drvdata->csdev);
+		goto stm_unregister;
+	}
+
+	pm_runtime_put(&adev->dev);
+
+	dev_info(dev, "%s initialized\n", (char *)id->data);
+	return 0;
+
+stm_unregister:
+	stm_unregister_device(&drvdata->stm);
+	return ret;
+}
+
+#ifdef CONFIG_PM
+static int stm_runtime_suspend(struct device *dev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev);
+
+	if (drvdata && !IS_ERR(drvdata->atclk))
+		clk_disable_unprepare(drvdata->atclk);
+
+	return 0;
+}
+
+static int stm_runtime_resume(struct device *dev)
+{
+	struct stm_drvdata *drvdata = dev_get_drvdata(dev);
+
+	if (drvdata && !IS_ERR(drvdata->atclk))
+		clk_prepare_enable(drvdata->atclk);
+
+	return 0;
+}
+#endif
+
+static const struct dev_pm_ops stm_dev_pm_ops = {
+	SET_RUNTIME_PM_OPS(stm_runtime_suspend, stm_runtime_resume, NULL)
+};
+
+static struct amba_id stm_ids[] = {
+	{
+		.id     = 0x0003b962,
+		.mask   = 0x0003ffff,
+		.data	= "STM32",
+	},
+	{ 0, 0},
+};
+
+static struct amba_driver stm_driver = {
+	.drv = {
+		.name   = "coresight-stm",
+		.owner	= THIS_MODULE,
+		.pm	= &stm_dev_pm_ops,
+		.suppress_bind_attrs = true,
+	},
+	.probe          = stm_probe,
+	.id_table	= stm_ids,
+};
+
+builtin_amba_driver(stm_driver);
diff --git a/include/linux/coresight-stm.h b/include/linux/coresight-stm.h
new file mode 100644
index 000000000000..a978bb85599a
--- /dev/null
+++ b/include/linux/coresight-stm.h
@@ -0,0 +1,6 @@
+#ifndef __LINUX_CORESIGHT_STM_H_
+#define __LINUX_CORESIGHT_STM_H_
+
+#include <uapi/linux/coresight-stm.h>
+
+#endif
diff --git a/include/uapi/linux/coresight-stm.h b/include/uapi/linux/coresight-stm.h
new file mode 100644
index 000000000000..7e4272cf1fb2
--- /dev/null
+++ b/include/uapi/linux/coresight-stm.h
@@ -0,0 +1,21 @@
+#ifndef __UAPI_CORESIGHT_STM_H_
+#define __UAPI_CORESIGHT_STM_H_
+
+#define STM_FLAG_TIMESTAMPED   BIT(3)
+#define STM_FLAG_GUARANTEED    BIT(7)
+
+/*
+ * The CoreSight STM supports guaranteed and invariant timing
+ * transactions.  Guaranteed transactions are guaranteed to be
+ * traced, this might involve stalling the bus or system to
+ * ensure the transaction is accepted by the STM.  While invariant
+ * timing transactions are not guaranteed to be traced, they
+ * will take an invariant amount of time regardless of the
+ * state of the STM.
+ */
+enum {
+	STM_OPTION_GUARANTEED = 0,
+	STM_OPTION_INVARIANT,
+};
+
+#endif
-- 
cgit v1.2.3


From 18d28819809909c3f24bb72183a901c5e332a63d Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Tue, 3 May 2016 09:46:22 +0200
Subject: mcb: Correctly initialize the bus's device

The mcb bus' device member wasn't correctly initialized and thus wasn't placed
correctly into the driver model.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Andreas Werner <andreas.werner@men.de>
Tested-by: Andreas Werner <andreas.werner@men.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mcb/mcb-core.c | 19 ++++++++++++++++---
 include/linux/mcb.h    |  5 ++---
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c
index a4be451074e5..1e336cc56751 100644
--- a/drivers/mcb/mcb-core.c
+++ b/drivers/mcb/mcb-core.c
@@ -187,6 +187,7 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier)
 {
 	struct mcb_bus *bus;
 	int bus_nr;
+	int rc;
 
 	bus = kzalloc(sizeof(struct mcb_bus), GFP_KERNEL);
 	if (!bus)
@@ -194,14 +195,26 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier)
 
 	bus_nr = ida_simple_get(&mcb_ida, 0, 0, GFP_KERNEL);
 	if (bus_nr < 0) {
-		kfree(bus);
-		return ERR_PTR(bus_nr);
+		rc = bus_nr;
+		goto err_free;
 	}
 
-	INIT_LIST_HEAD(&bus->children);
 	bus->bus_nr = bus_nr;
 	bus->carrier = carrier;
+
+	device_initialize(&bus->dev);
+	bus->dev.parent = carrier;
+	bus->dev.bus = &mcb_bus_type;
+
+	dev_set_name(&bus->dev, "mcb:%d", bus_nr);
+	rc = device_add(&bus->dev);
+	if (rc)
+		goto err_free;
+
 	return bus;
+err_free:
+	kfree(bus);
+	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL_GPL(mcb_alloc_bus);
 
diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index ed06e15a36aa..3efafbca166d 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -21,13 +21,12 @@ struct mcb_device;
 /**
  * struct mcb_bus - MEN Chameleon Bus
  *
- * @dev: pointer to carrier device
- * @children: the child busses
+ * @dev: bus device
+ * @carrier: pointer to carrier device
  * @bus_nr: mcb bus number
  * @get_irq: callback to get IRQ number
  */
 struct mcb_bus {
-	struct list_head children;
 	struct device dev;
 	struct device *carrier;
 	int bus_nr;
-- 
cgit v1.2.3


From 803f1ca60d5c0107adfbce4e2d70488598b03a80 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Tue, 3 May 2016 09:46:23 +0200
Subject: mcb: export bus information via sysfs

Export information about the bus stored in the FPGA's header to userspace via
sysfs, instead of hiding it in pr_debug()s from everyone.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Andreas Werner <andreas.werner@men.de>
Tested-by: Andreas Werner <andreas.werner@men.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-mcb | 29 ++++++++++++++++
 drivers/mcb/mcb-core.c                  | 60 +++++++++++++++++++++++++++++++++
 drivers/mcb/mcb-internal.h              |  1 -
 drivers/mcb/mcb-parse.c                 | 15 +++------
 include/linux/mcb.h                     |  9 +++++
 5 files changed, 103 insertions(+), 11 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-mcb

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-mcb b/Documentation/ABI/testing/sysfs-bus-mcb
new file mode 100644
index 000000000000..77947c509796
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-mcb
@@ -0,0 +1,29 @@
+What:		/sys/bus/mcb/devices/mcb:X
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Johannes Thumshirn <jth@kernel.org>
+Description:	Hardware chip or device hosting the MEN chameleon bus
+
+What:		/sys/bus/mcb/devices/mcb:X/revision
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Johannes Thumshirn <jth@kernel.org>
+Description:	The FPGA's revision number
+
+What:		/sys/bus/mcb/devices/mcb:X/minor
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Johannes Thumshirn <jth@kernel.org>
+Description:	The FPGA's minor number
+
+What:		/sys/bus/mcb/devices/mcb:X/model
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Johannes Thumshirn <jth@kernel.org>
+Description:	The FPGA's model number
+
+What:		/sys/bus/mcb/devices/mcb:X/name
+Date:		March 2016
+KernelVersion:	4.7
+Contact:	Johannes Thumshirn <jth@kernel.org>
+Description:	The FPGA's name
diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c
index 1e336cc56751..9ae4d15fc229 100644
--- a/drivers/mcb/mcb-core.c
+++ b/drivers/mcb/mcb-core.c
@@ -90,6 +90,60 @@ static void mcb_shutdown(struct device *dev)
 		mdrv->shutdown(mdev);
 }
 
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct mcb_bus *bus = to_mcb_bus(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", bus->revision);
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t model_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct mcb_bus *bus = to_mcb_bus(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%c\n", bus->model);
+}
+static DEVICE_ATTR_RO(model);
+
+static ssize_t minor_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct mcb_bus *bus = to_mcb_bus(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", bus->minor);
+}
+static DEVICE_ATTR_RO(minor);
+
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct mcb_bus *bus = to_mcb_bus(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", bus->name);
+}
+static DEVICE_ATTR_RO(name);
+
+static struct attribute *mcb_bus_attrs[] = {
+	&dev_attr_revision.attr,
+	&dev_attr_model.attr,
+	&dev_attr_minor.attr,
+	&dev_attr_name.attr,
+	NULL,
+};
+
+static const struct attribute_group mcb_carrier_group = {
+	.attrs = mcb_bus_attrs,
+};
+
+static const struct attribute_group *mcb_carrier_groups[] = {
+	&mcb_carrier_group,
+	NULL,
+};
+
+
 static struct bus_type mcb_bus_type = {
 	.name = "mcb",
 	.match = mcb_match,
@@ -99,6 +153,11 @@ static struct bus_type mcb_bus_type = {
 	.shutdown = mcb_shutdown,
 };
 
+static struct device_type mcb_carrier_device_type = {
+	.name = "mcb-carrier",
+	.groups = mcb_carrier_groups,
+};
+
 /**
  * __mcb_register_driver() - Register a @mcb_driver at the system
  * @drv: The @mcb_driver
@@ -205,6 +264,7 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier)
 	device_initialize(&bus->dev);
 	bus->dev.parent = carrier;
 	bus->dev.bus = &mcb_bus_type;
+	bus->dev.type = &mcb_carrier_device_type;
 
 	dev_set_name(&bus->dev, "mcb:%d", bus_nr);
 	rc = device_add(&bus->dev);
diff --git a/drivers/mcb/mcb-internal.h b/drivers/mcb/mcb-internal.h
index fb7493dcfb79..5254e0285725 100644
--- a/drivers/mcb/mcb-internal.h
+++ b/drivers/mcb/mcb-internal.h
@@ -5,7 +5,6 @@
 
 #define PCI_VENDOR_ID_MEN		0x1a88
 #define PCI_DEVICE_ID_MEN_CHAMELEON	0x4d45
-#define CHAMELEON_FILENAME_LEN		12
 #define CHAMELEONV2_MAGIC		0xabce
 #define CHAM_HEADER_SIZE		0x200
 
diff --git a/drivers/mcb/mcb-parse.c b/drivers/mcb/mcb-parse.c
index 004926955263..35f385b59221 100644
--- a/drivers/mcb/mcb-parse.c
+++ b/drivers/mcb/mcb-parse.c
@@ -113,16 +113,11 @@ int chameleon_parse_cells(struct mcb_bus *bus, phys_addr_t mapbase,
 	}
 	p += hsize;
 
-	pr_debug("header->revision = %d\n", header->revision);
-	pr_debug("header->model = 0x%x ('%c')\n", header->model,
-		header->model);
-	pr_debug("header->minor = %d\n", header->minor);
-	pr_debug("header->bus_type = 0x%x\n", header->bus_type);
-
-
-	pr_debug("header->magic = 0x%x\n", header->magic);
-	pr_debug("header->filename = \"%.*s\"\n", CHAMELEON_FILENAME_LEN,
-		header->filename);
+	bus->revision = header->revision;
+	bus->model = header->model;
+	bus->minor = header->minor;
+	snprintf(bus->name, CHAMELEON_FILENAME_LEN + 1, "%s",
+		 header->filename);
 
 	for_each_chameleon_cell(dtype, p) {
 		switch (dtype) {
diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index 3efafbca166d..ead13d233a97 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -15,6 +15,8 @@
 #include <linux/device.h>
 #include <linux/irqreturn.h>
 
+#define CHAMELEON_FILENAME_LEN 12
+
 struct mcb_driver;
 struct mcb_device;
 
@@ -25,11 +27,18 @@ struct mcb_device;
  * @carrier: pointer to carrier device
  * @bus_nr: mcb bus number
  * @get_irq: callback to get IRQ number
+ * @revision: the FPGA's revision number
+ * @model: the FPGA's model number
+ * @filename: the FPGA's name
  */
 struct mcb_bus {
 	struct device dev;
 	struct device *carrier;
 	int bus_nr;
+	u8 revision;
+	char model;
+	u8 minor;
+	char name[CHAMELEON_FILENAME_LEN + 1];
 	int (*get_irq)(struct mcb_device *dev);
 };
 #define to_mcb_bus(b) container_of((b), struct mcb_bus, dev)
-- 
cgit v1.2.3


From efdc810ba39dae0ccce9cb9c1c84ff9b0157ca43 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Tue, 3 May 2016 17:13:54 +0300
Subject: net/mlx5: Flow steering, Add vport ACL support

Update the relevant flow steering device structs and commands to
support vport.
Update the flow steering core API to receive vport number.
Add ingress and egress ACL flow table name spaces.
Add ACL flow table support:
* ACL (Access Control List) flow table is a table that contains
only allow/drop steering rules.

* We have two types of ACL flow tables - ingress and egress.

* ACLs handle traffic sent from/to E-Switch FDB table, Ingress refers to
traffic sent from Vport to E-Switch and Egress refers to traffic sent
from E-Switch to vport.

* Ingress ACL flow table allow/drop rules is checked against traffic
sent from VF.

* Egress ACL flow table allow/drop rules is checked against traffic sent
to VF.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   | 33 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h   |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 85 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  7 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  2 +
 include/linux/mlx5/device.h                        | 12 +++
 include/linux/mlx5/driver.h                        |  2 +
 include/linux/mlx5/fs.h                            |  7 ++
 9 files changed, 142 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index ff91bb5e1c43..dd066199d172 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -845,7 +845,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
 	int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
-	int total_vports = 1 + pci_sriov_get_totalvfs(dev->pdev);
+	int total_vports = MLX5_TOTAL_VPORTS(dev);
 	struct mlx5_eswitch *esw;
 	int vport_num;
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index f46f1db0fc00..9797768891ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -50,6 +50,10 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_SET_FLOW_TABLE_ROOT);
 	MLX5_SET(set_flow_table_root_in, in, table_type, ft->type);
 	MLX5_SET(set_flow_table_root_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport);
+		MLX5_SET(set_flow_table_root_in, in, other_vport, 1);
+	}
 
 	memset(out, 0, sizeof(out));
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
@@ -57,6 +61,7 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 }
 
 int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
+			       u16 vport,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
 			       *next_ft, unsigned int *table_id)
@@ -77,6 +82,10 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 	MLX5_SET(create_flow_table_in, in, table_type, type);
 	MLX5_SET(create_flow_table_in, in, level, level);
 	MLX5_SET(create_flow_table_in, in, log_size, log_size);
+	if (vport) {
+		MLX5_SET(create_flow_table_in, in, vport_number, vport);
+		MLX5_SET(create_flow_table_in, in, other_vport, 1);
+	}
 
 	memset(out, 0, sizeof(out));
 	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
@@ -101,6 +110,10 @@ int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_DESTROY_FLOW_TABLE);
 	MLX5_SET(destroy_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(destroy_flow_table_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_table_in, in, other_vport, 1);
+	}
 
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
 					  sizeof(out));
@@ -120,6 +133,10 @@ int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_MODIFY_FLOW_TABLE);
 	MLX5_SET(modify_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(modify_flow_table_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport);
+		MLX5_SET(modify_flow_table_in, in, other_vport, 1);
+	}
 	MLX5_SET(modify_flow_table_in, in, modify_field_select,
 		 MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID);
 	if (next_ft) {
@@ -148,6 +165,10 @@ int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_CREATE_FLOW_GROUP);
 	MLX5_SET(create_flow_group_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_group_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(create_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(create_flow_group_in, in, other_vport, 1);
+	}
 
 	err = mlx5_cmd_exec_check_status(dev, in,
 					 inlen, out,
@@ -174,6 +195,10 @@ int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev,
 	MLX5_SET(destroy_flow_group_in, in, table_type, ft->type);
 	MLX5_SET(destroy_flow_group_in, in, table_id, ft->id);
 	MLX5_SET(destroy_flow_group_in, in, group_id, group_id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_group_in, in, other_vport, 1);
+	}
 
 	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
 					  sizeof(out));
@@ -207,6 +232,10 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(set_fte_in, in, table_type, ft->type);
 	MLX5_SET(set_fte_in, in, table_id,   ft->id);
 	MLX5_SET(set_fte_in, in, flow_index, fte->index);
+	if (ft->vport) {
+		MLX5_SET(set_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(set_fte_in, in, other_vport, 1);
+	}
 
 	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
@@ -285,6 +314,10 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(delete_fte_in, in, table_type, ft->type);
 	MLX5_SET(delete_fte_in, in, table_id, ft->id);
 	MLX5_SET(delete_fte_in, in, flow_index, index);
+	if (ft->vport) {
+		MLX5_SET(delete_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(delete_fte_in, in, other_vport, 1);
+	}
 
 	err =  mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 9814d4784803..c97b4a03eeed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -34,6 +34,7 @@
 #define _MLX5_FS_CMD_
 
 int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
+			       u16 vport,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
 			       *next_ft, unsigned int *table_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 4d78d5a48af3..659a6980cda2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -457,7 +457,7 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
 	return fg;
 }
 
-static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte,
+static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte,
 						enum fs_flow_table_type table_type)
 {
 	struct mlx5_flow_table *ft;
@@ -469,6 +469,7 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte,
 	ft->level = level;
 	ft->node.type = FS_TYPE_FLOW_TABLE;
 	ft->type = table_type;
+	ft->vport = vport;
 	ft->max_fte = max_fte;
 	INIT_LIST_HEAD(&ft->fwd_rules);
 	mutex_init(&ft->lock);
@@ -700,9 +701,9 @@ static void list_add_flow_table(struct mlx5_flow_table *ft,
 	list_add(&ft->node.list, prev);
 }
 
-struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
-					       int prio, int max_fte,
-					       u32 level)
+static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+							u16 vport, int prio,
+							int max_fte, u32 level)
 {
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_table *ft;
@@ -732,6 +733,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 	 */
 	level += fs_prio->start_level;
 	ft = alloc_flow_table(level,
+			      vport,
 			      roundup_pow_of_two(max_fte),
 			      root->table_type);
 	if (!ft) {
@@ -742,7 +744,7 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 	tree_init_node(&ft->node, 1, del_flow_table);
 	log_table_sz = ilog2(ft->max_fte);
 	next_ft = find_next_chained_ft(fs_prio);
-	err = mlx5_cmd_create_flow_table(root->dev, ft->type, ft->level,
+	err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->type, ft->level,
 					 log_table_sz, next_ft, &ft->id);
 	if (err)
 		goto free_ft;
@@ -766,6 +768,20 @@ unlock_root:
 	return ERR_PTR(err);
 }
 
+struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+					       int prio, int max_fte,
+					       u32 level)
+{
+	return __mlx5_create_flow_table(ns, 0, prio, max_fte, level);
+}
+
+struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+						     int prio, int max_fte,
+						     u32 level, u16 vport)
+{
+	return __mlx5_create_flow_table(ns, vport, prio, max_fte, level);
+}
+
 struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 							    int prio,
 							    int num_flow_table_entries,
@@ -1319,6 +1335,16 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			return &dev->priv.fdb_root_ns->ns;
 		else
 			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
+		if (dev->priv.esw_egress_root_ns)
+			return &dev->priv.esw_egress_root_ns->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
+		if (dev->priv.esw_ingress_root_ns)
+			return &dev->priv.esw_ingress_root_ns->ns;
+		else
+			return NULL;
 	default:
 		return NULL;
 	}
@@ -1699,6 +1725,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 {
 	cleanup_root_ns(dev);
 	cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns);
+	cleanup_single_prio_root_ns(dev, dev->priv.esw_egress_root_ns);
+	cleanup_single_prio_root_ns(dev, dev->priv.esw_ingress_root_ns);
 }
 
 static int init_fdb_root_ns(struct mlx5_core_dev *dev)
@@ -1719,6 +1747,38 @@ static int init_fdb_root_ns(struct mlx5_core_dev *dev)
 	}
 }
 
+static int init_egress_acl_root_ns(struct mlx5_core_dev *dev)
+{
+	struct fs_prio *prio;
+
+	dev->priv.esw_egress_root_ns = create_root_ns(dev, FS_FT_ESW_EGRESS_ACL);
+	if (!dev->priv.esw_egress_root_ns)
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&dev->priv.esw_egress_root_ns->ns, 0, MLX5_TOTAL_VPORTS(dev));
+	if (IS_ERR(prio))
+		return PTR_ERR(prio);
+	else
+		return 0;
+}
+
+static int init_ingress_acl_root_ns(struct mlx5_core_dev *dev)
+{
+	struct fs_prio *prio;
+
+	dev->priv.esw_ingress_root_ns = create_root_ns(dev, FS_FT_ESW_INGRESS_ACL);
+	if (!dev->priv.esw_ingress_root_ns)
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&dev->priv.esw_ingress_root_ns->ns, 0, MLX5_TOTAL_VPORTS(dev));
+	if (IS_ERR(prio))
+		return PTR_ERR(prio);
+	else
+		return 0;
+}
+
 int mlx5_init_fs(struct mlx5_core_dev *dev)
 {
 	int err = 0;
@@ -1731,8 +1791,21 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
 		err = init_fdb_root_ns(dev);
 		if (err)
-			cleanup_root_ns(dev);
+			goto err;
+	}
+	if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
+		err = init_egress_acl_root_ns(dev);
+		if (err)
+			goto err;
+	}
+	if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
+		err = init_ingress_acl_root_ns(dev);
+		if (err)
+			goto err;
 	}
 
+	return 0;
+err:
+	mlx5_cleanup_fs(dev);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index d607e564f454..8e76cc505f5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -45,8 +45,10 @@ enum fs_node_type {
 };
 
 enum fs_flow_table_type {
-	FS_FT_NIC_RX	 = 0x0,
-	FS_FT_FDB	 = 0X4,
+	FS_FT_NIC_RX          = 0x0,
+	FS_FT_ESW_EGRESS_ACL  = 0x2,
+	FS_FT_ESW_INGRESS_ACL = 0x3,
+	FS_FT_FDB             = 0X4,
 };
 
 enum fs_fte_status {
@@ -79,6 +81,7 @@ struct mlx5_flow_rule {
 struct mlx5_flow_table {
 	struct fs_node			node;
 	u32				id;
+	u16				vport;
 	unsigned int			max_fte;
 	unsigned int			level;
 	enum fs_flow_table_type		type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 0b0b226c789e..482604bd051c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -42,6 +42,8 @@
 #define DRIVER_VERSION "3.0-1"
 #define DRIVER_RELDATE  "January 2015"
 
+#define MLX5_TOTAL_VPORTS(mdev) (1 + pci_sriov_get_totalvfs(mdev->pdev))
+
 extern int mlx5_core_debug_mask;
 
 #define mlx5_core_dbg(__dev, format, ...)				\
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8fecd6d6f814..ee0d5a937f02 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1349,6 +1349,18 @@ enum mlx5_cap_type {
 #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap)
 
+#define MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_egress.cap)
+
+#define MLX5_CAP_ESW_EGRESS_ACL_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_egress.cap)
+
+#define MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_ingress.cap)
+
+#define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap)
+
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
 		 mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d5529449ef47..9613143f0561 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -518,6 +518,8 @@ struct mlx5_priv {
 	unsigned long		pci_dev_data;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
+	struct mlx5_flow_root_namespace *esw_egress_root_ns;
+	struct mlx5_flow_root_namespace *esw_ingress_root_ns;
 };
 
 enum mlx5_device_state {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 165ff4f9cc6a..6467569ad76e 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -58,6 +58,8 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_LEFTOVERS,
 	MLX5_FLOW_NAMESPACE_ANCHOR,
 	MLX5_FLOW_NAMESPACE_FDB,
+	MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+	MLX5_FLOW_NAMESPACE_ESW_INGRESS,
 };
 
 struct mlx5_flow_table;
@@ -90,6 +92,11 @@ mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
 		       int num_flow_table_entries,
 		       u32 level);
+struct mlx5_flow_table *
+mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+			     int prio,
+			     int num_flow_table_entries,
+			     u32 level, u16 vport);
 int mlx5_destroy_flow_table(struct mlx5_flow_table *ft);
 
 /* inbox should be set with the following values:
-- 
cgit v1.2.3


From ba162f8eed61a7e71e26455ce1cff5b5898a3579 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 16:31:00 +0200
Subject: netdevice: add helper to update trans_start

trans_start exists twice:
- as member of net_device (legacy)
- as member of netdev_queue

In order to get rid of the legacy case, add a helper for the
dev->trans_update (this patch), then convert spots that do

dev->trans_start = jiffies

to use this helper (next patch).

This would then allow us to change the helper so that it updates the
trans_stamp of netdev queue 0 instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcf012637d10..f53412cccbaa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3481,6 +3481,12 @@ static inline void txq_trans_update(struct netdev_queue *txq)
 		txq->trans_start = jiffies;
 }
 
+/* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
+static inline void netif_trans_update(struct net_device *dev)
+{
+	dev->trans_start = jiffies;
+}
+
 /**
  *	netif_tx_lock - grab network device transmit lock
  *	@dev: network device
-- 
cgit v1.2.3


From 9b36627acecd5792e81daf1a3bff8eab39ed45fb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 3 May 2016 16:33:14 +0200
Subject: net: remove dev->trans_start

previous patches removed all direct accesses to dev->trans_start,
so change the netif_trans_update helper to update trans_start of
netdev queue 0 instead and then remove trans_start from struct net_device.

AFAICS a lot of the netif_trans_update() invocations are now useless
because they occur in ndo_start_xmit and driver doesn't set LLTX
(i.e. stack already took care of the update).

As I can't test any of them it seems better to just leave them alone.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  2 +-
 include/linux/netdevice.h                   | 15 +++++----------
 net/sched/sch_generic.c                     | 10 +++-------
 3 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8e6c0f2487d7..f6da6b76e678 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -328,7 +328,7 @@ static void i40e_tx_timeout(struct net_device *netdev)
 		unsigned long trans_start;
 
 		q = netdev_get_tx_queue(netdev, i);
-		trans_start = q->trans_start ? : netdev->trans_start;
+		trans_start = q->trans_start;
 		if (netif_xmit_stopped(q) &&
 		    time_after(jiffies,
 			       (trans_start + netdev->watchdog_timeo))) {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f53412cccbaa..63580e6d0df4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -581,7 +581,7 @@ struct netdev_queue {
 	spinlock_t		_xmit_lock ____cacheline_aligned_in_smp;
 	int			xmit_lock_owner;
 	/*
-	 * please use this field instead of dev->trans_start
+	 * Time (in jiffies) of last Tx
 	 */
 	unsigned long		trans_start;
 
@@ -1545,7 +1545,6 @@ enum netdev_priv_flags {
  *
  *	@offload_fwd_mark:	Offload device fwding mark
  *
- *	@trans_start:		Time (in jiffies) of last Tx
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog (see dev_watchdog())
  *	@watchdog_timer:	List of timers
@@ -1794,13 +1793,6 @@ struct net_device {
 #endif
 
 	/* These may be needed for future network-power-down code. */
-
-	/*
-	 * trans_start here is expensive for high speed devices on SMP,
-	 * please use netdev_queue->trans_start instead.
-	 */
-	unsigned long		trans_start;
-
 	struct timer_list	watchdog_timer;
 
 	int __percpu		*pcpu_refcnt;
@@ -3484,7 +3476,10 @@ static inline void txq_trans_update(struct netdev_queue *txq)
 /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
 static inline void netif_trans_update(struct net_device *dev)
 {
-	dev->trans_start = jiffies;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+
+	if (txq->trans_start != jiffies)
+		txq->trans_start = jiffies;
 }
 
 /**
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 70182cfe119c..269dd71b3828 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -227,13 +227,12 @@ unsigned long dev_trans_start(struct net_device *dev)
 
 	if (is_vlan_dev(dev))
 		dev = vlan_dev_real_dev(dev);
-	res = dev->trans_start;
-	for (i = 0; i < dev->num_tx_queues; i++) {
+	res = netdev_get_tx_queue(dev, 0)->trans_start;
+	for (i = 1; i < dev->num_tx_queues; i++) {
 		val = netdev_get_tx_queue(dev, i)->trans_start;
 		if (val && time_after(val, res))
 			res = val;
 	}
-	dev->trans_start = res;
 
 	return res;
 }
@@ -256,10 +255,7 @@ static void dev_watchdog(unsigned long arg)
 				struct netdev_queue *txq;
 
 				txq = netdev_get_tx_queue(dev, i);
-				/*
-				 * old device drivers set dev->trans_start
-				 */
-				trans_start = txq->trans_start ? : dev->trans_start;
+				trans_start = txq->trans_start;
 				if (netif_xmit_stopped(txq) &&
 				    time_after(jiffies, (trans_start +
 							 dev->watchdog_timeo))) {
-- 
cgit v1.2.3


From 8320f495cf441d593f7cd4f30e6b63455be71a2c Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 4 May 2016 22:15:27 +0200
Subject: i2c: allow adapter drivers to override the adapter locking

Add i2c_lock_bus() and i2c_unlock_bus(), which call the new lock_bus and
unlock_bus ops in the adapter. These funcs/ops take an additional flags
argument that indicates for what purpose the adapter is locked.

There are two flags, I2C_LOCK_ROOT_ADAPTER and I2C_LOCK_SEGMENT, but they
are both implemented the same. For now. Locking the root adapter means
that the whole bus is locked, locking the segment means that only the
current bus segment is locked (i.e. i2c traffic on the parent side of
a mux is still allowed even if the child side of the mux is locked).

Also support a trylock_bus op (but no function to call it, as it is not
expected to be needed outside of the i2c core).

Implement i2c_lock_adapter/i2c_unlock_adapter in terms of the new locking
scheme (i.e. lock with the I2C_LOCK_ROOT_ADAPTER flag).

Locking the root adapter and locking the segment is the same thing for
all root adapters (e.g. in the normal case of a simple topology with no
i2c muxes). The two locking variants are also the same for traditional
muxes (aka parent-locked muxes). These muxes traverse the tree, locking
each level as they go until they reach the root. This patch is preparatory
for a later patch in the series introducing mux-locked muxes, which behave
differently depending on the requested locking. Since all current users
are using i2c_lock_adapter, which is a wrapper for I2C_LOCK_ROOT_ADAPTER,
we only need to annotate the calls that will not need to lock the root
adapter for mux-locked muxes. I.e. the instances that needs to use
I2C_LOCK_SEGMENT instead of i2c_lock_adapter/I2C_LOCK_ROOT_ADAPTER. Those
instances are in the i2c_transfer and i2c_smbus_xfer functions, so that
mux-locked muxes can single out normal i2c accesses to its slave side
and adjust the locking for those accesses.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core.c | 41 +++++++++++++++++++++++++++--------------
 include/linux/i2c.h    | 44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 4979728f7fb2..7ef5bd085476 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -954,10 +954,13 @@ static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
 }
 
 /**
- * i2c_lock_adapter - Get exclusive access to an I2C bus segment
+ * i2c_adapter_lock_bus - Get exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	locks only this branch in the adapter tree
  */
-void i2c_lock_adapter(struct i2c_adapter *adapter)
+static void i2c_adapter_lock_bus(struct i2c_adapter *adapter,
+				 unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 
@@ -966,27 +969,32 @@ void i2c_lock_adapter(struct i2c_adapter *adapter)
 	else
 		rt_mutex_lock(&adapter->bus_lock);
 }
-EXPORT_SYMBOL_GPL(i2c_lock_adapter);
 
 /**
- * i2c_trylock_adapter - Try to get exclusive access to an I2C bus segment
+ * i2c_adapter_trylock_bus - Try to get exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER trylocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	trylocks only this branch in the adapter tree
  */
-static int i2c_trylock_adapter(struct i2c_adapter *adapter)
+static int i2c_adapter_trylock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 
 	if (parent)
-		return i2c_trylock_adapter(parent);
+		return parent->trylock_bus(parent, flags);
 	else
 		return rt_mutex_trylock(&adapter->bus_lock);
 }
 
 /**
- * i2c_unlock_adapter - Release exclusive access to an I2C bus segment
+ * i2c_adapter_unlock_bus - Release exclusive access to an I2C bus segment
  * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	unlocks only this branch in the adapter tree
  */
-void i2c_unlock_adapter(struct i2c_adapter *adapter)
+static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
 {
 	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 
@@ -995,7 +1003,6 @@ void i2c_unlock_adapter(struct i2c_adapter *adapter)
 	else
 		rt_mutex_unlock(&adapter->bus_lock);
 }
-EXPORT_SYMBOL_GPL(i2c_unlock_adapter);
 
 static void i2c_dev_set_name(struct i2c_adapter *adap,
 			     struct i2c_client *client)
@@ -1541,6 +1548,12 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 		return -EINVAL;
 	}
 
+	if (!adap->lock_bus) {
+		adap->lock_bus = i2c_adapter_lock_bus;
+		adap->trylock_bus = i2c_adapter_trylock_bus;
+		adap->unlock_bus = i2c_adapter_unlock_bus;
+	}
+
 	rt_mutex_init(&adap->bus_lock);
 	mutex_init(&adap->userspace_clients_lock);
 	INIT_LIST_HEAD(&adap->userspace_clients);
@@ -2310,16 +2323,16 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 #endif
 
 		if (in_atomic() || irqs_disabled()) {
-			ret = i2c_trylock_adapter(adap);
+			ret = adap->trylock_bus(adap, I2C_LOCK_SEGMENT);
 			if (!ret)
 				/* I2C activity is ongoing. */
 				return -EAGAIN;
 		} else {
-			i2c_lock_adapter(adap);
+			i2c_lock_bus(adap, I2C_LOCK_SEGMENT);
 		}
 
 		ret = __i2c_transfer(adap, msgs, num);
-		i2c_unlock_adapter(adap);
+		i2c_unlock_bus(adap, I2C_LOCK_SEGMENT);
 
 		return ret;
 	} else {
@@ -3094,7 +3107,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC | I2C_CLIENT_SCCB;
 
 	if (adapter->algo->smbus_xfer) {
-		i2c_lock_adapter(adapter);
+		i2c_lock_bus(adapter, I2C_LOCK_SEGMENT);
 
 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
@@ -3108,7 +3121,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		i2c_unlock_adapter(adapter);
+		i2c_unlock_bus(adapter, I2C_LOCK_SEGMENT);
 
 		if (res != -EOPNOTSUPP || !adapter->algo->master_xfer)
 			goto trace;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index c30833b7b073..50934d6e1050 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -538,6 +538,10 @@ struct i2c_adapter {
 
 	struct i2c_bus_recovery_info *bus_recovery_info;
 	const struct i2c_adapter_quirks *quirks;
+
+	void (*lock_bus)(struct i2c_adapter *, unsigned int flags);
+	int (*trylock_bus)(struct i2c_adapter *, unsigned int flags);
+	void (*unlock_bus)(struct i2c_adapter *, unsigned int flags);
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
@@ -567,8 +571,44 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
 int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *));
 
 /* Adapter locking functions, exported for shared pin cases */
-void i2c_lock_adapter(struct i2c_adapter *);
-void i2c_unlock_adapter(struct i2c_adapter *);
+#define I2C_LOCK_ROOT_ADAPTER BIT(0)
+#define I2C_LOCK_SEGMENT      BIT(1)
+
+/**
+ * i2c_lock_bus - Get exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	locks only this branch in the adapter tree
+ */
+static inline void
+i2c_lock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	adapter->lock_bus(adapter, flags);
+}
+
+/**
+ * i2c_unlock_bus - Release exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT
+ *	unlocks only this branch in the adapter tree
+ */
+static inline void
+i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	adapter->unlock_bus(adapter, flags);
+}
+
+static inline void
+i2c_lock_adapter(struct i2c_adapter *adapter)
+{
+	i2c_lock_bus(adapter, I2C_LOCK_ROOT_ADAPTER);
+}
+
+static inline void
+i2c_unlock_adapter(struct i2c_adapter *adapter)
+{
+	i2c_unlock_bus(adapter, I2C_LOCK_ROOT_ADAPTER);
+}
 
 /*flags for the client struct: */
 #define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
-- 
cgit v1.2.3


From 6ef91fcca8a8ba3df9810a4cc6cd6a9d3f21bf45 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 4 May 2016 22:15:29 +0200
Subject: i2c: mux: relax locking of the top i2c adapter during mux-locked
 muxing

With a i2c topology like the following

                       GPIO ---|  ------ BAT1
                        |      v /
   I2C  -----+----------+---- MUX
             |                   \
           EEPROM                 ------ BAT2

there is a locking problem with the GPIO controller since it is a client
on the same i2c bus that it muxes. Transfers to the mux clients (e.g. BAT1)
will lock the whole i2c bus prior to attempting to switch the mux to the
correct i2c segment. In the above case, the GPIO device is an I/O expander
with an i2c interface, and since the GPIO subsystem knows nothing (and
rightfully so) about the lockless needs of the i2c mux code, this results
in a deadlock when the GPIO driver issues i2c transfers to modify the
mux.

So, observing that while it is needed to have the i2c bus locked during the
actual MUX update in order to avoid random garbage on the slave side, it
is not strictly a must to have it locked over the whole sequence of a full
select-transfer-deselect mux client operation. The mux itself needs to be
locked, so transfers to clients behind the mux are serialized, and the mux
needs to be stable during all i2c traffic (otherwise individual mux slave
segments might see garbage, or worse).

Introduce this new locking concept as "mux-locked" muxes, and call the
pre-existing mux locking scheme "parent-locked".

Modify the i2c mux locking so that muxes that are "mux-locked" locks only
the muxes on the parent adapter instead of the whole i2c bus when there is
a transfer to the slave side of the mux. This lock serializes transfers to
the slave side of the muxes on the parent adapter.

Add code to i2c-mux-gpio and i2c-mux-pinctrl that checks if all involved
gpio/pinctrl devices have a parent that is an i2c adapter in the same
adapter tree that is muxed, and request a "mux-locked mux" if that is the
case.

Modify the select-transfer-deselect code for "mux-locked" muxes so
that each of the select-transfer-deselect ops locks the mux parent
adapter individually.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core.c              |   1 +
 drivers/i2c/i2c-mux.c               | 152 +++++++++++++++++++++++++++++++++---
 drivers/i2c/muxes/i2c-mux-gpio.c    |  18 +++++
 drivers/i2c/muxes/i2c-mux-pinctrl.c |  38 +++++++++
 include/linux/i2c-mux.h             |   8 ++
 include/linux/i2c.h                 |   1 +
 6 files changed, 205 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index afdee66002db..9da446162529 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1540,6 +1540,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 	}
 
 	rt_mutex_init(&adap->bus_lock);
+	rt_mutex_init(&adap->mux_lock);
 	mutex_init(&adap->userspace_clients_lock);
 	INIT_LIST_HEAD(&adap->userspace_clients);
 
diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
index 5fa8af715e24..8eee98634cda 100644
--- a/drivers/i2c/i2c-mux.c
+++ b/drivers/i2c/i2c-mux.c
@@ -35,6 +35,25 @@ struct i2c_mux_priv {
 	u32 chan_id;
 };
 
+static int __i2c_mux_master_xfer(struct i2c_adapter *adap,
+				 struct i2c_msg msgs[], int num)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_mux_core *muxc = priv->muxc;
+	struct i2c_adapter *parent = muxc->parent;
+	int ret;
+
+	/* Switch to the right mux port and perform the transfer. */
+
+	ret = muxc->select(muxc, priv->chan_id);
+	if (ret >= 0)
+		ret = __i2c_transfer(parent, msgs, num);
+	if (muxc->deselect)
+		muxc->deselect(muxc, priv->chan_id);
+
+	return ret;
+}
+
 static int i2c_mux_master_xfer(struct i2c_adapter *adap,
 			       struct i2c_msg msgs[], int num)
 {
@@ -47,7 +66,29 @@ static int i2c_mux_master_xfer(struct i2c_adapter *adap,
 
 	ret = muxc->select(muxc, priv->chan_id);
 	if (ret >= 0)
-		ret = __i2c_transfer(parent, msgs, num);
+		ret = i2c_transfer(parent, msgs, num);
+	if (muxc->deselect)
+		muxc->deselect(muxc, priv->chan_id);
+
+	return ret;
+}
+
+static int __i2c_mux_smbus_xfer(struct i2c_adapter *adap,
+				u16 addr, unsigned short flags,
+				char read_write, u8 command,
+				int size, union i2c_smbus_data *data)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_mux_core *muxc = priv->muxc;
+	struct i2c_adapter *parent = muxc->parent;
+	int ret;
+
+	/* Select the right mux port and perform the transfer. */
+
+	ret = muxc->select(muxc, priv->chan_id);
+	if (ret >= 0)
+		ret = parent->algo->smbus_xfer(parent, addr, flags,
+					read_write, command, size, data);
 	if (muxc->deselect)
 		muxc->deselect(muxc, priv->chan_id);
 
@@ -68,8 +109,8 @@ static int i2c_mux_smbus_xfer(struct i2c_adapter *adap,
 
 	ret = muxc->select(muxc, priv->chan_id);
 	if (ret >= 0)
-		ret = parent->algo->smbus_xfer(parent, addr, flags,
-					read_write, command, size, data);
+		ret = i2c_smbus_xfer(parent, addr, flags,
+				     read_write, command, size, data);
 	if (muxc->deselect)
 		muxc->deselect(muxc, priv->chan_id);
 
@@ -98,13 +139,50 @@ static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent)
 	return class;
 }
 
+static void i2c_mux_lock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	struct i2c_mux_priv *priv = adapter->algo_data;
+	struct i2c_adapter *parent = priv->muxc->parent;
+
+	rt_mutex_lock(&parent->mux_lock);
+	if (!(flags & I2C_LOCK_ROOT_ADAPTER))
+		return;
+	i2c_lock_bus(parent, flags);
+}
+
+static int i2c_mux_trylock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	struct i2c_mux_priv *priv = adapter->algo_data;
+	struct i2c_adapter *parent = priv->muxc->parent;
+
+	if (!rt_mutex_trylock(&parent->mux_lock))
+		return 0;	/* mux_lock not locked, failure */
+	if (!(flags & I2C_LOCK_ROOT_ADAPTER))
+		return 1;	/* we only want mux_lock, success */
+	if (parent->trylock_bus(parent, flags))
+		return 1;	/* parent locked too, success */
+	rt_mutex_unlock(&parent->mux_lock);
+	return 0;		/* parent not locked, failure */
+}
+
+static void i2c_mux_unlock_bus(struct i2c_adapter *adapter, unsigned int flags)
+{
+	struct i2c_mux_priv *priv = adapter->algo_data;
+	struct i2c_adapter *parent = priv->muxc->parent;
+
+	if (flags & I2C_LOCK_ROOT_ADAPTER)
+		i2c_unlock_bus(parent, flags);
+	rt_mutex_unlock(&parent->mux_lock);
+}
+
 static void i2c_parent_lock_bus(struct i2c_adapter *adapter,
 				unsigned int flags)
 {
 	struct i2c_mux_priv *priv = adapter->algo_data;
 	struct i2c_adapter *parent = priv->muxc->parent;
 
-	parent->lock_bus(parent, flags);
+	rt_mutex_lock(&parent->mux_lock);
+	i2c_lock_bus(parent, flags);
 }
 
 static int i2c_parent_trylock_bus(struct i2c_adapter *adapter,
@@ -113,7 +191,12 @@ static int i2c_parent_trylock_bus(struct i2c_adapter *adapter,
 	struct i2c_mux_priv *priv = adapter->algo_data;
 	struct i2c_adapter *parent = priv->muxc->parent;
 
-	return parent->trylock_bus(parent, flags);
+	if (!rt_mutex_trylock(&parent->mux_lock))
+		return 0;	/* mux_lock not locked, failure */
+	if (parent->trylock_bus(parent, flags))
+		return 1;	/* parent locked too, success */
+	rt_mutex_unlock(&parent->mux_lock);
+	return 0;		/* parent not locked, failure */
 }
 
 static void i2c_parent_unlock_bus(struct i2c_adapter *adapter,
@@ -122,9 +205,36 @@ static void i2c_parent_unlock_bus(struct i2c_adapter *adapter,
 	struct i2c_mux_priv *priv = adapter->algo_data;
 	struct i2c_adapter *parent = priv->muxc->parent;
 
-	parent->unlock_bus(parent, flags);
+	i2c_unlock_bus(parent, flags);
+	rt_mutex_unlock(&parent->mux_lock);
 }
 
+struct i2c_adapter *i2c_root_adapter(struct device *dev)
+{
+	struct device *i2c;
+	struct i2c_adapter *i2c_root;
+
+	/*
+	 * Walk up the device tree to find an i2c adapter, indicating
+	 * that this is an i2c client device. Check all ancestors to
+	 * handle mfd devices etc.
+	 */
+	for (i2c = dev; i2c; i2c = i2c->parent) {
+		if (i2c->type == &i2c_adapter_type)
+			break;
+	}
+	if (!i2c)
+		return NULL;
+
+	/* Continue up the tree to find the root i2c adapter */
+	i2c_root = to_i2c_adapter(i2c);
+	while (i2c_parent_is_i2c_adapter(i2c_root))
+		i2c_root = i2c_parent_is_i2c_adapter(i2c_root);
+
+	return i2c_root;
+}
+EXPORT_SYMBOL_GPL(i2c_root_adapter);
+
 struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
 				   struct device *dev, int max_adapters,
 				   int sizeof_priv, u32 flags,
@@ -143,6 +253,8 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
 
 	muxc->parent = parent;
 	muxc->dev = dev;
+	if (flags & I2C_MUX_LOCKED)
+		muxc->mux_locked = true;
 	muxc->select = select;
 	muxc->deselect = deselect;
 	muxc->max_adapters = max_adapters;
@@ -176,10 +288,18 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
 	/* Need to do algo dynamically because we don't know ahead
 	 * of time what sort of physical adapter we'll be dealing with.
 	 */
-	if (parent->algo->master_xfer)
-		priv->algo.master_xfer = i2c_mux_master_xfer;
-	if (parent->algo->smbus_xfer)
-		priv->algo.smbus_xfer = i2c_mux_smbus_xfer;
+	if (parent->algo->master_xfer) {
+		if (muxc->mux_locked)
+			priv->algo.master_xfer = i2c_mux_master_xfer;
+		else
+			priv->algo.master_xfer = __i2c_mux_master_xfer;
+	}
+	if (parent->algo->smbus_xfer) {
+		if (muxc->mux_locked)
+			priv->algo.smbus_xfer = i2c_mux_smbus_xfer;
+		else
+			priv->algo.smbus_xfer = __i2c_mux_smbus_xfer;
+	}
 	priv->algo.functionality = i2c_mux_functionality;
 
 	/* Now fill out new adapter structure */
@@ -192,9 +312,15 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
 	priv->adap.retries = parent->retries;
 	priv->adap.timeout = parent->timeout;
 	priv->adap.quirks = parent->quirks;
-	priv->adap.lock_bus = i2c_parent_lock_bus;
-	priv->adap.trylock_bus = i2c_parent_trylock_bus;
-	priv->adap.unlock_bus = i2c_parent_unlock_bus;
+	if (muxc->mux_locked) {
+		priv->adap.lock_bus = i2c_mux_lock_bus;
+		priv->adap.trylock_bus = i2c_mux_trylock_bus;
+		priv->adap.unlock_bus = i2c_mux_unlock_bus;
+	} else {
+		priv->adap.lock_bus = i2c_parent_lock_bus;
+		priv->adap.trylock_bus = i2c_parent_trylock_bus;
+		priv->adap.unlock_bus = i2c_parent_unlock_bus;
+	}
 
 	/* Sanity check on class */
 	if (i2c_mux_parent_classes(parent) & class)
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index f6270ee934f9..e5cf26eefa97 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/gpio.h>
+#include "../../gpio/gpiolib.h"
 #include <linux/of_gpio.h>
 
 struct gpiomux {
@@ -137,6 +138,7 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	struct i2c_mux_core *muxc;
 	struct gpiomux *mux;
 	struct i2c_adapter *parent;
+	struct i2c_adapter *root;
 	unsigned initial_state, gpio_base;
 	int i, ret;
 
@@ -184,6 +186,9 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, muxc);
 
+	root = i2c_root_adapter(&parent->dev);
+
+	muxc->mux_locked = true;
 	mux->gpio_base = gpio_base;
 
 	if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) {
@@ -194,6 +199,9 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < mux->data.n_gpios; i++) {
+		struct device *gpio_dev;
+		struct gpio_desc *gpio_desc;
+
 		ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio");
 		if (ret) {
 			dev_err(&pdev->dev, "Failed to request GPIO %d\n",
@@ -210,8 +218,18 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 			i++;	/* gpio_request above succeeded, so must free */
 			goto err_request_gpio;
 		}
+
+		if (!muxc->mux_locked)
+			continue;
+
+		gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]);
+		gpio_dev = &gpio_desc->gdev->dev;
+		muxc->mux_locked = i2c_root_adapter(gpio_dev) == root;
 	}
 
+	if (muxc->mux_locked)
+		dev_info(&pdev->dev, "mux-locked i2c mux\n");
+
 	for (i = 0; i < mux->data.n_values; i++) {
 		u32 nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
 		unsigned int class = mux->data.classes ? mux->data.classes[i] : 0;
diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c
index f4e62f4a50cc..35bb775e1b74 100644
--- a/drivers/i2c/muxes/i2c-mux-pinctrl.c
+++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c
@@ -24,6 +24,7 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include "../../pinctrl/core.h"
 
 struct i2c_mux_pinctrl {
 	struct i2c_mux_pinctrl_platform_data *pdata;
@@ -120,10 +121,31 @@ static inline int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux,
 }
 #endif
 
+static struct i2c_adapter *i2c_mux_pinctrl_root_adapter(
+	struct pinctrl_state *state)
+{
+	struct i2c_adapter *root = NULL;
+	struct pinctrl_setting *setting;
+	struct i2c_adapter *pin_root;
+
+	list_for_each_entry(setting, &state->settings, node) {
+		pin_root = i2c_root_adapter(setting->pctldev->dev);
+		if (!pin_root)
+			return NULL;
+		if (!root)
+			root = pin_root;
+		else if (root != pin_root)
+			return NULL;
+	}
+
+	return root;
+}
+
 static int i2c_mux_pinctrl_probe(struct platform_device *pdev)
 {
 	struct i2c_mux_core *muxc;
 	struct i2c_mux_pinctrl *mux;
+	struct i2c_adapter *root;
 	int i, ret;
 
 	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
@@ -202,6 +224,22 @@ static int i2c_mux_pinctrl_probe(struct platform_device *pdev)
 		goto err;
 	}
 
+	root = i2c_root_adapter(&muxc->parent->dev);
+
+	muxc->mux_locked = true;
+	for (i = 0; i < mux->pdata->bus_count; i++) {
+		if (root != i2c_mux_pinctrl_root_adapter(mux->states[i])) {
+			muxc->mux_locked = false;
+			break;
+		}
+	}
+	if (muxc->mux_locked && mux->pdata->pinctrl_state_idle &&
+	    root != i2c_mux_pinctrl_root_adapter(mux->state_idle))
+		muxc->mux_locked = false;
+
+	if (muxc->mux_locked)
+		dev_info(&pdev->dev, "mux-locked i2c mux\n");
+
 	for (i = 0; i < mux->pdata->bus_count; i++) {
 		u32 bus = mux->pdata->base_bus_num ?
 				(mux->pdata->base_bus_num + i) : 0;
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index 2fa93fe1345e..d4c1d12f900d 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -27,9 +27,12 @@
 
 #ifdef __KERNEL__
 
+#include <linux/bitops.h>
+
 struct i2c_mux_core {
 	struct i2c_adapter *parent;
 	struct device *dev;
+	bool mux_locked;
 
 	void *priv;
 
@@ -47,11 +50,16 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent,
 				   int (*select)(struct i2c_mux_core *, u32),
 				   int (*deselect)(struct i2c_mux_core *, u32));
 
+/* flags for i2c_mux_alloc */
+#define I2C_MUX_LOCKED BIT(0)
+
 static inline void *i2c_mux_priv(struct i2c_mux_core *muxc)
 {
 	return muxc->priv;
 }
 
+struct i2c_adapter *i2c_root_adapter(struct device *dev);
+
 /*
  * Called to create an i2c bus on a multiplexed bus segment.
  * The chan_id parameter is passed to the select and deselect
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 50934d6e1050..96a25ae14494 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -524,6 +524,7 @@ struct i2c_adapter {
 
 	/* data fields that are valid for all devices	*/
 	struct rt_mutex bus_lock;
+	struct rt_mutex mux_lock;
 
 	int timeout;			/* in jiffies */
 	int retries;
-- 
cgit v1.2.3


From 46cc6e4976e3d9058490f20d93bc7805f7f2d81e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 3 May 2016 16:56:03 -0700
Subject: tcp: fix lockdep splat in tcp_snd_una_update()

tcp_snd_una_update() and tcp_rcv_nxt_update() call
u64_stats_update_begin() either from process context or BH handler.

This triggers a lockdep splat on 32bit & SMP builds.

We could add u64_stats_update_begin_bh() variant but this would
slow down 32bit builds with useless local_disable_bh() and
local_enable_bh() pairs, since we own the socket lock at this point.

I add sock_owned_by_me() helper to have proper lockdep support
even on 64bit builds, and new u64_stats_update_begin_raw()
and u64_stats_update_end_raw methods.

Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible")
Reported-by: Fabio Estevam <festevam@gmail.com>
Diagnosed-by: Francois Romieu <romieu@fr.zoreil.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 14 ++++++++++++++
 include/net/sock.h             |  7 ++++++-
 net/ipv4/tcp_input.c           | 10 ++++++----
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index df89c9bcba7d..d3a2bb712af3 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -89,6 +89,20 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 #endif
 }
 
+static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	raw_write_seqcount_begin(&syncp->seq);
+#endif
+}
+
+static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	raw_write_seqcount_end(&syncp->seq);
+#endif
+}
+
 static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/include/net/sock.h b/include/net/sock.h
index 45f5b492c658..c9c8b19df27c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1421,11 +1421,16 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
  * accesses from user process context.
  */
 
-static inline bool sock_owned_by_user(const struct sock *sk)
+static inline void sock_owned_by_me(const struct sock *sk)
 {
 #ifdef CONFIG_LOCKDEP
 	WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
 #endif
+}
+
+static inline bool sock_owned_by_user(const struct sock *sk)
+{
+	sock_owned_by_me(sk);
 	return sk->sk_lock.owned;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6171f92be090..a914e0607895 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3355,9 +3355,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 {
 	u32 delta = ack - tp->snd_una;
 
-	u64_stats_update_begin(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
+	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end(&tp->syncp);
+	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3366,9 +3367,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 {
 	u32 delta = seq - tp->rcv_nxt;
 
-	u64_stats_update_begin(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
+	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_received += delta;
-	u64_stats_update_end(&tp->syncp);
+	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }
 
-- 
cgit v1.2.3


From 4d2458507d0b465c62ae80f3e81b8c008ec96b05 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@nxp.com>
Date: Wed, 4 May 2016 19:33:59 -0300
Subject: ASoC: fsl_sai: Allow setting the SAI MCLK direction

On mx6ul the General Purpose Register 1 (GPR1) contains the following
bits for configuring the direction of the SAI MCLKs:
SAI1_MCLK_DIR, SAI2_MCLK_DIR, SAI3_MCLK_DIR

Introduce  the "fsl,sai-mclk-direction-output" optional property to allow
configuring the SAI_MCLK outputs.

Tested on a imx6ul-evk board.

Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/sound/fsl-sai.txt |  5 +++++
 include/linux/mfd/syscon/imx6q-iomuxc-gpr.h         |  6 ++++++
 sound/soc/fsl/fsl_sai.c                             | 20 ++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/sound/fsl-sai.txt b/Documentation/devicetree/bindings/sound/fsl-sai.txt
index 777b941d6cbe..740b467adf7d 100644
--- a/Documentation/devicetree/bindings/sound/fsl-sai.txt
+++ b/Documentation/devicetree/bindings/sound/fsl-sai.txt
@@ -48,6 +48,11 @@ Required properties:
 			  receive data by following their own bit clocks and
 			  frame sync clocks separately.
 
+Optional properties (for mx6ul):
+
+  - fsl,sai-mclk-direction-output: This is a boolean property. If present,
+			 indicates that SAI will output the SAI MCLK clock.
+
 Note:
 - If both fsl,sai-asynchronous and fsl,sai-synchronous-rx are absent, the
   default synchronous mode (sync Rx with Tx) will be used, which means both
diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
index 238c8db953eb..68353822afce 100644
--- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
+++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h
@@ -447,5 +447,11 @@
 #define IMX6UL_GPR1_ENET2_CLK_OUTPUT		(0x1 << 18)
 #define IMX6UL_GPR1_ENET_CLK_DIR		(0x3 << 17)
 #define IMX6UL_GPR1_ENET_CLK_OUTPUT		(0x3 << 17)
+#define IMX6UL_GPR1_SAI1_MCLK_DIR		(0x1 << 19)
+#define IMX6UL_GPR1_SAI2_MCLK_DIR		(0x1 << 20)
+#define IMX6UL_GPR1_SAI3_MCLK_DIR		(0x1 << 21)
+#define IMX6UL_GPR1_SAI_MCLK_MASK		(0x7 << 19)
+#define MCLK_DIR(x) (x == 1 ? IMX6UL_GPR1_SAI1_MCLK_DIR : x == 2 ? \
+		     IMX6UL_GPR1_SAI2_MCLK_DIR : IMX6UL_GPR1_SAI3_MCLK_DIR)
 
 #endif /* __LINUX_IMX6Q_IOMUXC_GPR_H */
diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index d8b673f7c577..2147994ab46f 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -21,6 +21,8 @@
 #include <sound/core.h>
 #include <sound/dmaengine_pcm.h>
 #include <sound/pcm_params.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
 
 #include "fsl_sai.h"
 #include "imx-pcm.h"
@@ -786,10 +788,12 @@ static int fsl_sai_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct fsl_sai *sai;
+	struct regmap *gpr;
 	struct resource *res;
 	void __iomem *base;
 	char tmp[8];
 	int irq, ret, i;
+	int index;
 
 	sai = devm_kzalloc(&pdev->dev, sizeof(*sai), GFP_KERNEL);
 	if (!sai)
@@ -878,6 +882,22 @@ static int fsl_sai_probe(struct platform_device *pdev)
 		fsl_sai_dai.symmetric_samplebits = 0;
 	}
 
+	if (of_find_property(np, "fsl,sai-mclk-direction-output", NULL) &&
+	    of_device_is_compatible(pdev->dev.of_node, "fsl,imx6ul-sai")) {
+		gpr = syscon_regmap_lookup_by_compatible("fsl,imx6ul-iomuxc-gpr");
+		if (IS_ERR(gpr)) {
+			dev_err(&pdev->dev, "cannot find iomuxc registers\n");
+			return PTR_ERR(gpr);
+		}
+
+		index = of_alias_get_id(np, "sai");
+		if (index < 0)
+			return index;
+
+		regmap_update_bits(gpr, IOMUXC_GPR1, MCLK_DIR(index),
+				   MCLK_DIR(index));
+	}
+
 	sai->dma_params_rx.addr = res->start + FSL_SAI_RDR;
 	sai->dma_params_tx.addr = res->start + FSL_SAI_TDR;
 	sai->dma_params_rx.maxburst = FSL_SAI_MAXBURST_RX;
-- 
cgit v1.2.3


From 0ef5a50c1658d4d96a44f145bcb92ff3310c75b1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 5 May 2016 11:54:22 -0400
Subject: block: make bio_inc_remaining() interface accessible again

Commit 326e1dbb57 ("block: remove management of bi_remaining when
restoring original bi_end_io") made bio_inc_remaining() private to bio.c
because the only use-case that made sense was confined to the
bio_chain() interface.

Since that time DM thinp went on to use bio_chain() in its relatively
complex implementation of async discard support.  That implementation,
even when converted over to use the new async __blkdev_issue_discard()
interface, depends on deferred completion of the original discard bio --
which is most appropriately implemented using bio_inc_remaining().

DM thinp foolishly duplicated bio_inc_remaining(), local to dm-thin.c as
__bio_inc_remaining(), so re-exporting bio_inc_remaining() allows us to
put an end to that foolishness.

All said, bio_inc_remaining() should really only be used in conjunction
with bio_chain().  It isn't intended for generic bio reference counting.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 11 -----------
 include/linux/bio.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 807d25e466ec..0e4aa42bc30d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -311,17 +311,6 @@ static void bio_chain_endio(struct bio *bio)
 	bio_endio(__bio_chain_endio(bio));
 }
 
-/*
- * Increment chain count for the bio. Make sure the CHAIN flag update
- * is visible before the raised count.
- */
-static inline void bio_inc_remaining(struct bio *bio)
-{
-	bio_set_flag(bio, BIO_CHAIN);
-	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_remaining);
-}
-
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6b7481f62218..9faebf7f9a33 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -702,6 +702,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 	return bio;
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio_set_flag(bio, BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
-- 
cgit v1.2.3


From e5b2d30e42e66122c9b1b17529df743bc938c041 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Feb 2016 19:58:11 +0100
Subject: mtd: nand: sharpsl: switch to mtd_ooblayout_ops

Implementing the mtd_ooblayout_ops interface is the new way of exposing
ECC/OOB layout to MTD users.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 arch/arm/mach-pxa/spitz.c   | 55 ++++++++++++++++++++++++++++++++++++---------
 drivers/mtd/nand/sharpsl.c  |  2 +-
 include/linux/mtd/sharpsl.h |  2 +-
 3 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index d9578bc49fdc..bd7cd8b6a286 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -763,14 +763,49 @@ static struct nand_bbt_descr spitz_nand_bbt = {
 	.pattern	= scan_ff_pattern
 };
 
-static struct nand_ecclayout akita_oobinfo = {
-	.oobfree	= { {0x08, 0x09} },
-	.eccbytes	= 24,
-	.eccpos		= {
-			0x05, 0x01, 0x02, 0x03, 0x06, 0x07, 0x15, 0x11,
-			0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23,
-			0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37,
-	},
+static int akita_ooblayout_ecc(struct mtd_info *mtd, int section,
+			       struct mtd_oob_region *oobregion)
+{
+	if (section > 12)
+		return -ERANGE;
+
+	switch (section % 3) {
+	case 0:
+		oobregion->offset = 5;
+		oobregion->length = 1;
+		break;
+
+	case 1:
+		oobregion->offset = 1;
+		oobregion->length = 3;
+		break;
+
+	case 2:
+		oobregion->offset = 6;
+		oobregion->length = 2;
+		break;
+	}
+
+	oobregion->offset += (section / 3) * 0x10;
+
+	return 0;
+}
+
+static int akita_ooblayout_free(struct mtd_info *mtd, int section,
+				struct mtd_oob_region *oobregion)
+{
+	if (section)
+		return -ERANGE;
+
+	oobregion->offset = 8;
+	oobregion->length = 9;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops akita_ooblayout_ops = {
+	.ecc = akita_ooblayout_ecc,
+	.free = akita_ooblayout_free,
 };
 
 static struct sharpsl_nand_platform_data spitz_nand_pdata = {
@@ -804,11 +839,11 @@ static void __init spitz_nand_init(void)
 	} else if (machine_is_akita()) {
 		spitz_nand_partitions[1].size = 58 * 1024 * 1024;
 		spitz_nand_bbt.len = 1;
-		spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+		spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
 	} else if (machine_is_borzoi()) {
 		spitz_nand_partitions[1].size = 32 * 1024 * 1024;
 		spitz_nand_bbt.len = 1;
-		spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+		spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
 	}
 
 	platform_device_register(&spitz_nand_device);
diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c
index b7d1b55a160b..064ca1757589 100644
--- a/drivers/mtd/nand/sharpsl.c
+++ b/drivers/mtd/nand/sharpsl.c
@@ -148,6 +148,7 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
 	/* Link the private data with the MTD structure */
 	mtd = nand_to_mtd(this);
 	mtd->dev.parent = &pdev->dev;
+	mtd_set_ooblayout(mtd, data->ecc_layout);
 
 	platform_set_drvdata(pdev, sharpsl);
 
@@ -170,7 +171,6 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
 	this->ecc.bytes = 3;
 	this->ecc.strength = 1;
 	this->badblock_pattern = data->badblock_pattern;
-	this->ecc.layout = data->ecc_layout;
 	this->ecc.hwctl = sharpsl_nand_enable_hwecc;
 	this->ecc.calculate = sharpsl_nand_calculate_ecc;
 	this->ecc.correct = nand_correct_data;
diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h
index 25f4d2a845c1..65e91d0fa981 100644
--- a/include/linux/mtd/sharpsl.h
+++ b/include/linux/mtd/sharpsl.h
@@ -14,7 +14,7 @@
 
 struct sharpsl_nand_platform_data {
 	struct nand_bbt_descr	*badblock_pattern;
-	struct nand_ecclayout	*ecc_layout;
+	const struct mtd_ooblayout_ops *ecc_layout;
 	struct mtd_partition	*partitions;
 	unsigned int		nr_partitions;
 };
-- 
cgit v1.2.3


From 04a123a99f089773fc6ee6e21af5f831d87fe362 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Tue, 9 Feb 2016 15:01:21 +0100
Subject: mtd: nand: fsmc: get rid of the fsmc_nand_eccplace struct

Now that mtd_ooblayout_ecc() returns the ECC byte position using the
OOB free method, we can get rid of the fsmc_nand_eccplace struct.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/fsmc_nand.c | 60 +++++++++++---------------------------------
 include/linux/mtd/fsmc.h     | 18 -------------
 2 files changed, 15 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c
index 275a98ca4f6a..13720405ec81 100644
--- a/drivers/mtd/nand/fsmc_nand.c
+++ b/drivers/mtd/nand/fsmc_nand.c
@@ -39,35 +39,6 @@
 #include <linux/amba/bus.h>
 #include <mtd/mtd-abi.h>
 
-/*
- * ECC placement definitions in oobfree type format.
- * There are 13 bytes of ecc for every 512 byte block and it has to be read
- * consecutively and immediately after the 512 byte data block for hardware to
- * generate the error bit offsets in 512 byte data.
- * Managing the ecc bytes in the following way makes it easier for software to
- * read ecc bytes consecutive to data bytes. This way is similar to
- * oobfree structure maintained already in generic nand driver
- */
-static struct fsmc_eccplace fsmc_ecc4_lp_place = {
-	.eccplace = {
-		{.offset = 2, .length = 13},
-		{.offset = 18, .length = 13},
-		{.offset = 34, .length = 13},
-		{.offset = 50, .length = 13},
-		{.offset = 66, .length = 13},
-		{.offset = 82, .length = 13},
-		{.offset = 98, .length = 13},
-		{.offset = 114, .length = 13}
-	}
-};
-
-static struct fsmc_eccplace fsmc_ecc4_sp_place = {
-	.eccplace = {
-		{.offset = 0, .length = 4},
-		{.offset = 6, .length = 9}
-	}
-};
-
 static int fsmc_ecc1_ooblayout_ecc(struct mtd_info *mtd, int section,
 				   struct mtd_oob_region *oobregion)
 {
@@ -105,6 +76,12 @@ static const struct mtd_ooblayout_ops fsmc_ecc1_ooblayout_ops = {
 	.free = fsmc_ecc1_ooblayout_free,
 };
 
+/*
+ * ECC placement definitions in oobfree type format.
+ * There are 13 bytes of ecc for every 512 byte block and it has to be read
+ * consecutively and immediately after the 512 byte data block for hardware to
+ * generate the error bit offsets in 512 byte data.
+ */
 static int fsmc_ecc4_ooblayout_ecc(struct mtd_info *mtd, int section,
 				   struct mtd_oob_region *oobregion)
 {
@@ -155,7 +132,6 @@ static const struct mtd_ooblayout_ops fsmc_ecc4_ooblayout_ops = {
  * @partitions:		Partition info for a NAND Flash.
  * @nr_partitions:	Total number of partition of a NAND flash.
  *
- * @ecc_place:		ECC placing locations in oobfree type format.
  * @bank:		Bank number for probed device.
  * @clk:		Clock structure for FSMC.
  *
@@ -175,7 +151,6 @@ struct fsmc_nand_data {
 	struct mtd_partition	*partitions;
 	unsigned int		nr_partitions;
 
-	struct fsmc_eccplace	*ecc_place;
 	unsigned int		bank;
 	struct device		*dev;
 	enum access_mode	mode;
@@ -582,8 +557,6 @@ static void fsmc_write_buf_dma(struct mtd_info *mtd, const uint8_t *buf,
 static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 				 uint8_t *buf, int oob_required, int page)
 {
-	struct fsmc_nand_data *host = mtd_to_fsmc(mtd);
-	struct fsmc_eccplace *ecc_place = host->ecc_place;
 	int i, j, s, stat, eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
 	int eccsteps = chip->ecc.steps;
@@ -606,9 +579,15 @@ static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 		chip->read_buf(mtd, p, eccsize);
 
 		for (j = 0; j < eccbytes;) {
-			off = ecc_place->eccplace[group].offset;
-			len = ecc_place->eccplace[group].length;
-			group++;
+			struct mtd_oob_region oobregion;
+			int ret;
+
+			ret = mtd_ooblayout_ecc(mtd, group++, &oobregion);
+			if (ret)
+				return ret;
+
+			off = oobregion.offset;
+			len = oobregion.length;
 
 			/*
 			 * length is intentionally kept a higher multiple of 2
@@ -956,19 +935,10 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
 	if (AMBA_REV_BITS(host->pid) >= 8) {
 		switch (mtd->oobsize) {
 		case 16:
-			host->ecc_place = &fsmc_ecc4_sp_place;
-			break;
 		case 64:
-			host->ecc_place = &fsmc_ecc4_lp_place;
-			break;
 		case 128:
-			host->ecc_place = &fsmc_ecc4_lp_place;
-			break;
 		case 224:
-			host->ecc_place = &fsmc_ecc4_lp_place;
-			break;
 		case 256:
-			host->ecc_place = &fsmc_ecc4_lp_place;
 			break;
 		default:
 			dev_warn(&pdev->dev, "No oob scheme defined for oobsize %d\n",
diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h
index c8be32e9fc49..ad3c3488073c 100644
--- a/include/linux/mtd/fsmc.h
+++ b/include/linux/mtd/fsmc.h
@@ -103,24 +103,6 @@
 
 #define FSMC_BUSY_WAIT_TIMEOUT	(1 * HZ)
 
-/*
- * There are 13 bytes of ecc for every 512 byte block in FSMC version 8
- * and it has to be read consecutively and immediately after the 512
- * byte data block for hardware to generate the error bit offsets
- * Managing the ecc bytes in the following way is easier. This way is
- * similar to oobfree structure maintained already in u-boot nand driver
- */
-#define MAX_ECCPLACE_ENTRIES	32
-
-struct fsmc_nand_eccplace {
-	uint8_t offset;
-	uint8_t length;
-};
-
-struct fsmc_eccplace {
-	struct fsmc_nand_eccplace eccplace[MAX_ECCPLACE_ENTRIES];
-};
-
 struct fsmc_nand_timings {
 	uint8_t tclr;
 	uint8_t tar;
-- 
cgit v1.2.3


From a411679fb5fd7ee2df64a55c23c81538ceeb6d06 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Mon, 7 Dec 2015 22:46:45 +0100
Subject: mtd: onenand: switch to mtd_ooblayout_ops

Implementing the mtd_ooblayout_ops interface is the new way of exposing
ECC/OOB layout to MTD users. Modify the onenand drivers to switch to this
approach.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/onenand/onenand_base.c | 162 ++++++++++++++++++++++---------------
 include/linux/mtd/onenand.h        |   2 -
 2 files changed, 97 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index d0fa505d40bd..a4b029a417f0 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -68,21 +68,33 @@ MODULE_PARM_DESC(otp,	"Corresponding behaviour of OneNAND in OTP"
  * flexonenand_oob_128 - oob info for Flex-Onenand with 4KB page
  * For now, we expose only 64 out of 80 ecc bytes
  */
-static struct nand_ecclayout flexonenand_oob_128 = {
-	.eccbytes	= 64,
-	.eccpos		= {
-		6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-		22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-		38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-		54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-		70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-		86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-		102, 103, 104, 105
-		},
-	.oobfree	= {
-		{2, 4}, {18, 4}, {34, 4}, {50, 4},
-		{66, 4}, {82, 4}, {98, 4}, {114, 4}
-	}
+static int flexonenand_ooblayout_ecc(struct mtd_info *mtd, int section,
+				     struct mtd_oob_region *oobregion)
+{
+	if (section > 7)
+		return -ERANGE;
+
+	oobregion->offset = (section * 16) + 6;
+	oobregion->length = 10;
+
+	return 0;
+}
+
+static int flexonenand_ooblayout_free(struct mtd_info *mtd, int section,
+				      struct mtd_oob_region *oobregion)
+{
+	if (section > 7)
+		return -ERANGE;
+
+	oobregion->offset = (section * 16) + 2;
+	oobregion->length = 4;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops flexonenand_ooblayout_ops = {
+	.ecc = flexonenand_ooblayout_ecc,
+	.free = flexonenand_ooblayout_free,
 };
 
 /*
@@ -91,56 +103,77 @@ static struct nand_ecclayout flexonenand_oob_128 = {
  * Based on specification:
  * 4Gb M-die OneNAND Flash (KFM4G16Q4M, KFN8G16Q4M). Rev. 1.3, Apr. 2010
  *
- * For eccpos we expose only 64 bytes out of 72 (see struct nand_ecclayout)
- *
- * oobfree uses the spare area fields marked as
- * "Managed by internal ECC logic for Logical Sector Number area"
- */
-static struct nand_ecclayout onenand_oob_128 = {
-	.eccbytes	= 64,
-	.eccpos		= {
-		7, 8, 9, 10, 11, 12, 13, 14, 15,
-		23, 24, 25, 26, 27, 28, 29, 30, 31,
-		39, 40, 41, 42, 43, 44, 45, 46, 47,
-		55, 56, 57, 58, 59, 60, 61, 62, 63,
-		71, 72, 73, 74, 75, 76, 77, 78, 79,
-		87, 88, 89, 90, 91, 92, 93, 94, 95,
-		103, 104, 105, 106, 107, 108, 109, 110, 111,
-		119
-	},
-	.oobfree	= {
-		{2, 3}, {18, 3}, {34, 3}, {50, 3},
-		{66, 3}, {82, 3}, {98, 3}, {114, 3}
-	}
+ */
+static int onenand_ooblayout_128_ecc(struct mtd_info *mtd, int section,
+				     struct mtd_oob_region *oobregion)
+{
+	if (section > 7)
+		return -ERANGE;
+
+	oobregion->offset = (section * 16) + 7;
+	oobregion->length = 9;
+
+	return 0;
+}
+
+static int onenand_ooblayout_128_free(struct mtd_info *mtd, int section,
+				      struct mtd_oob_region *oobregion)
+{
+	if (section >= 8)
+		return -ERANGE;
+
+	/*
+	 * free bytes are using the spare area fields marked as
+	 * "Managed by internal ECC logic for Logical Sector Number area"
+	 */
+	oobregion->offset = (section * 16) + 2;
+	oobregion->length = 3;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops onenand_oob_128_ooblayout_ops = {
+	.ecc = onenand_ooblayout_128_ecc,
+	.free = onenand_ooblayout_128_free,
 };
 
 /**
- * onenand_oob_64 - oob info for large (2KB) page
+ * onenand_oob_32_64 - oob info for large (2KB) page
  */
-static struct nand_ecclayout onenand_oob_64 = {
-	.eccbytes	= 20,
-	.eccpos		= {
-		8, 9, 10, 11, 12,
-		24, 25, 26, 27, 28,
-		40, 41, 42, 43, 44,
-		56, 57, 58, 59, 60,
-		},
-	.oobfree	= {
-		{2, 3}, {14, 2}, {18, 3}, {30, 2},
-		{34, 3}, {46, 2}, {50, 3}, {62, 2}
+static int onenand_ooblayout_32_64_ecc(struct mtd_info *mtd, int section,
+				       struct mtd_oob_region *oobregion)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	oobregion->offset = (section * 16) + 8;
+	oobregion->length = 5;
+
+	return 0;
+}
+
+static int onenand_ooblayout_32_64_free(struct mtd_info *mtd, int section,
+					struct mtd_oob_region *oobregion)
+{
+	int sections = (mtd->oobsize / 32) * 2;
+
+	if (section >= sections)
+		return -ERANGE;
+
+	if (section & 1) {
+		oobregion->offset = ((section - 1) * 16) + 14;
+		oobregion->length = 2;
+	} else  {
+		oobregion->offset = (section * 16) + 2;
+		oobregion->length = 3;
 	}
-};
 
-/**
- * onenand_oob_32 - oob info for middle (1KB) page
- */
-static struct nand_ecclayout onenand_oob_32 = {
-	.eccbytes	= 10,
-	.eccpos		= {
-		8, 9, 10, 11, 12,
-		24, 25, 26, 27, 28,
-		},
-	.oobfree	= { {2, 3}, {14, 2}, {18, 3}, {30, 2} }
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops onenand_oob_32_64_ooblayout_ops = {
+	.ecc = onenand_ooblayout_32_64_ecc,
+	.free = onenand_ooblayout_32_64_free,
 };
 
 static const unsigned char ffchars[] = {
@@ -3957,22 +3990,22 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	switch (mtd->oobsize) {
 	case 128:
 		if (FLEXONENAND(this)) {
-			this->ecclayout = &flexonenand_oob_128;
+			mtd_set_ooblayout(mtd, &flexonenand_ooblayout_ops);
 			mtd->subpage_sft = 0;
 		} else {
-			this->ecclayout = &onenand_oob_128;
+			mtd_set_ooblayout(mtd, &onenand_oob_128_ooblayout_ops);
 			mtd->subpage_sft = 2;
 		}
 		if (ONENAND_IS_NOP_1(this))
 			mtd->subpage_sft = 0;
 		break;
 	case 64:
-		this->ecclayout = &onenand_oob_64;
+		mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
 		mtd->subpage_sft = 2;
 		break;
 
 	case 32:
-		this->ecclayout = &onenand_oob_32;
+		mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
 		mtd->subpage_sft = 1;
 		break;
 
@@ -3981,7 +4014,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 			__func__, mtd->oobsize);
 		mtd->subpage_sft = 0;
 		/* To prevent kernel oops */
-		this->ecclayout = &onenand_oob_32;
+		mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
 		break;
 	}
 
@@ -3997,7 +4030,6 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 
 	mtd->oobavail = ret;
 
-	mtd_set_ecclayout(mtd, this->ecclayout);
 	mtd->ecc_strength = 1;
 
 	/* Fill in remaining MTD driver data */
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 4596503c9da9..0aaa98b219a4 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -80,7 +80,6 @@ struct onenand_bufferram {
  * @page_buf:		[INTERN] page main data buffer
  * @oob_buf:		[INTERN] page oob data buffer
  * @subpagesize:	[INTERN] holds the subpagesize
- * @ecclayout:		[REPLACEABLE] the default ecc placement scheme
  * @bbm:		[REPLACEABLE] pointer to Bad Block Management
  * @priv:		[OPTIONAL] pointer to private chip date
  */
@@ -134,7 +133,6 @@ struct onenand_chip {
 #endif
 
 	int			subpagesize;
-	struct nand_ecclayout	*ecclayout;
 
 	void			*bbm;
 
-- 
cgit v1.2.3


From 7f2b092c9eeda055ae60af194a8edacaea5f7a10 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Feb 2016 19:24:10 +0100
Subject: mtd: nand: kill the ecc->layout field

Now that all NAND drivers have switched to mtd_ooblayout_ops, we can kill
the ecc->layout field.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_base.c | 7 -------
 drivers/mtd/nand/nand_bch.c  | 8 --------
 include/linux/mtd/nand.h     | 2 --
 3 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 6c84a40dbddb..0001be19b90c 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -4151,13 +4151,6 @@ int nand_scan_tail(struct mtd_info *mtd)
 	/* Set the internal oob buffer location, just after the page data */
 	chip->oob_poi = chip->buffers->databuf + mtd->writesize;
 
-	/*
-	 * Set the provided ECC layout. If ecc->layout is NULL, the MTD core
-	 * will just leave mtd->ooblayout to NULL, if it's not NULL, it will
-	 * set ->ooblayout to the default ecclayout wrapper.
-	 */
-	mtd_set_ecclayout(mtd, ecc->layout);
-
 	/*
 	 * If no default placement scheme is given, select an appropriate one.
 	 */
diff --git a/drivers/mtd/nand/nand_bch.c b/drivers/mtd/nand/nand_bch.c
index 1dbc418baac4..28ef7b178eac 100644
--- a/drivers/mtd/nand/nand_bch.c
+++ b/drivers/mtd/nand/nand_bch.c
@@ -158,14 +158,6 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 
 	eccsteps = mtd->writesize/eccsize;
 
-	/*
-	 * Rely on the default ecclayout to ooblayout wrapper provided by MTD
-	 * core if ecc.layout is not NULL.
-	 * FIXME: this should be removed when all callers have moved to the
-	 * mtd_ooblayout_ops approach.
-	 */
-	mtd_set_ecclayout(mtd, nand->ecc.layout);
-
 	/* Check that we have an oob layout description. */
 	if (!mtd->ooblayout) {
 		pr_warn("missing oob scheme");
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index f2ded7b1b3b8..e851839daf09 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -473,7 +473,6 @@ struct nand_hw_control {
  * @prepad:	padding information for syndrome based ECC generators
  * @postpad:	padding information for syndrome based ECC generators
  * @options:	ECC specific options (see NAND_ECC_XXX flags defined above)
- * @layout:	ECC layout control struct pointer
  * @priv:	pointer to private ECC control data
  * @hwctl:	function to control hardware ECC generator. Must only
  *		be provided if an hardware ECC is available
@@ -524,7 +523,6 @@ struct nand_ecc_ctrl {
 	int prepad;
 	int postpad;
 	unsigned int options;
-	struct nand_ecclayout	*layout;
 	void *priv;
 	void (*hwctl)(struct mtd_info *mtd, int mode);
 	int (*calculate)(struct mtd_info *mtd, const uint8_t *dat,
-- 
cgit v1.2.3


From aab616e31d1c7ec3726f7f5cbdaaec98759ebe93 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 4 Feb 2016 10:16:18 +0100
Subject: mtd: kill the nand_ecclayout struct

Now that all MTD drivers have moved to the mtd_ooblayout_ops model we can
safely remove the struct nand_ecclayout definition, and all the remaining
places where it was still used.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/mtdchar.c      |  12 ++---
 drivers/mtd/mtdcore.c      | 117 ---------------------------------------------
 include/linux/mtd/mtd.h    |  20 --------
 include/uapi/mtd/mtd-abi.h |   2 +-
 4 files changed, 7 insertions(+), 144 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 3fad2c7425b0..2a47a3f0e730 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -465,12 +465,12 @@ static int mtdchar_readoob(struct file *file, struct mtd_info *mtd,
 }
 
 /*
- * Copies (and truncates, if necessary) data from the larger struct,
- * nand_ecclayout, to the smaller, deprecated layout struct,
- * nand_ecclayout_user. This is necessary only to support the deprecated
- * API ioctl ECCGETLAYOUT while allowing all new functionality to use
- * nand_ecclayout flexibly (i.e. the struct may change size in new
- * releases without requiring major rewrites).
+ * Copies (and truncates, if necessary) OOB layout information to the
+ * deprecated layout struct, nand_ecclayout_user. This is necessary only to
+ * support the deprecated API ioctl ECCGETLAYOUT while allowing all new
+ * functionality to use mtd_ooblayout_ops flexibly (i.e. mtd_ooblayout_ops
+ * can describe any kind of OOB layout with almost zero overhead from a
+ * memory usage point of view).
  */
 static int shrink_ecclayout(struct mtd_info *mtd,
 			    struct nand_ecclayout_user *to)
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 134ed2f7b919..e3936b847c6b 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1376,123 +1376,6 @@ int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd)
 }
 EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes);
 
-/**
- * mtd_ecclayout_ecc - Default ooblayout_ecc iterator implementation
- * @mtd: MTD device structure
- * @section: ECC section. Depending on the layout you may have all the ECC
- *	     bytes stored in a single contiguous section, or one section
- *	     per ECC chunk (and sometime several sections for a single ECC
- *	     ECC chunk)
- * @oobecc: OOB region struct filled with the appropriate ECC position
- *	    information
- *
- * This function is just a wrapper around the mtd->ecclayout field and is
- * here to ease the transition to the mtd_ooblayout_ops approach.
- * All it does is convert the layout->eccpos information into proper oob
- * region definitions.
- *
- * Returns zero on success, a negative error code otherwise.
- */
-static int mtd_ecclayout_ecc(struct mtd_info *mtd, int section,
-			     struct mtd_oob_region *oobecc)
-{
-	int eccbyte = 0, cursection = 0, length = 0, eccpos = 0;
-
-	if (!mtd->ecclayout)
-		return -ENOTSUPP;
-
-	/*
-	 * This logic allows us to reuse the ->ecclayout information and
-	 * expose them as ECC regions (as done for the OOB free regions).
-	 *
-	 * TODO: this should be dropped as soon as we get rid of the
-	 * ->ecclayout field.
-	 */
-	for (eccbyte = 0; eccbyte < mtd->ecclayout->eccbytes; eccbyte++) {
-		eccpos = mtd->ecclayout->eccpos[eccbyte];
-
-		if (eccbyte < mtd->ecclayout->eccbytes - 1) {
-			int neccpos = mtd->ecclayout->eccpos[eccbyte + 1];
-
-			if (eccpos + 1 == neccpos) {
-				length++;
-				continue;
-			}
-		}
-
-		if (section == cursection)
-			break;
-
-		length = 0;
-		cursection++;
-	}
-
-	if (cursection != section || eccbyte >= mtd->ecclayout->eccbytes)
-		return -ERANGE;
-
-	oobecc->length = length + 1;
-	oobecc->offset = eccpos - length;
-
-	return 0;
-}
-
-/**
- * mtd_ecclayout_ecc - Default ooblayout_free iterator implementation
- * @mtd: MTD device structure
- * @section: Free section. Depending on the layout you may have all the free
- *	     bytes stored in a single contiguous section, or one section
- *	     per ECC chunk (and sometime several sections for a single ECC
- *	     ECC chunk)
- * @oobfree: OOB region struct filled with the appropriate free position
- *	     information
- *
- * This function is just a wrapper around the mtd->ecclayout field and is
- * here to ease the transition to the mtd_ooblayout_ops approach.
- * All it does is convert the layout->oobfree information into proper oob
- * region definitions.
- *
- * Returns zero on success, a negative error code otherwise.
- */
-static int mtd_ecclayout_free(struct mtd_info *mtd, int section,
-			      struct mtd_oob_region *oobfree)
-{
-	struct nand_ecclayout *layout = mtd->ecclayout;
-
-	if (!layout)
-		return -ENOTSUPP;
-
-	if (section >= MTD_MAX_OOBFREE_ENTRIES_LARGE ||
-	    !layout->oobfree[section].length)
-		return -ERANGE;
-
-	oobfree->offset = layout->oobfree[section].offset;
-	oobfree->length = layout->oobfree[section].length;
-
-	return 0;
-}
-
-static const struct mtd_ooblayout_ops mtd_ecclayout_wrapper_ops = {
-	.ecc = mtd_ecclayout_ecc,
-	.free = mtd_ecclayout_free,
-};
-
-/**
- * mtd_set_ecclayout - Attach an ecclayout to an MTD device
- * @mtd: MTD device structure
- * @ecclayout: The ecclayout to attach to the device
- *
- * Returns zero on success, a negative error code otherwise.
- */
-void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout)
-{
-	if (!mtd || !ecclayout)
-		return;
-
-	mtd->ecclayout = ecclayout;
-	mtd_set_ooblayout(mtd, &mtd_ecclayout_wrapper_ops);
-}
-EXPORT_SYMBOL_GPL(mtd_set_ecclayout);
-
 /*
  * Method to access the protection register area, present in some flash
  * devices. The user data is one time programmable but the factory data is read
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 177bf314ad70..29a170612203 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -96,21 +96,6 @@ struct mtd_oob_ops {
 
 #define MTD_MAX_OOBFREE_ENTRIES_LARGE	32
 #define MTD_MAX_ECCPOS_ENTRIES_LARGE	640
-/*
- * Internal ECC layout control structure. For historical reasons, there is a
- * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained
- * for export to user-space via the ECCGETLAYOUT ioctl.
- * nand_ecclayout should be expandable in the future simply by the above macros.
- *
- * This structure is now deprecated, you should use struct nand_ecclayout_ops
- * to describe your OOB layout.
- */
-struct nand_ecclayout {
-	__u32 eccbytes;
-	__u32 eccpos[MTD_MAX_ECCPOS_ENTRIES_LARGE];
-	struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE];
-};
-
 /**
  * struct mtd_oob_region - oob region definition
  * @offset: region offset
@@ -200,9 +185,6 @@ struct mtd_info {
 	const char *name;
 	int index;
 
-	/* [Deprecated] ECC layout structure pointer - read only! */
-	struct nand_ecclayout *ecclayout;
-
 	/* OOB layout description */
 	const struct mtd_ooblayout_ops *ooblayout;
 
@@ -308,8 +290,6 @@ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
 int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
 int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
 
-void mtd_set_ecclayout(struct mtd_info *mtd, struct nand_ecclayout *ecclayout);
-
 static inline void mtd_set_ooblayout(struct mtd_info *mtd,
 				     const struct mtd_ooblayout_ops *ooblayout)
 {
diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h
index 763bb6950402..0ec1da2ef652 100644
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -228,7 +228,7 @@ struct nand_oobfree {
  * complete set of ECC information. The ioctl truncates the larger internal
  * structure to retain binary compatibility with the static declaration of the
  * ioctl. Note that the "MTD_MAX_..._ENTRIES" macros represent the max size of
- * the user struct, not the MAX size of the internal struct nand_ecclayout.
+ * the user struct, not the MAX size of the internal OOB layout representation.
  */
 struct nand_ecclayout_user {
 	__u32 eccbytes;
-- 
cgit v1.2.3


From d48f62b9a0a035d6c16de4a4dae315f7332a8939 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Fri, 1 Apr 2016 14:54:32 +0200
Subject: mtd: nand: move of_get_nand_xxx() helpers into nand_base.c

Now that all drivers go through nand_set_flash_node() to parse the generic
NAND properties, we can move all of_get_nand_xxx() helpers in to
nand_base.c, make them static and remove of_mtd.c and of_mtd.h.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_base.c |  94 +++++++++++++++++++++++++-
 drivers/of/Makefile          |   1 -
 drivers/of/of_mtd.c          | 155 -------------------------------------------
 include/linux/of_mtd.h       |  56 ----------------
 4 files changed, 93 insertions(+), 213 deletions(-)
 delete mode 100644 drivers/of/of_mtd.c
 delete mode 100644 include/linux/of_mtd.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0001be19b90c..9f157bb3faa2 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -45,7 +45,7 @@
 #include <linux/bitops.h>
 #include <linux/io.h>
 #include <linux/mtd/partitions.h>
-#include <linux/of_mtd.h>
+#include <linux/of.h>
 
 static int nand_get_device(struct mtd_info *mtd, int new_state);
 
@@ -3971,6 +3971,98 @@ ident_done:
 	return type;
 }
 
+static const char * const nand_ecc_modes[] = {
+	[NAND_ECC_NONE]		= "none",
+	[NAND_ECC_SOFT]		= "soft",
+	[NAND_ECC_HW]		= "hw",
+	[NAND_ECC_HW_SYNDROME]	= "hw_syndrome",
+	[NAND_ECC_HW_OOB_FIRST]	= "hw_oob_first",
+	[NAND_ECC_SOFT_BCH]	= "soft_bch",
+};
+
+static int of_get_nand_ecc_mode(struct device_node *np)
+{
+	const char *pm;
+	int err, i;
+
+	err = of_property_read_string(np, "nand-ecc-mode", &pm);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
+		if (!strcasecmp(pm, nand_ecc_modes[i]))
+			return i;
+
+	return -ENODEV;
+}
+
+static int of_get_nand_ecc_algo(struct device_node *np)
+{
+	const char *pm;
+	int err;
+
+	/*
+	 * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo.
+	 * It's not implemented yet as currently NAND subsystem ignores
+	 * algorithm explicitly set this way. Once it's handled we should
+	 * document & support new property.
+	 */
+
+	/*
+	 * For backward compatibility we also read "nand-ecc-mode" checking
+	 * for some obsoleted values that were specifying ECC algorithm.
+	 */
+	err = of_property_read_string(np, "nand-ecc-mode", &pm);
+	if (err < 0)
+		return err;
+
+	if (!strcasecmp(pm, "soft"))
+		return NAND_ECC_HAMMING;
+	else if (!strcasecmp(pm, "soft_bch"))
+		return NAND_ECC_BCH;
+
+	return -ENODEV;
+}
+
+static int of_get_nand_ecc_step_size(struct device_node *np)
+{
+	int ret;
+	u32 val;
+
+	ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
+	return ret ? ret : val;
+}
+
+static int of_get_nand_ecc_strength(struct device_node *np)
+{
+	int ret;
+	u32 val;
+
+	ret = of_property_read_u32(np, "nand-ecc-strength", &val);
+	return ret ? ret : val;
+}
+
+static int of_get_nand_bus_width(struct device_node *np)
+{
+	u32 val;
+
+	if (of_property_read_u32(np, "nand-bus-width", &val))
+		return 8;
+
+	switch (val) {
+	case 8:
+	case 16:
+		return val;
+	default:
+		return -EIO;
+	}
+}
+
+static bool of_get_nand_on_flash_bbt(struct device_node *np)
+{
+	return of_property_read_bool(np, "nand-on-flash-bbt");
+}
+
 static int nand_dt_init(struct nand_chip *chip)
 {
 	struct device_node *dn = nand_get_flash_node(chip);
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 156c072b3117..e31bdc123c6e 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_OF_UNITTEST) += unittest.o
 obj-$(CONFIG_OF_MDIO)	+= of_mdio.o
 obj-$(CONFIG_OF_PCI)	+= of_pci.o
 obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
-obj-$(CONFIG_OF_MTD)	+= of_mtd.o
 obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
deleted file mode 100644
index 15d056e181d2..000000000000
--- a/drivers/of/of_mtd.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- *
- */
-#include <linux/kernel.h>
-#include <linux/of_mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/export.h>
-
-/**
- * It maps 'enum nand_ecc_modes_t' found in include/linux/mtd/nand.h
- * into the device tree binding of 'nand-ecc', so that MTD
- * device driver can get nand ecc from device tree.
- */
-static const char *nand_ecc_modes[] = {
-	[NAND_ECC_NONE]		= "none",
-	[NAND_ECC_SOFT]		= "soft",
-	[NAND_ECC_HW]		= "hw",
-	[NAND_ECC_HW_SYNDROME]	= "hw_syndrome",
-	[NAND_ECC_HW_OOB_FIRST]	= "hw_oob_first",
-	[NAND_ECC_SOFT_BCH]	= "soft_bch",
-};
-
-/**
- * of_get_nand_ecc_mode - Get nand ecc mode for given device_node
- * @np:	Pointer to the given device_node
- *
- * The function gets ecc mode string from property 'nand-ecc-mode',
- * and return its index in nand_ecc_modes table, or errno in error case.
- */
-int of_get_nand_ecc_mode(struct device_node *np)
-{
-	const char *pm;
-	int err, i;
-
-	err = of_property_read_string(np, "nand-ecc-mode", &pm);
-	if (err < 0)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
-		if (!strcasecmp(pm, nand_ecc_modes[i]))
-			return i;
-
-	return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode);
-
-/**
- * of_get_nand_ecc_algo - Get nand ecc algorithm for given device_node
- * @np:	Pointer to the given device_node
- *
- * The function gets ecc algorithm and returns its enum value, or errno in error
- * case.
- */
-int of_get_nand_ecc_algo(struct device_node *np)
-{
-	const char *pm;
-	int err;
-
-	/*
-	 * TODO: Read ECC algo OF property and map it to enum nand_ecc_algo.
-	 * It's not implemented yet as currently NAND subsystem ignores
-	 * algorithm explicitly set this way. Once it's handled we should
-	 * document & support new property.
-	 */
-
-	/*
-	 * For backward compatibility we also read "nand-ecc-mode" checking
-	 * for some obsoleted values that were specifying ECC algorithm.
-	 */
-	err = of_property_read_string(np, "nand-ecc-mode", &pm);
-	if (err < 0)
-		return err;
-
-	if (!strcasecmp(pm, "soft"))
-		return NAND_ECC_HAMMING;
-	else if (!strcasecmp(pm, "soft_bch"))
-		return NAND_ECC_BCH;
-
-	return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_algo);
-
-/**
- * of_get_nand_ecc_step_size - Get ECC step size associated to
- * the required ECC strength (see below).
- * @np:	Pointer to the given device_node
- *
- * return the ECC step size, or errno in error case.
- */
-int of_get_nand_ecc_step_size(struct device_node *np)
-{
-	int ret;
-	u32 val;
-
-	ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
-	return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_step_size);
-
-/**
- * of_get_nand_ecc_strength - Get required ECC strength over the
- * correspnding step size as defined by 'nand-ecc-size'
- * @np:	Pointer to the given device_node
- *
- * return the ECC strength, or errno in error case.
- */
-int of_get_nand_ecc_strength(struct device_node *np)
-{
-	int ret;
-	u32 val;
-
-	ret = of_property_read_u32(np, "nand-ecc-strength", &val);
-	return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_strength);
-
-/**
- * of_get_nand_bus_width - Get nand bus witdh for given device_node
- * @np:	Pointer to the given device_node
- *
- * return bus width option, or errno in error case.
- */
-int of_get_nand_bus_width(struct device_node *np)
-{
-	u32 val;
-
-	if (of_property_read_u32(np, "nand-bus-width", &val))
-		return 8;
-
-	switch(val) {
-	case 8:
-	case 16:
-		return val;
-	default:
-		return -EIO;
-	}
-}
-EXPORT_SYMBOL_GPL(of_get_nand_bus_width);
-
-/**
- * of_get_nand_on_flash_bbt - Get nand on flash bbt for given device_node
- * @np:	Pointer to the given device_node
- *
- * return true if present false other wise
- */
-bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-	return of_property_read_bool(np, "nand-on-flash-bbt");
-}
-EXPORT_SYMBOL_GPL(of_get_nand_on_flash_bbt);
diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h
deleted file mode 100644
index 0f6aca5c6f2f..000000000000
--- a/include/linux/of_mtd.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- */
-
-#ifndef __LINUX_OF_MTD_H
-#define __LINUX_OF_MTD_H
-
-#ifdef CONFIG_OF_MTD
-
-#include <linux/of.h>
-int of_get_nand_ecc_mode(struct device_node *np);
-int of_get_nand_ecc_algo(struct device_node *np);
-int of_get_nand_ecc_step_size(struct device_node *np);
-int of_get_nand_ecc_strength(struct device_node *np);
-int of_get_nand_bus_width(struct device_node *np);
-bool of_get_nand_on_flash_bbt(struct device_node *np);
-
-#else /* CONFIG_OF_MTD */
-
-static inline int of_get_nand_ecc_mode(struct device_node *np)
-{
-	return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_algo(struct device_node *np)
-{
-	return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_step_size(struct device_node *np)
-{
-	return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_strength(struct device_node *np)
-{
-	return -ENOSYS;
-}
-
-static inline int of_get_nand_bus_width(struct device_node *np)
-{
-	return -ENOSYS;
-}
-
-static inline bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-	return false;
-}
-
-#endif /* CONFIG_OF_MTD */
-
-#endif /* __LINUX_OF_MTD_H */
-- 
cgit v1.2.3


From e4225ae8234cf5548c38dc887b233ad1d45b4d53 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sun, 17 Apr 2016 22:53:07 +0200
Subject: mtd: mtd: drop NAND_ECC_SOFT_BCH enum value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This value should not be part of nand_ecc_modes_t as it specifies
algorithm not a mode. We successfully managed to introduce new "algo"
field which is respected now.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/fsmc_nand.c   |  3 +--
 drivers/mtd/nand/jz4780_nand.c |  1 -
 drivers/mtd/nand/mxc_nand.c    |  1 -
 drivers/mtd/nand/nand_base.c   | 11 +++--------
 drivers/mtd/nand/nandsim.c     |  2 +-
 drivers/mtd/nand/sunxi_nand.c  |  2 --
 include/linux/mtd/nand.h       |  1 -
 7 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c
index 0f8c63f85f86..d4f454a4b35e 100644
--- a/drivers/mtd/nand/fsmc_nand.c
+++ b/drivers/mtd/nand/fsmc_nand.c
@@ -959,7 +959,6 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
 			break;
 
 		case NAND_ECC_SOFT:
-		case NAND_ECC_SOFT_BCH:
 			if (nand->ecc.algo == NAND_ECC_BCH) {
 				dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n");
 				break;
@@ -974,7 +973,7 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
 		 * Don't set layout for BCH4 SW ECC. This will be
 		 * generated later in nand_bch_init() later.
 		 */
-		if (nand->ecc.mode != NAND_ECC_SOFT_BCH) {
+		if (nand->ecc.mode == NAND_ECC_HW) {
 			switch (mtd->oobsize) {
 			case 16:
 			case 64:
diff --git a/drivers/mtd/nand/jz4780_nand.c b/drivers/mtd/nand/jz4780_nand.c
index 10f249ac148c..daf3c4217f4d 100644
--- a/drivers/mtd/nand/jz4780_nand.c
+++ b/drivers/mtd/nand/jz4780_nand.c
@@ -179,7 +179,6 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
 		chip->ecc.correct = jz4780_nand_ecc_correct;
 		/* fall through */
 	case NAND_ECC_SOFT:
-	case NAND_ECC_SOFT_BCH:
 		dev_info(dev, "using %s (strength %d, size %d, bytes %d)\n",
 			(nfc->bch) ? "hardware BCH" : "software ECC",
 			chip->ecc.strength, chip->ecc.size, chip->ecc.bytes);
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
index aa0ba4b99395..5173fadc9a4e 100644
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -1625,7 +1625,6 @@ static int mxcnd_probe(struct platform_device *pdev)
 		break;
 
 	case NAND_ECC_SOFT:
-	case NAND_ECC_SOFT_BCH:
 		break;
 
 	default:
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 1d74b802aa0a..c9d6230eab08 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -4183,8 +4183,7 @@ static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
 	struct nand_chip *chip = mtd_to_nand(mtd);
 	struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-	if (WARN_ON(ecc->mode != NAND_ECC_SOFT &&
-		    ecc->mode != NAND_ECC_SOFT_BCH))
+	if (WARN_ON(ecc->mode != NAND_ECC_SOFT))
 		return -EINVAL;
 
 	switch (ecc->algo) {
@@ -4331,8 +4330,7 @@ int nand_scan_tail(struct mtd_info *mtd)
 	 * If no default placement scheme is given, select an appropriate one.
 	 */
 	if (!mtd->ooblayout &&
-	    !((ecc->mode == NAND_ECC_SOFT || ecc->mode == NAND_ECC_SOFT_BCH) &&
-	       ecc->algo == NAND_ECC_BCH)) {
+	    !(ecc->mode == NAND_ECC_SOFT && ecc->algo == NAND_ECC_BCH)) {
 		switch (mtd->oobsize) {
 		case 8:
 		case 16:
@@ -4426,7 +4424,6 @@ int nand_scan_tail(struct mtd_info *mtd)
 		ecc->algo = NAND_ECC_HAMMING;
 
 	case NAND_ECC_SOFT:
-	case NAND_ECC_SOFT_BCH:
 		ret = nand_set_ecc_soft_ops(mtd);
 		if (ret) {
 			ret = -EINVAL;
@@ -4514,7 +4511,6 @@ int nand_scan_tail(struct mtd_info *mtd)
 	/* Large page NAND with SOFT_ECC should support subpage reads */
 	switch (ecc->mode) {
 	case NAND_ECC_SOFT:
-	case NAND_ECC_SOFT_BCH:
 		if (chip->page_shift > 9)
 			chip->options |= NAND_SUBPAGE_READ;
 		break;
@@ -4614,8 +4610,7 @@ void nand_release(struct mtd_info *mtd)
 {
 	struct nand_chip *chip = mtd_to_nand(mtd);
 
-	if ((chip->ecc.mode == NAND_ECC_SOFT ||
-	     chip->ecc.mode == NAND_ECC_SOFT_BCH) &&
+	if (chip->ecc.mode == NAND_ECC_SOFT &&
 	    chip->ecc.algo == NAND_ECC_BCH)
 		nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
 
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 66eebb9e77f3..794745dff7f1 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -2339,7 +2339,7 @@ static int __init ns_init_module(void)
 			retval = -EINVAL;
 			goto error;
 		}
-		chip->ecc.mode = NAND_ECC_SOFT_BCH;
+		chip->ecc.mode = NAND_ECC_SOFT;
 		chip->ecc.algo = NAND_ECC_BCH;
 		chip->ecc.size = 512;
 		chip->ecc.strength = bch;
diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c
index 1baf89836210..a83a690688b4 100644
--- a/drivers/mtd/nand/sunxi_nand.c
+++ b/drivers/mtd/nand/sunxi_nand.c
@@ -1612,8 +1612,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc,
 		return -EINVAL;
 
 	switch (ecc->mode) {
-	case NAND_ECC_SOFT_BCH:
-		break;
 	case NAND_ECC_HW:
 		ret = sunxi_nand_hw_ecc_ctrl_init(mtd, ecc, np);
 		if (ret)
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index e851839daf09..fbe8e164a4ee 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -116,7 +116,6 @@ typedef enum {
 	NAND_ECC_HW,
 	NAND_ECC_HW_SYNDROME,
 	NAND_ECC_HW_OOB_FIRST,
-	NAND_ECC_SOFT_BCH,
 } nand_ecc_modes_t;
 
 enum nand_ecc_algo {
-- 
cgit v1.2.3


From 466c3fb618b8520b75be37fcb115e9610663b945 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Thu, 5 May 2016 22:35:54 -0400
Subject: jbd2: remove excess descriptions for handle_s

Commit bf6993276f74 ("jbd2: Use tracepoints for history file")
removed the members j_history, j_history_max and j_history_cur from struct
handle_s but the descriptions stayed lingering. Removing them.

Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/jbd2.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 39511484ad10..efb232c5f668 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -789,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *	number that will fit in j_blocksize
  * @j_last_sync_writer: most recent pid which did a synchronous write
- * @j_history: Buffer storing the transactions statistics history
- * @j_history_max: Maximum number of transactions in the statistics history
- * @j_history_cur: Current number of transactions in the statistics history
  * @j_history_lock: Protect the transactions statistics history
  * @j_proc_entry: procfs entry for the jbd statistics directory
  * @j_stats: Overall statistics
-- 
cgit v1.2.3


From 73898db0430125606c86c798c0627aefef9af9ed Mon Sep 17 00:00:00 2001
From: Haggai Abramovsky <hagaya@mellanox.com>
Date: Wed, 4 May 2016 14:50:15 +0300
Subject: net/mlx4: Avoid wrong virtual mappings

The dma_alloc_coherent() function returns a virtual address which can
be used for coherent access to the underlying memory.  On some
architectures, like arm64, undefined behavior results if this memory is
also accessed via virtual mappings that are not coherent.  Because of
their undefined nature, operations like virt_to_page() return garbage
when passed virtual addresses obtained from dma_alloc_coherent().  Any
subsequent mappings via vmap() of the garbage page values are unusable
and result in bad things like bus errors (synchronous aborts in ARM64
speak).

The mlx4 driver contains code that does the equivalent of:
vmap(virt_to_page(dma_alloc_coherent)), this results in an OOPs when the
device is opened.

Prevent Ethernet driver to run this problematic code by forcing it to
allocate contiguous memory. As for the Infiniband driver, at first we
are trying to allocate contiguous memory, but in case of failure roll
back to work with fragmented memory.

Signed-off-by: Haggai Abramovsky <hagaya@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reported-by: David Daney <david.daney@cavium.com>
Tested-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/qp.c                   | 27 +++++--
 drivers/net/ethernet/mellanox/mlx4/alloc.c        | 93 ++++++++++-------------
 drivers/net/ethernet/mellanox/mlx4/en_cq.c        |  9 +--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_resources.c | 31 --------
 drivers/net/ethernet/mellanox/mlx4/en_rx.c        | 11 +--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c        | 14 +---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h      |  2 -
 include/linux/mlx4/device.h                       |  4 +-
 9 files changed, 68 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index fd97534762b8..81b0e1fbec1d 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -419,7 +419,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 }
 
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp,
+			      bool shrink_wqe)
 {
 	int s;
 
@@ -477,7 +478,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	 * We set WQE size to at least 64 bytes, this way stamping
 	 * invalidates each WQE.
 	 */
-	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+	if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
 	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
 	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
 	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
@@ -642,6 +643,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 {
 	int qpn;
 	int err;
+	struct ib_qp_cap backup_cap;
 	struct mlx4_ib_sqp *sqp;
 	struct mlx4_ib_qp *qp;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -775,7 +777,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 				goto err;
 		}
 
-		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+		memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap));
+		err = set_kernel_sq_size(dev, &init_attr->cap,
+					 qp_type, qp, true);
 		if (err)
 			goto err;
 
@@ -787,9 +791,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			*qp->db.db = 0;
 		}
 
-		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
-			err = -ENOMEM;
-			goto err_db;
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size,
+				   &qp->buf, gfp)) {
+			memcpy(&init_attr->cap, &backup_cap,
+			       sizeof(backup_cap));
+			err = set_kernel_sq_size(dev, &init_attr->cap, qp_type,
+						 qp, false);
+			if (err)
+				goto err_db;
+
+			if (mlx4_buf_alloc(dev->dev, qp->buf_size,
+					   PAGE_SIZE * 2, &qp->buf, gfp)) {
+				err = -ENOMEM;
+				goto err_db;
+			}
 		}
 
 		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index 0c51c69f802f..249a4584401a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -576,41 +576,48 @@ out:
 
 	return res;
 }
-/*
- * Handling for queue buffers -- we allocate a bunch of memory and
- * register it in a memory region at HCA virtual address 0.  If the
- * requested size is > max_direct, we split the allocation into
- * multiple pages, so we don't require too much contiguous memory.
- */
 
-int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
-		   struct mlx4_buf *buf, gfp_t gfp)
+static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size,
+				 struct mlx4_buf *buf, gfp_t gfp)
 {
 	dma_addr_t t;
 
-	if (size <= max_direct) {
-		buf->nbufs        = 1;
-		buf->npages       = 1;
-		buf->page_shift   = get_order(size) + PAGE_SHIFT;
-		buf->direct.buf   = dma_alloc_coherent(&dev->persist->pdev->dev,
-						       size, &t, gfp);
-		if (!buf->direct.buf)
-			return -ENOMEM;
+	buf->nbufs        = 1;
+	buf->npages       = 1;
+	buf->page_shift   = get_order(size) + PAGE_SHIFT;
+	buf->direct.buf   =
+		dma_zalloc_coherent(&dev->persist->pdev->dev,
+				    size, &t, gfp);
+	if (!buf->direct.buf)
+		return -ENOMEM;
 
-		buf->direct.map = t;
+	buf->direct.map = t;
 
-		while (t & ((1 << buf->page_shift) - 1)) {
-			--buf->page_shift;
-			buf->npages *= 2;
-		}
+	while (t & ((1 << buf->page_shift) - 1)) {
+		--buf->page_shift;
+		buf->npages *= 2;
+	}
 
-		memset(buf->direct.buf, 0, size);
+	return 0;
+}
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ *  requested size is > max_direct, we split the allocation into
+ *  multiple pages, so we don't require too much contiguous memory.
+ */
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf, gfp_t gfp)
+{
+	if (size <= max_direct) {
+		return mlx4_buf_direct_alloc(dev, size, buf, gfp);
 	} else {
+		dma_addr_t t;
 		int i;
 
-		buf->direct.buf  = NULL;
-		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-		buf->npages      = buf->nbufs;
+		buf->direct.buf = NULL;
+		buf->nbufs	= (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages	= buf->nbufs;
 		buf->page_shift  = PAGE_SHIFT;
 		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
 					   gfp);
@@ -619,28 +626,12 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 
 		for (i = 0; i < buf->nbufs; ++i) {
 			buf->page_list[i].buf =
-				dma_alloc_coherent(&dev->persist->pdev->dev,
-						   PAGE_SIZE,
-						   &t, gfp);
+				dma_zalloc_coherent(&dev->persist->pdev->dev,
+						    PAGE_SIZE, &t, gfp);
 			if (!buf->page_list[i].buf)
 				goto err_free;
 
 			buf->page_list[i].map = t;
-
-			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
-		}
-
-		if (BITS_PER_LONG == 64) {
-			struct page **pages;
-			pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
-			if (!pages)
-				goto err_free;
-			for (i = 0; i < buf->nbufs; ++i)
-				pages[i] = virt_to_page(buf->page_list[i].buf);
-			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
-			kfree(pages);
-			if (!buf->direct.buf)
-				goto err_free;
 		}
 	}
 
@@ -655,15 +646,11 @@ EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
 
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
 {
-	int i;
-
-	if (buf->nbufs == 1)
+	if (buf->nbufs == 1) {
 		dma_free_coherent(&dev->persist->pdev->dev, size,
-				  buf->direct.buf,
-				  buf->direct.map);
-	else {
-		if (BITS_PER_LONG == 64)
-			vunmap(buf->direct.buf);
+				  buf->direct.buf, buf->direct.map);
+	} else {
+		int i;
 
 		for (i = 0; i < buf->nbufs; ++i)
 			if (buf->page_list[i].buf)
@@ -789,7 +776,7 @@ void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
 EXPORT_SYMBOL_GPL(mlx4_db_free);
 
 int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
-		       int size, int max_direct)
+		       int size)
 {
 	int err;
 
@@ -799,7 +786,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
 
 	*wqres->db.db = 0;
 
-	err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL);
+	err = mlx4_buf_direct_alloc(dev, size, &wqres->buf, GFP_KERNEL);
 	if (err)
 		goto err_db;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index af975a2b74c6..132cea655920 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -73,22 +73,16 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv,
 	 */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
 	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
-				cq->buf_size, 2 * PAGE_SIZE);
+				cq->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err)
 		goto err_cq;
 
-	err = mlx4_en_map_buffer(&cq->wqres.buf);
-	if (err)
-		goto err_res;
-
 	cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf;
 	*pcq = cq;
 
 	return 0;
 
-err_res:
-	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
 err_cq:
 	kfree(cq);
 	*pcq = NULL;
@@ -177,7 +171,6 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_cq *cq = *pcq;
 
-	mlx4_en_unmap_buffer(&cq->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
 	if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) &&
 	    cq->is_tx == RX)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6f28ac58251c..92e0624f4cf0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2928,7 +2928,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 
 	/* Allocate page for receive rings */
 	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
-				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+				MLX4_EN_PAGE_SIZE);
 	if (err) {
 		en_err(priv, "Failed to allocate page for rx qps\n");
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
index 02e925d6f734..a6b0db0e0383 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -107,37 +107,6 @@ int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
 	return ret;
 }
 
-int mlx4_en_map_buffer(struct mlx4_buf *buf)
-{
-	struct page **pages;
-	int i;
-
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
-		return 0;
-
-	pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	for (i = 0; i < buf->nbufs; ++i)
-		pages[i] = virt_to_page(buf->page_list[i].buf);
-
-	buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
-	kfree(pages);
-	if (!buf->direct.buf)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
-{
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
-		return;
-
-	vunmap(buf->direct.buf);
-}
-
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
 {
     return;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b723e3bcab39..8ef6875b6cf9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -394,17 +394,11 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 
 	/* Allocate HW buffers on provided NUMA node */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
-	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
-				 ring->buf_size, 2 * PAGE_SIZE);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err)
 		goto err_info;
 
-	err = mlx4_en_map_buffer(&ring->wqres.buf);
-	if (err) {
-		en_err(priv, "Failed to map RX buffer\n");
-		goto err_hwq;
-	}
 	ring->buf = ring->wqres.buf.direct.buf;
 
 	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;
@@ -412,8 +406,6 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 	*pring = ring;
 	return 0;
 
-err_hwq:
-	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 err_info:
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
@@ -517,7 +509,6 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring = *pring;
 
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 0f206a95429c..f6e61570cb2c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -94,20 +94,13 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
 	/* Allocate HW buffers on provided NUMA node */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
-	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
-				 2 * PAGE_SIZE);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err) {
 		en_err(priv, "Failed allocating hwq resources\n");
 		goto err_bounce;
 	}
 
-	err = mlx4_en_map_buffer(&ring->wqres.buf);
-	if (err) {
-		en_err(priv, "Failed to map TX buffer\n");
-		goto err_hwq_res;
-	}
-
 	ring->buf = ring->wqres.buf.direct.buf;
 
 	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
@@ -118,7 +111,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 				    MLX4_RESERVE_ETH_BF_QP);
 	if (err) {
 		en_err(priv, "failed reserving qp for TX ring\n");
-		goto err_map;
+		goto err_hwq_res;
 	}
 
 	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
@@ -155,8 +148,6 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
 err_reserve:
 	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
-err_map:
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 err_bounce:
@@ -183,7 +174,6 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 	mlx4_qp_remove(mdev->dev, &ring->qp);
 	mlx4_qp_free(mdev->dev, &ring->qp);
 	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
-	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 63b1aeae2c03..cc84e09f324a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -672,8 +672,6 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 		int is_tx, int rss, int qpn, int cqn, int user_prio,
 		struct mlx4_qp_context *context);
 void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
-int mlx4_en_map_buffer(struct mlx4_buf *buf);
-void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
 int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
 			    int loopback);
 void mlx4_en_calc_rx_buf(struct net_device *dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index d1f904c8b2cb..80dec87a94f8 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1058,7 +1058,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
 static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
 {
-	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+	if (buf->nbufs == 1)
 		return buf->direct.buf + offset;
 	else
 		return buf->page_list[offset >> PAGE_SHIFT].buf +
@@ -1098,7 +1098,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order,
 void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db);
 
 int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
-		       int size, int max_direct);
+		       int size);
 void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
 		       int size);
 
-- 
cgit v1.2.3


From 6f3ffc19157a14b182d9d0c449cd613cef421fe1 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Tue, 12 Jan 2016 18:17:19 +0100
Subject: timer: add setup_deferrable_timer macro

Add the trivial missing macro to setup a deferrable timer.

Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 61aa61dc410c..20ac746f3eb3 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -145,6 +145,8 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 
 #define setup_timer(timer, fn, data)					\
 	__setup_timer((timer), (fn), (data), 0)
+#define setup_deferrable_timer(timer, fn, data)				\
+	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE)
 #define setup_timer_on_stack(timer, fn, data)				\
 	__setup_timer_on_stack((timer), (fn), (data), 0)
 #define setup_deferrable_timer_on_stack(timer, fn, data)		\
-- 
cgit v1.2.3


From bc3c57c13256c748a4e71d2ea7481d0c953e88e1 Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Wed, 4 May 2016 17:01:36 -0500
Subject: rpmsg: add THIS_MODULE to rpmsg_driver in rpmsg core

Add register_rpmsg_driver helper macro that adds THIS_MODULE to
rpmsg_driver for the registering driver. We rename and modify
the existing register_rpmsg_driver to enable this.

Signed-off-by: Andrew F. Davis <afd@ti.com>
Acked-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/rpmsg/virtio_rpmsg_bus.c | 8 +++++---
 include/linux/rpmsg.h            | 6 +++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 1fcd27c1f183..fe03b2aef450 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -436,17 +436,19 @@ static struct bus_type rpmsg_bus = {
 };
 
 /**
- * register_rpmsg_driver() - register an rpmsg driver with the rpmsg bus
+ * __register_rpmsg_driver() - register an rpmsg driver with the rpmsg bus
  * @rpdrv: pointer to a struct rpmsg_driver
+ * @owner: owning module/driver
  *
  * Returns 0 on success, and an appropriate error value on failure.
  */
-int register_rpmsg_driver(struct rpmsg_driver *rpdrv)
+int __register_rpmsg_driver(struct rpmsg_driver *rpdrv, struct module *owner)
 {
 	rpdrv->drv.bus = &rpmsg_bus;
+	rpdrv->drv.owner = owner;
 	return driver_register(&rpdrv->drv);
 }
-EXPORT_SYMBOL(register_rpmsg_driver);
+EXPORT_SYMBOL(__register_rpmsg_driver);
 
 /**
  * unregister_rpmsg_driver() - unregister an rpmsg driver from the rpmsg bus
diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 82a673905edb..7ff5790c185a 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -169,7 +169,7 @@ struct rpmsg_driver {
 
 int register_rpmsg_device(struct rpmsg_channel *dev);
 void unregister_rpmsg_device(struct rpmsg_channel *dev);
-int register_rpmsg_driver(struct rpmsg_driver *drv);
+int __register_rpmsg_driver(struct rpmsg_driver *drv, struct module *owner);
 void unregister_rpmsg_driver(struct rpmsg_driver *drv);
 void rpmsg_destroy_ept(struct rpmsg_endpoint *);
 struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_channel *,
@@ -177,6 +177,10 @@ struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_channel *,
 int
 rpmsg_send_offchannel_raw(struct rpmsg_channel *, u32, u32, void *, int, bool);
 
+/* use a macro to avoid include chaining to get THIS_MODULE */
+#define register_rpmsg_driver(drv) \
+	__register_rpmsg_driver(drv, THIS_MODULE)
+
 /**
  * rpmsg_send() - send a message across to the remote processor
  * @rpdev: the rpmsg channel
-- 
cgit v1.2.3


From f3d9f1ce079370d19b34fc9927b8b1355ac98503 Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Wed, 4 May 2016 17:01:38 -0500
Subject: rpmsg: add helper macro module_rpmsg_driver

This patch introduces the module_rpmsg_driver macro which is a
convenience macro for rpmsg driver modules similar to
module_platform_driver. It is intended to be used by drivers which
init/exit section does nothing but register/unregister the rpmsg driver.
By using this macro it is possible to eliminate a few lines of
boilerplate code per rpmsg driver.

Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 7ff5790c185a..ada50ff36da0 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -181,6 +181,18 @@ rpmsg_send_offchannel_raw(struct rpmsg_channel *, u32, u32, void *, int, bool);
 #define register_rpmsg_driver(drv) \
 	__register_rpmsg_driver(drv, THIS_MODULE)
 
+/**
+ * module_rpmsg_driver() - Helper macro for registering an rpmsg driver
+ * @__rpmsg_driver: rpmsg_driver struct
+ *
+ * Helper macro for rpmsg drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate.  Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_rpmsg_driver(__rpmsg_driver) \
+	module_driver(__rpmsg_driver, register_rpmsg_driver, \
+			unregister_rpmsg_driver)
+
 /**
  * rpmsg_send() - send a message across to the remote processor
  * @rpdev: the rpmsg channel
-- 
cgit v1.2.3


From 1145e6351a9fefe0965df4c6dba2a04156dc47d2 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:02:56 +0200
Subject: lightnvm: implement nvm_submit_ppa_list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nvm_submit_ppa function assumes that users manage all plane
blocks as a single block. Extend the API with nvm_submit_ppa_list
to allow the user to send its own ppa list. If the user submits more
than a single PPA, the user must take care to allocate and free
the corresponding ppa list.

Reviewed by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 88 +++++++++++++++++++++++++++++++++++++-----------
 include/linux/lightnvm.h |  2 ++
 2 files changed, 71 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index c2ef53a0d7f8..f4e04a505859 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -322,11 +322,10 @@ static void nvm_end_io_sync(struct nvm_rq *rqd)
 	complete(waiting);
 }
 
-int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
-				int opcode, int flags, void *buf, int len)
+int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
+						int flags, void *buf, int len)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
-	struct nvm_rq rqd;
 	struct bio *bio;
 	int ret;
 	unsigned long hang_check;
@@ -335,24 +334,17 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
 	if (IS_ERR_OR_NULL(bio))
 		return -ENOMEM;
 
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas);
-	if (ret) {
-		bio_put(bio);
-		return ret;
-	}
+	nvm_generic_to_addr_mode(dev, rqd);
 
-	rqd.opcode = opcode;
-	rqd.bio = bio;
-	rqd.wait = &wait;
-	rqd.dev = dev;
-	rqd.end_io = nvm_end_io_sync;
-	rqd.flags = flags;
-	nvm_generic_to_addr_mode(dev, &rqd);
+	rqd->dev = dev;
+	rqd->opcode = opcode;
+	rqd->flags = flags;
+	rqd->bio = bio;
+	rqd->wait = &wait;
+	rqd->end_io = nvm_end_io_sync;
 
-	ret = dev->ops->submit_io(dev, &rqd);
+	ret = dev->ops->submit_io(dev, rqd);
 	if (ret) {
-		nvm_free_rqd_ppalist(dev, &rqd);
 		bio_put(bio);
 		return ret;
 	}
@@ -364,9 +356,67 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
 	else
 		wait_for_completion_io(&wait);
 
+	return rqd->error;
+}
+
+/**
+ * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must
+ *			 take to free ppa list if necessary.
+ * @dev:	device
+ * @ppa_list:	user created ppa_list
+ * @nr_ppas:	length of ppa_list
+ * @opcode:	device opcode
+ * @flags:	device flags
+ * @buf:	data buffer
+ * @len:	data buffer length
+ */
+int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list,
+			int nr_ppas, int opcode, int flags, void *buf, int len)
+{
+	struct nvm_rq rqd;
+
+	if (dev->ops->max_phys_sect < nr_ppas)
+		return -EINVAL;
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	rqd.nr_pages = nr_ppas;
+	if (nr_ppas > 1)
+		rqd.ppa_list = ppa_list;
+	else
+		rqd.ppa_addr = ppa_list[0];
+
+	return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
+}
+EXPORT_SYMBOL(nvm_submit_ppa_list);
+
+/**
+ * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded
+ *		    as single, dual, quad plane PPAs depending on device type.
+ * @dev:	device
+ * @ppa:	user created ppa_list
+ * @nr_ppas:	length of ppa_list
+ * @opcode:	device opcode
+ * @flags:	device flags
+ * @buf:	data buffer
+ * @len:	data buffer length
+ */
+int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
+				int opcode, int flags, void *buf, int len)
+{
+	struct nvm_rq rqd;
+	int ret;
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+	ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas);
+	if (ret)
+		return ret;
+
+	ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
+
 	nvm_free_rqd_ppalist(dev, &rqd);
 
-	return rqd.error;
+	return ret;
 }
 EXPORT_SYMBOL(nvm_submit_ppa);
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index cdcb2ccbefa8..38814e262872 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -534,6 +534,8 @@ extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);
+extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
+							int, void *, int);
 
 /* sysblk.c */
 #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
-- 
cgit v1.2.3


From 4891d120b9cd419f4350b11e1231083745dcdc8b Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:02:57 +0200
Subject: lightnvm: add fpg_size and pfpg_size to struct nvm_dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The flash page size (fpg) and size across planes (pfpg) are convenient
to know when allocating buffer sizes. This has previously been a
calculated in various places. Replace with the pre-calculated values.

Reviewed by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   |  2 ++
 drivers/lightnvm/sysblk.c | 17 +++++++----------
 include/linux/lightnvm.h  |  2 ++
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index f4e04a505859..652b8c7673ee 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -476,6 +476,8 @@ static int nvm_core_init(struct nvm_dev *dev)
 	dev->pgs_per_blk = grp->num_pg;
 	dev->blks_per_lun = grp->num_blk;
 	dev->nr_planes = grp->num_pln;
+	dev->fpg_size = grp->fpg_sz;
+	dev->pfpg_size = grp->fpg_sz * grp->num_pln;
 	dev->sec_size = grp->csecs;
 	dev->oob_size = grp->sos;
 	dev->sec_per_pg = grp->fpg_sz / grp->csecs;
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index b1e1404e6951..b6971f8ae3c0 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -154,13 +154,12 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
 						struct nvm_system_block *sblk)
 {
 	struct nvm_system_block *cur;
-	int pg, cursz, ret, found = 0;
+	int pg, ret, found = 0;
 
 	/* the full buffer for a flash page is allocated. Only the first of it
 	 * contains the system block information
 	 */
-	cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
-	cur = kmalloc(cursz, GFP_KERNEL);
+	cur = kmalloc(dev->pfpg_size, GFP_KERNEL);
 	if (!cur)
 		return -ENOMEM;
 
@@ -169,7 +168,7 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
 		ppa->g.pg = ppa_to_slc(dev, pg);
 
 		ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE,
-								cur, cursz);
+							cur, dev->pfpg_size);
 		if (ret) {
 			if (ret == NVM_RSP_ERR_EMPTYPAGE) {
 				pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n",
@@ -272,14 +271,12 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 {
 	struct nvm_system_block nvmsb;
 	void *buf;
-	int i, sect, ret = 0, bufsz;
+	int i, sect, ret = 0;
 	struct ppa_addr *ppas;
 
 	nvm_cpu_to_sysblk(&nvmsb, info);
 
-	/* buffer for flash page */
-	bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
-	buf = kzalloc(bufsz, GFP_KERNEL);
+	buf = kzalloc(dev->pfpg_size, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 	memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
@@ -309,7 +306,7 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 		}
 
 		ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE,
-						NVM_IO_SLC_MODE, buf, bufsz);
+					NVM_IO_SLC_MODE, buf, dev->pfpg_size);
 		if (ret) {
 			pr_err("nvm: sysblk failed program (%u %u %u)\n",
 							ppas[0].g.ch,
@@ -319,7 +316,7 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 		}
 
 		ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD,
-						NVM_IO_SLC_MODE, buf, bufsz);
+					NVM_IO_SLC_MODE, buf, dev->pfpg_size);
 		if (ret) {
 			pr_err("nvm: sysblk failed read (%u %u %u)\n",
 							ppas[0].g.ch,
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 38814e262872..f7c607f96858 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -323,6 +323,8 @@ struct nvm_dev {
 	int sec_per_pg; /* only sectors for a single page */
 	int pgs_per_blk;
 	int blks_per_lun;
+	int fpg_size;
+	int pfpg_size; /* size of buffer if all pages are to be read */
 	int sec_size;
 	int oob_size;
 	int mccap;
-- 
cgit v1.2.3


From 22e8c9766a669d49cf3749d397082a5cd93374a9 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:02:58 +0200
Subject: lightnvm: move block fold outside of get_bb_tbl()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The get block table command returns a list of blocks and planes
with their associated state. Users, such as gennvm and sysblk,
manages all planes as a single virtual block.

It was therefore  natural to fold the bad block list before it is
returned. However, to allow users, which manages on a per-plane
block level, to also use the interface, the get_bb_tbl interface is
changed to not fold by default and instead let the caller fold if
necessary.

Reviewed by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      | 35 +++++++++++++++++++++++++++++++++
 drivers/lightnvm/gennvm.c    | 14 +++++++------
 drivers/lightnvm/sysblk.c    | 29 ++++++++++++++++++---------
 drivers/nvme/host/lightnvm.c | 47 ++++++--------------------------------------
 include/linux/lightnvm.h     |  6 ++++--
 5 files changed, 73 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 652b8c7673ee..4cadbe0cd537 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -420,6 +420,41 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
 }
 EXPORT_SYMBOL(nvm_submit_ppa);
 
+/*
+ * folds a bad block list from its plane representation to its virtual
+ * block representation. The fold is done in place and reduced size is
+ * returned.
+ *
+ * If any of the planes status are bad or grown bad block, the virtual block
+ * is marked bad. If not bad, the first plane state acts as the block state.
+ */
+int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
+{
+	int blk, offset, pl, blktype;
+
+	if (nr_blks != dev->blks_per_lun * dev->plane_mode)
+		return -EINVAL;
+
+	for (blk = 0; blk < dev->blks_per_lun; blk++) {
+		offset = blk * dev->plane_mode;
+		blktype = blks[offset];
+
+		/* Bad blocks on any planes take precedence over other types */
+		for (pl = 0; pl < dev->plane_mode; pl++) {
+			if (blks[offset + pl] &
+					(NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) {
+				blktype = blks[offset + pl];
+				break;
+			}
+		}
+
+		blks[blk] = blktype;
+	}
+
+	return dev->blks_per_lun;
+}
+EXPORT_SYMBOL(nvm_bb_tbl_fold);
+
 static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 {
 	int i;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 72e124a3927d..6096077c5dde 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -129,18 +129,21 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
 	return 0;
 }
 
-static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks,
-								void *private)
+static int gennvm_block_bb(struct nvm_dev *dev, struct ppa_addr ppa,
+					u8 *blks, int nr_blks, void *private)
 {
 	struct gen_nvm *gn = private;
-	struct nvm_dev *dev = gn->dev;
 	struct gen_lun *lun;
 	struct nvm_block *blk;
 	int i;
 
+	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
 	lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
 
-	for (i = 0; i < nr_blocks; i++) {
+	for (i = 0; i < nr_blks; i++) {
 		if (blks[i] == 0)
 			continue;
 
@@ -250,8 +253,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 			ppa = generic_to_dev_addr(dev, ppa);
 
 			ret = dev->ops->get_bb_tbl(dev, ppa,
-						dev->blks_per_lun,
-						gennvm_block_bb, gn);
+							gennvm_block_bb, gn);
 			if (ret)
 				pr_err("gennvm: could not read BB table\n");
 		}
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index b6971f8ae3c0..7fce58833a07 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -93,12 +93,16 @@ void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
 	s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
 }
 
-static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
-								void *private)
+static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
+					u8 *blks, int nr_blks, void *private)
 {
 	struct sysblk_scan *s = private;
 	int i, nr_sysblk = 0;
 
+	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
 	for (i = 0; i < nr_blks; i++) {
 		if (blks[i] != NVM_BLK_T_HOST)
 			continue;
@@ -130,7 +134,7 @@ static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
 		dppa = generic_to_dev_addr(dev, ppas[i]);
 		s->row = i;
 
-		ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s);
+		ret = dev->ops->get_bb_tbl(dev, dppa, fn, s);
 		if (ret) {
 			pr_err("nvm: failed bb tbl for ppa (%u %u)\n",
 							ppas[i].g.ch,
@@ -235,13 +239,17 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 	return 0;
 }
 
-static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
-								void *private)
+static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa,
+					u8 *blks, int nr_blks, void *private)
 {
 	struct sysblk_scan *s = private;
 	struct ppa_addr *sppa;
 	int i, blkid = 0;
 
+	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
 	for (i = 0; i < nr_blks; i++) {
 		if (blks[i] == NVM_BLK_T_HOST)
 			return -EEXIST;
@@ -578,13 +586,16 @@ static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun)
 								BITS_PER_LONG;
 }
 
-static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
-								void *private)
+static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
+					u8 *blks, int nr_blks, void *private)
 {
 	struct factory_blks *f = private;
-	struct nvm_dev *dev = f->dev;
 	int i, lunoff;
 
+	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
 	lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun);
 
 	/* non-set bits correspond to the block must be erased */
@@ -661,7 +672,7 @@ static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa,
 
 	dev_ppa = generic_to_dev_addr(dev, ppa);
 
-	ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv);
+	ret = dev->ops->get_bb_tbl(dev, dev_ppa, fn, priv);
 	if (ret)
 		pr_err("nvm: failed bb tbl for ch%u lun%u\n",
 							ppa.g.ch, ppa.g.blk);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 9461dd639acd..d289980d2bc8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -387,41 +387,16 @@ out:
 	return ret;
 }
 
-static void nvme_nvm_bb_tbl_fold(struct nvm_dev *nvmdev,
-						int nr_dst_blks, u8 *dst_blks,
-						int nr_src_blks, u8 *src_blks)
-{
-	int blk, offset, pl, blktype;
-
-	for (blk = 0; blk < nr_dst_blks; blk++) {
-		offset = blk * nvmdev->plane_mode;
-		blktype = src_blks[offset];
-
-		/* Bad blocks on any planes take precedence over other types */
-		for (pl = 0; pl < nvmdev->plane_mode; pl++) {
-			if (src_blks[offset + pl] &
-					(NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) {
-				blktype = src_blks[offset + pl];
-				break;
-			}
-		}
-
-		dst_blks[blk] = blktype;
-	}
-}
-
 static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
-				int nr_dst_blks, nvm_bb_update_fn *update_bbtbl,
-				void *priv)
+				nvm_bb_update_fn *update_bbtbl, void *priv)
 {
 	struct request_queue *q = nvmdev->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_nvm_command c = {};
 	struct nvme_nvm_bb_tbl *bb_tbl;
-	u8 *dst_blks = NULL;
-	int nr_src_blks = nr_dst_blks * nvmdev->plane_mode;
-	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_src_blks;
+	int nr_blks = nvmdev->blks_per_lun * nvmdev->plane_mode;
+	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
 	int ret = 0;
 
 	c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl;
@@ -432,12 +407,6 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 	if (!bb_tbl)
 		return -ENOMEM;
 
-	dst_blks = kzalloc(nr_dst_blks, GFP_KERNEL);
-	if (!dst_blks) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c,
 								bb_tbl, tblsz);
 	if (ret) {
@@ -459,21 +428,17 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 		goto out;
 	}
 
-	if (le32_to_cpu(bb_tbl->tblks) != nr_src_blks) {
+	if (le32_to_cpu(bb_tbl->tblks) != nr_blks) {
 		ret = -EINVAL;
 		dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)",
-				le32_to_cpu(bb_tbl->tblks), nr_src_blks);
+				le32_to_cpu(bb_tbl->tblks), nr_blks);
 		goto out;
 	}
 
-	nvme_nvm_bb_tbl_fold(nvmdev, nr_dst_blks, dst_blks,
-						nr_src_blks, bb_tbl->blk);
-
 	ppa = dev_to_generic_addr(nvmdev, ppa);
-	ret = update_bbtbl(ppa, nr_dst_blks, dst_blks, priv);
+	ret = update_bbtbl(nvmdev, ppa, bb_tbl->blk, nr_blks, priv);
 
 out:
-	kfree(dst_blks);
 	kfree(bb_tbl);
 	return ret;
 }
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index f7c607f96858..dacaa2850428 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -41,11 +41,12 @@ struct nvm_id;
 struct nvm_dev;
 
 typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
-typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *);
+typedef int (nvm_bb_update_fn)(struct nvm_dev *, struct ppa_addr, u8 *, int,
+								void *);
 typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
 typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
 				nvm_l2p_update_fn *, void *);
-typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int,
+typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr,
 				nvm_bb_update_fn *, void *);
 typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
@@ -538,6 +539,7 @@ extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);
 extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
 							int, void *, int);
+extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 
 /* sysblk.c */
 #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
-- 
cgit v1.2.3


From 6063fe399d0913e7bd4462650cd4f31b479a83c9 Mon Sep 17 00:00:00 2001
From: "Simon A. F. Lund" <slund@cnexlabs.com>
Date: Fri, 6 May 2016 20:03:02 +0200
Subject: lightnvm: rename nvm_targets to nvm_tgt_type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The functions nvm_register_target(), nvm_unregister_target() and
associated list refers to a target type that is being registered by a
target type module. Rename nvm_*_targets() to nvm_*_tgt_type(), so that
the intension is clear.

This enables target instances to use the _nvm_*_targets() naming.

Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 16 ++++++++--------
 drivers/lightnvm/rrpc.c  |  4 ++--
 include/linux/lightnvm.h |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 74fb049e0b83..240b4731c35e 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -30,7 +30,7 @@
 #include <linux/sched/sysctl.h>
 #include <uapi/linux/lightnvm.h>
 
-static LIST_HEAD(nvm_targets);
+static LIST_HEAD(nvm_tgt_types);
 static LIST_HEAD(nvm_mgrs);
 static LIST_HEAD(nvm_devices);
 static DECLARE_RWSEM(nvm_lock);
@@ -39,14 +39,14 @@ static struct nvm_tgt_type *nvm_find_target_type(const char *name)
 {
 	struct nvm_tgt_type *tt;
 
-	list_for_each_entry(tt, &nvm_targets, list)
+	list_for_each_entry(tt, &nvm_tgt_types, list)
 		if (!strcmp(name, tt->name))
 			return tt;
 
 	return NULL;
 }
 
-int nvm_register_target(struct nvm_tgt_type *tt)
+int nvm_register_tgt_type(struct nvm_tgt_type *tt)
 {
 	int ret = 0;
 
@@ -54,14 +54,14 @@ int nvm_register_target(struct nvm_tgt_type *tt)
 	if (nvm_find_target_type(tt->name))
 		ret = -EEXIST;
 	else
-		list_add(&tt->list, &nvm_targets);
+		list_add(&tt->list, &nvm_tgt_types);
 	up_write(&nvm_lock);
 
 	return ret;
 }
-EXPORT_SYMBOL(nvm_register_target);
+EXPORT_SYMBOL(nvm_register_tgt_type);
 
-void nvm_unregister_target(struct nvm_tgt_type *tt)
+void nvm_unregister_tgt_type(struct nvm_tgt_type *tt)
 {
 	if (!tt)
 		return;
@@ -70,7 +70,7 @@ void nvm_unregister_target(struct nvm_tgt_type *tt)
 	list_del(&tt->list);
 	up_write(&nvm_lock);
 }
-EXPORT_SYMBOL(nvm_unregister_target);
+EXPORT_SYMBOL(nvm_unregister_tgt_type);
 
 void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags,
 							dma_addr_t *dma_handler)
@@ -1020,7 +1020,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
 	info->version[2] = NVM_VERSION_PATCH;
 
 	down_write(&nvm_lock);
-	list_for_each_entry(tt, &nvm_targets, list) {
+	list_for_each_entry(tt, &nvm_tgt_types, list) {
 		struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter];
 
 		tgt->version[0] = tt->version[0];
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 3143b980ac06..c7fef71f4276 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1469,12 +1469,12 @@ static struct nvm_tgt_type tt_rrpc = {
 
 static int __init rrpc_module_init(void)
 {
-	return nvm_register_target(&tt_rrpc);
+	return nvm_register_tgt_type(&tt_rrpc);
 }
 
 static void rrpc_module_exit(void)
 {
-	nvm_unregister_target(&tt_rrpc);
+	nvm_unregister_tgt_type(&tt_rrpc);
 }
 
 module_init(rrpc_module_init);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index dacaa2850428..497da91510b2 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -453,8 +453,8 @@ struct nvm_tgt_type {
 	struct list_head list;
 };
 
-extern int nvm_register_target(struct nvm_tgt_type *);
-extern void nvm_unregister_target(struct nvm_tgt_type *);
+extern int nvm_register_tgt_type(struct nvm_tgt_type *);
+extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
 
 extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *);
 extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
-- 
cgit v1.2.3


From 6f8645cba54a4b2983bbd46f462b5891ffbd72fc Mon Sep 17 00:00:00 2001
From: "Simon A. F. Lund" <slund@cnexlabs.com>
Date: Fri, 6 May 2016 20:03:03 +0200
Subject: lightnvm: refactor dev->online_target to global nvm_targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A target name must be unique. However, a per-device registration of
targets is maintained on a dev->online_targets list, with a per-device
search for targets upon registration.

This results in a name collision when two targets, with the same name,
are created on two different targets, where the per-device list is not
shared.

Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 47 +++++++++++++++++++++++++----------------------
 include/linux/lightnvm.h |  1 -
 2 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 240b4731c35e..0296223392f7 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -33,8 +33,20 @@
 static LIST_HEAD(nvm_tgt_types);
 static LIST_HEAD(nvm_mgrs);
 static LIST_HEAD(nvm_devices);
+static LIST_HEAD(nvm_targets);
 static DECLARE_RWSEM(nvm_lock);
 
+static struct nvm_target *nvm_find_target(const char *name)
+{
+	struct nvm_target *tgt;
+
+	list_for_each_entry(tgt, &nvm_targets, list)
+		if (!strcmp(name, tgt->disk->disk_name))
+			return tgt;
+
+	return NULL;
+}
+
 static struct nvm_tgt_type *nvm_find_target_type(const char *name)
 {
 	struct nvm_tgt_type *tt;
@@ -564,7 +576,6 @@ static int nvm_core_init(struct nvm_dev *dev)
 		goto err_fmtype;
 	}
 
-	INIT_LIST_HEAD(&dev->online_targets);
 	mutex_init(&dev->mlock);
 	spin_lock_init(&dev->lock);
 
@@ -744,12 +755,11 @@ static int nvm_create_target(struct nvm_dev *dev,
 		return -EINVAL;
 	}
 
-	list_for_each_entry(t, &dev->online_targets, list) {
-		if (!strcmp(create->tgtname, t->disk->disk_name)) {
-			pr_err("nvm: target name already exists.\n");
-			up_write(&nvm_lock);
-			return -EINVAL;
-		}
+	t = nvm_find_target(create->tgtname);
+	if (t) {
+		pr_err("nvm: target name already exists.\n");
+		up_write(&nvm_lock);
+		return -EINVAL;
 	}
 	up_write(&nvm_lock);
 
@@ -789,7 +799,7 @@ static int nvm_create_target(struct nvm_dev *dev,
 	t->disk = tdisk;
 
 	down_write(&nvm_lock);
-	list_add_tail(&t->list, &dev->online_targets);
+	list_add_tail(&t->list, &nvm_targets);
 	up_write(&nvm_lock);
 
 	return 0;
@@ -852,26 +862,19 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 
 static int __nvm_configure_remove(struct nvm_ioctl_remove *remove)
 {
-	struct nvm_target *t = NULL;
-	struct nvm_dev *dev;
-	int ret = -1;
+	struct nvm_target *t;
 
 	down_write(&nvm_lock);
-	list_for_each_entry(dev, &nvm_devices, devices)
-		list_for_each_entry(t, &dev->online_targets, list) {
-			if (!strcmp(remove->tgtname, t->disk->disk_name)) {
-				nvm_remove_target(t);
-				ret = 0;
-				break;
-			}
-		}
-	up_write(&nvm_lock);
-
-	if (ret) {
+	t = nvm_find_target(remove->tgtname);
+	if (!t) {
 		pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname);
+		up_write(&nvm_lock);
 		return -EINVAL;
 	}
 
+	nvm_remove_target(t);
+	up_write(&nvm_lock);
+
 	return 0;
 }
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 497da91510b2..5eabdbaa87bc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -308,7 +308,6 @@ struct nvm_dev {
 	struct nvm_dev_ops *ops;
 
 	struct list_head devices;
-	struct list_head online_targets;
 
 	/* Media manager */
 	struct nvmm_type *mt;
-- 
cgit v1.2.3


From 5136061ce705210b501ed9ecd673a67b74ebe017 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:04 +0200
Subject: lightnvm: introduce nvm_for_each_lun_ppa() macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Users that wish to iterate all luns on a device. Must create a
struct ppa_addr and separate iterators for channels and luns. To set the
iterators, two loops are required, one to iterate channels, and another
to iterate luns. This leads to decrease in readability.

Introduce nvm_for_each_lun_ppa, which implements the nested loop and
sets ppa, channel, and lun variable for each loop body, eliminating
the boilerplate code.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/sysblk.c | 64 +++++++++++++++++++----------------------------
 include/linux/lightnvm.h  |  7 ++++++
 2 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index 7fce58833a07..a48093d17fea 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -631,33 +631,28 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
 
 	while (!done) {
 		done = 1;
-		for (ch = 0; ch < dev->nr_chnls; ch++) {
-			for (lun = 0; lun < dev->luns_per_chnl; lun++) {
-				idx = factory_blk_offset(dev, ch, lun);
-				offset = &f->blks[idx];
-
-				blkid = find_first_zero_bit(offset,
-							dev->blks_per_lun);
-				if (blkid >= dev->blks_per_lun)
-					continue;
-				set_bit(blkid, offset);
-
-				ppa.ppa = 0;
-				ppa.g.ch = ch;
-				ppa.g.lun = lun;
-				ppa.g.blk = blkid;
-				pr_debug("nvm: erase ppa (%u %u %u)\n",
-								ppa.g.ch,
-								ppa.g.lun,
-								ppa.g.blk);
-
-				erase_list[ppa_cnt] = ppa;
-				ppa_cnt++;
-				done = 0;
-
-				if (ppa_cnt == max_ppas)
-					return ppa_cnt;
-			}
+		nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
+			idx = factory_blk_offset(dev, ch, lun);
+			offset = &f->blks[idx];
+
+			blkid = find_first_zero_bit(offset,
+						dev->blks_per_lun);
+			if (blkid >= dev->blks_per_lun)
+				continue;
+			set_bit(blkid, offset);
+
+			ppa.g.blk = blkid;
+			pr_debug("nvm: erase ppa (%u %u %u)\n",
+							ppa.g.ch,
+							ppa.g.lun,
+							ppa.g.blk);
+
+			erase_list[ppa_cnt] = ppa;
+			ppa_cnt++;
+			done = 0;
+
+			if (ppa_cnt == max_ppas)
+				return ppa_cnt;
 		}
 	}
 
@@ -684,17 +679,10 @@ static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f)
 	int ch, lun, ret;
 	struct ppa_addr ppa;
 
-	ppa.ppa = 0;
-	for (ch = 0; ch < dev->nr_chnls; ch++) {
-		for (lun = 0; lun < dev->luns_per_chnl; lun++) {
-			ppa.g.ch = ch;
-			ppa.g.lun = lun;
-
-			ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks,
-									f);
-			if (ret)
-				return ret;
-		}
+	nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
+		ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, f);
+		if (ret)
+			return ret;
 	}
 
 	return 0;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 5eabdbaa87bc..3f256355b9fa 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -559,6 +559,13 @@ extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *);
 extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
 
 extern int nvm_dev_factory(struct nvm_dev *, int flags);
+
+#define nvm_for_each_lun_ppa(dev, ppa, chid, lunid)			\
+	for ((chid) = 0, (ppa).ppa = 0; (chid) < (dev)->nr_chnls;	\
+					(chid)++, (ppa).g.ch = (chid))	\
+		for ((lunid) = 0; (lunid) < (dev)->luns_per_chnl;	\
+					(lunid)++, (ppa).g.lun = (lunid))
+
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
 
-- 
cgit v1.2.3


From e11903f5dfeb4f59fe93316d47f2ee5982e91e60 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:05 +0200
Subject: lightnvm: refactor device ops->get_bb_tbl()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The device ops->get_bb_tbl() takes a callback, that allows the caller
to use its own callback function to update its data structures in the
returning function.

This makes it difficult to send parameters to the callback, and usually
is circumvented by small private structures, that both carry the callers
state and any flags needed to fulfill the update.

Refactor ops->get_bb_tbl() to fill a data buffer with the status of the
blocks returned, and let the user call the callback function manually.
That will provide the necessary flags and data structures and simplify
the logic around ops->get_bb_tbl().

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      |   8 +++
 drivers/lightnvm/gennvm.c    |  29 ++++++---
 drivers/lightnvm/sysblk.c    | 146 +++++++++++++++++++++++--------------------
 drivers/nvme/host/lightnvm.c |   6 +-
 include/linux/lightnvm.h     |   6 +-
 5 files changed, 111 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 0296223392f7..e6d7a98baeb2 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -467,6 +467,14 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 }
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 
+int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks)
+{
+	ppa = generic_to_dev_addr(dev, ppa);
+
+	return dev->ops->get_bb_tbl(dev, ppa, blks);
+}
+EXPORT_SYMBOL(nvm_get_bb_tbl);
+
 static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 {
 	int i;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 6096077c5dde..9c6b141606e9 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -129,10 +129,10 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
 	return 0;
 }
 
-static int gennvm_block_bb(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks, void *private)
+static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa,
+							u8 *blks, int nr_blks)
 {
-	struct gen_nvm *gn = private;
+	struct nvm_dev *dev = gn->dev;
 	struct gen_lun *lun;
 	struct nvm_block *blk;
 	int i;
@@ -219,13 +219,21 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 	struct gen_lun *lun;
 	struct nvm_block *block;
 	sector_t lun_iter, blk_iter, cur_block_id = 0;
-	int ret;
+	int ret, nr_blks;
+	u8 *blks;
+
+	nr_blks = dev->blks_per_lun * dev->plane_mode;
+	blks = kmalloc(nr_blks, GFP_KERNEL);
+	if (!blks)
+		return -ENOMEM;
 
 	gennvm_for_each_lun(gn, lun, lun_iter) {
 		lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) *
 							dev->blks_per_lun);
-		if (!lun->vlun.blocks)
+		if (!lun->vlun.blocks) {
+			kfree(blks);
 			return -ENOMEM;
+		}
 
 		for (blk_iter = 0; blk_iter < dev->blks_per_lun; blk_iter++) {
 			block = &lun->vlun.blocks[blk_iter];
@@ -250,12 +258,14 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 			ppa.ppa = 0;
 			ppa.g.ch = lun->vlun.chnl_id;
 			ppa.g.lun = lun->vlun.id;
-			ppa = generic_to_dev_addr(dev, ppa);
 
-			ret = dev->ops->get_bb_tbl(dev, ppa,
-							gennvm_block_bb, gn);
+			ret = nvm_get_bb_tbl(dev, ppa, blks);
+			if (ret)
+				pr_err("gennvm: could not get BB table\n");
+
+			ret = gennvm_block_bb(gn, ppa, blks, nr_blks);
 			if (ret)
-				pr_err("gennvm: could not read BB table\n");
+				pr_err("gennvm: BB table map failed\n");
 		}
 	}
 
@@ -268,6 +278,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 		}
 	}
 
+	kfree(blks);
 	return 0;
 }
 
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index a48093d17fea..dc99c0ac8701 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -93,10 +93,45 @@ void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
 	s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
 }
 
+static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa,
+					u8 *blks, int nr_blks,
+					struct sysblk_scan *s)
+{
+	struct ppa_addr *sppa;
+	int i, blkid = 0;
+
+	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
+	for (i = 0; i < nr_blks; i++) {
+		if (blks[i] == NVM_BLK_T_HOST)
+			return -EEXIST;
+
+		if (blks[i] != NVM_BLK_T_FREE)
+			continue;
+
+		sppa = &s->ppas[scan_ppa_idx(s->row, blkid)];
+		sppa->g.ch = ppa.g.ch;
+		sppa->g.lun = ppa.g.lun;
+		sppa->g.blk = i;
+		s->nr_ppas++;
+		blkid++;
+
+		pr_debug("nvm: use (%u %u %u) as sysblk\n",
+					sppa->g.ch, sppa->g.lun, sppa->g.blk);
+		if (blkid > MAX_BLKS_PR_SYSBLK - 1)
+			return 0;
+	}
+
+	pr_err("nvm: sysblk failed get sysblk\n");
+	return -EINVAL;
+}
+
 static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks, void *private)
+					u8 *blks, int nr_blks,
+					struct sysblk_scan *s)
 {
-	struct sysblk_scan *s = private;
 	int i, nr_sysblk = 0;
 
 	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
@@ -123,26 +158,42 @@ static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
 }
 
 static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
-				struct ppa_addr *ppas, nvm_bb_update_fn *fn)
+				struct ppa_addr *ppas, int get_free)
 {
-	struct ppa_addr dppa;
-	int i, ret = 0;
+	int i, nr_blks, ret = 0;
+	u8 *blks;
 
 	s->nr_ppas = 0;
+	nr_blks = dev->blks_per_lun * dev->plane_mode;
+
+	blks = kmalloc(nr_blks, GFP_KERNEL);
+	if (!blks)
+		return -ENOMEM;
 
 	for (i = 0; i < s->nr_rows; i++) {
-		dppa = generic_to_dev_addr(dev, ppas[i]);
 		s->row = i;
 
-		ret = dev->ops->get_bb_tbl(dev, dppa, fn, s);
+		ret = nvm_get_bb_tbl(dev, ppas[i], blks);
 		if (ret) {
 			pr_err("nvm: failed bb tbl for ppa (%u %u)\n",
 							ppas[i].g.ch,
 							ppas[i].g.blk);
-			return ret;
+			goto err_get;
 		}
+
+		if (get_free)
+			ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks,
+									s);
+		else
+			ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks,
+									s);
+
+		if (ret)
+			goto err_get;
 	}
 
+err_get:
+	kfree(blks);
 	return ret;
 }
 
@@ -239,41 +290,6 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 	return 0;
 }
 
-static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks, void *private)
-{
-	struct sysblk_scan *s = private;
-	struct ppa_addr *sppa;
-	int i, blkid = 0;
-
-	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-	if (nr_blks < 0)
-		return nr_blks;
-
-	for (i = 0; i < nr_blks; i++) {
-		if (blks[i] == NVM_BLK_T_HOST)
-			return -EEXIST;
-
-		if (blks[i] != NVM_BLK_T_FREE)
-			continue;
-
-		sppa = &s->ppas[scan_ppa_idx(s->row, blkid)];
-		sppa->g.ch = ppa.g.ch;
-		sppa->g.lun = ppa.g.lun;
-		sppa->g.blk = i;
-		s->nr_ppas++;
-		blkid++;
-
-		pr_debug("nvm: use (%u %u %u) as sysblk\n",
-					sppa->g.ch, sppa->g.lun, sppa->g.blk);
-		if (blkid > MAX_BLKS_PR_SYSBLK - 1)
-			return 0;
-	}
-
-	pr_err("nvm: sysblk failed get sysblk\n");
-	return -EINVAL;
-}
-
 static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 							struct sysblk_scan *s)
 {
@@ -393,7 +409,7 @@ int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
 	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
 
 	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks);
+	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
 	if (ret)
 		goto err_sysblk;
 
@@ -453,7 +469,7 @@ int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
 	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
 
 	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks);
+	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
 	if (ret)
 		goto err_sysblk;
 
@@ -551,7 +567,7 @@ int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
 	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
 
 	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks);
+	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1);
 	if (ret)
 		goto err_mark;
 
@@ -587,9 +603,9 @@ static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun)
 }
 
 static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks, void *private)
+					u8 *blks, int nr_blks,
+					struct factory_blks *f)
 {
-	struct factory_blks *f = private;
 	int i, lunoff;
 
 	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
@@ -659,32 +675,29 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
 	return ppa_cnt;
 }
 
-static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa,
-					nvm_bb_update_fn *fn, void *priv)
-{
-	struct ppa_addr dev_ppa;
-	int ret;
-
-	dev_ppa = generic_to_dev_addr(dev, ppa);
-
-	ret = dev->ops->get_bb_tbl(dev, dev_ppa, fn, priv);
-	if (ret)
-		pr_err("nvm: failed bb tbl for ch%u lun%u\n",
-							ppa.g.ch, ppa.g.blk);
-	return ret;
-}
-
 static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f)
 {
-	int ch, lun, ret;
 	struct ppa_addr ppa;
+	int ch, lun, nr_blks, ret;
+	u8 *blks;
+
+	nr_blks = dev->blks_per_lun * dev->plane_mode;
+	blks = kmalloc(nr_blks, GFP_KERNEL);
+	if (!blks)
+		return -ENOMEM;
 
 	nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
-		ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, f);
+		ret = nvm_get_bb_tbl(dev, ppa, blks);
+		if (ret)
+			pr_err("nvm: failed bb tbl for ch%u lun%u\n",
+							ppa.g.ch, ppa.g.blk);
+
+		ret = nvm_factory_blks(dev, ppa, blks, nr_blks, f);
 		if (ret)
 			return ret;
 	}
 
+	kfree(blks);
 	return 0;
 }
 
@@ -722,8 +735,7 @@ int nvm_dev_factory(struct nvm_dev *dev, int flags)
 	if (flags & NVM_FACTORY_RESET_HOST_BLKS) {
 		nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
 		mutex_lock(&dev->mlock);
-		ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas,
-							sysblk_get_host_blks);
+		ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
 		if (!ret)
 			ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
 		mutex_unlock(&dev->mlock);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index d289980d2bc8..45e3511d5d3d 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -388,7 +388,7 @@ out:
 }
 
 static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
-				nvm_bb_update_fn *update_bbtbl, void *priv)
+								u8 *blks)
 {
 	struct request_queue *q = nvmdev->q;
 	struct nvme_ns *ns = q->queuedata;
@@ -435,9 +435,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 		goto out;
 	}
 
-	ppa = dev_to_generic_addr(nvmdev, ppa);
-	ret = update_bbtbl(nvmdev, ppa, bb_tbl->blk, nr_blks, priv);
-
+	memcpy(blks, bb_tbl->blk, nvmdev->blks_per_lun * nvmdev->plane_mode);
 out:
 	kfree(bb_tbl);
 	return ret;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3f256355b9fa..16d4f2edf5b4 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -41,13 +41,10 @@ struct nvm_id;
 struct nvm_dev;
 
 typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
-typedef int (nvm_bb_update_fn)(struct nvm_dev *, struct ppa_addr, u8 *, int,
-								void *);
 typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
 typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
 				nvm_l2p_update_fn *, void *);
-typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr,
-				nvm_bb_update_fn *, void *);
+typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
 typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
@@ -539,6 +536,7 @@ extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
 							int, void *, int);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
+extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
 
 /* sysblk.c */
 #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
-- 
cgit v1.2.3


From 5ebc7d9fe13ff9bd3622d0be3cd39c8751459be6 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:07 +0200
Subject: lightnvm: make nvm_set_rqd_ppalist() aware of vblks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A virtual block enables a block to identify multiple physical blocks.
This is useful for metadata where a device media supports multiple
planes. In that case, a block, with multiple planes can be managed
as a single vblk. Reducing the metadata required by one forth.

nvm_set_rqd_ppalist() takes care of expanding a ppa_list with vblks
automatically. However, for some use-cases, where only a single physical
block is required, the ppa_list should not be expanded.

Therefore, add a vblk parameter to nvm_set_rqd_ppalist(), and only
expand the ppa_list if vblk is set.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   | 31 +++++++++++++++++--------------
 drivers/lightnvm/sysblk.c |  2 +-
 include/linux/lightnvm.h  |  2 +-
 3 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index e6d7a98baeb2..de5db7b2c456 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -251,33 +251,36 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 EXPORT_SYMBOL(nvm_generic_to_addr_mode);
 
 int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
-					struct ppa_addr *ppas, int nr_ppas)
+				struct ppa_addr *ppas, int nr_ppas, int vblk)
 {
 	int i, plane_cnt, pl_idx;
 
-	if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
-		rqd->nr_pages = 1;
+	if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
+		rqd->nr_pages = nr_ppas;
 		rqd->ppa_addr = ppas[0];
 
 		return 0;
 	}
 
-	plane_cnt = dev->plane_mode;
-	rqd->nr_pages = plane_cnt * nr_ppas;
-
-	if (dev->ops->max_phys_sect < rqd->nr_pages)
-		return -EINVAL;
-
+	rqd->nr_pages = nr_ppas;
 	rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
 	if (!rqd->ppa_list) {
 		pr_err("nvm: failed to allocate dma memory\n");
 		return -ENOMEM;
 	}
 
-	for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+	if (!vblk) {
+		for (i = 0; i < nr_ppas; i++)
+			rqd->ppa_list[i] = ppas[i];
+	} else {
+		plane_cnt = dev->plane_mode;
+		rqd->nr_pages *= plane_cnt;
+
 		for (i = 0; i < nr_ppas; i++) {
-			ppas[i].g.pl = pl_idx;
-			rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i];
+			for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+				ppas[i].g.pl = pl_idx;
+				rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i];
+			}
 		}
 	}
 
@@ -304,7 +307,7 @@ int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas)
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas);
+	ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
 	if (ret)
 		return ret;
 
@@ -420,7 +423,7 @@ int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
 	int ret;
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas);
+	ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1);
 	if (ret)
 		return ret;
 
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index bca690248bd7..737fbc3c5cf1 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -277,7 +277,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas);
+	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1);
 	nvm_generic_to_addr_mode(dev, &rqd);
 
 	ret = dev->ops->set_bb_tbl(dev, &rqd, type);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 16d4f2edf5b4..9ae0b7c6deb2 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -526,7 +526,7 @@ extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
 extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
 extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
-							struct ppa_addr *, int);
+						struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int);
 extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
-- 
cgit v1.2.3


From 00ee6cc3b74bdb4dfb71838046517beb6839647c Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:09 +0200
Subject: lightnvm: refactor set_bb_tbl for accepting ppa list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The set_bb_tbl takes struct nvm_rq and only uses its ppa_list and
nr_pages internally. Instead, make these two variables explicit.
This allows a user to call it without initializing a struct nvm_rq
first.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/sysblk.c    | 2 +-
 drivers/nvme/host/lightnvm.c | 8 ++++----
 include/linux/lightnvm.h     | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index 737fbc3c5cf1..b98ca1998da5 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -280,7 +280,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1);
 	nvm_generic_to_addr_mode(dev, &rqd);
 
-	ret = dev->ops->set_bb_tbl(dev, &rqd, type);
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_pages, type);
 	nvm_free_rqd_ppalist(dev, &rqd);
 	if (ret) {
 		pr_err("nvm: sysblk failed bb mark\n");
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 45e3511d5d3d..d426d95c9eaa 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -441,8 +441,8 @@ out:
 	return ret;
 }
 
-static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
-								int type)
+static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
+							int nr_ppas, int type)
 {
 	struct nvme_ns *ns = nvmdev->q->queuedata;
 	struct nvme_nvm_command c = {};
@@ -450,8 +450,8 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
 
 	c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl;
 	c.set_bb.nsid = cpu_to_le32(ns->ns_id);
-	c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa);
-	c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1);
+	c.set_bb.spba = cpu_to_le64(ppas->ppa);
+	c.set_bb.nlb = cpu_to_le16(nr_ppas - 1);
 	c.set_bb.value = type;
 
 	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 9ae0b7c6deb2..af72ca75dced 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -45,7 +45,7 @@ typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
 typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
 				nvm_l2p_update_fn *, void *);
 typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
-typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int);
+typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
-- 
cgit v1.2.3


From 003fad376b924fc3a61c659f38da70356ec144fa Mon Sep 17 00:00:00 2001
From: Javier González <javier@cnexlabs.com>
Date: Fri, 6 May 2016 20:03:12 +0200
Subject: lightnvm: enable metadata to be sent to device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable metadata buffer to be sent to the device through the metadata
field on the physical rw nvme command. The size of the metadata buffer
must follow dev->oob_size * # of PPAs.

Signed-off-by: Javier González <javier@cnexlabs.com>
Updated description.
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/lightnvm.c | 1 +
 include/linux/lightnvm.h     | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index d426d95c9eaa..69a47fb2b6e3 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -467,6 +467,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 	c->ph_rw.opcode = rqd->opcode;
 	c->ph_rw.nsid = cpu_to_le32(ns->ns_id);
 	c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
+	c->ph_rw.metadata = cpu_to_le64(rqd->meta_list);
 	c->ph_rw.control = cpu_to_le16(rqd->flags);
 	c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1);
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index af72ca75dced..678df4d4354d 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -230,8 +230,8 @@ struct nvm_rq {
 
 	struct ppa_addr *ppa_list;
 
-	void *metadata;
-	dma_addr_t dma_metadata;
+	void *meta_list;
+	dma_addr_t dma_meta_list;
 
 	struct completion *wait;
 	nvm_end_io_fn *end_io;
-- 
cgit v1.2.3


From 75b8564932ef646e8620deffacbe134846333948 Mon Sep 17 00:00:00 2001
From: Javier González <javier@cnexlabs.com>
Date: Fri, 6 May 2016 20:03:13 +0200
Subject: lightnvm: rename dma helper functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until now, the dma pool have been exclusively used to allocate the ppa
list being sent to the device. In pblk (upcoming), we use these pools to
allocate metadata too. Thus, we generalize the names of some variables
on the dma helper functions to make the code more readable.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      | 16 ++++++++--------
 drivers/nvme/host/lightnvm.c |  4 ++--
 include/linux/lightnvm.h     |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index de5db7b2c456..1873a3bc913d 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -87,15 +87,15 @@ EXPORT_SYMBOL(nvm_unregister_tgt_type);
 void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags,
 							dma_addr_t *dma_handler)
 {
-	return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags,
+	return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags,
 								dma_handler);
 }
 EXPORT_SYMBOL(nvm_dev_dma_alloc);
 
-void nvm_dev_dma_free(struct nvm_dev *dev, void *ppa_list,
+void nvm_dev_dma_free(struct nvm_dev *dev, void *addr,
 							dma_addr_t dma_handler)
 {
-	dev->ops->dev_dma_free(dev->ppalist_pool, ppa_list, dma_handler);
+	dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler);
 }
 EXPORT_SYMBOL(nvm_dev_dma_free);
 
@@ -652,8 +652,8 @@ err:
 
 static void nvm_exit(struct nvm_dev *dev)
 {
-	if (dev->ppalist_pool)
-		dev->ops->destroy_dma_pool(dev->ppalist_pool);
+	if (dev->dma_pool)
+		dev->ops->destroy_dma_pool(dev->dma_pool);
 	nvm_free(dev);
 
 	pr_info("nvm: successfully unloaded\n");
@@ -687,9 +687,9 @@ int nvm_register(struct request_queue *q, char *disk_name,
 	}
 
 	if (dev->ops->max_phys_sect > 1) {
-		dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist");
-		if (!dev->ppalist_pool) {
-			pr_err("nvm: could not create ppa pool\n");
+		dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
+		if (!dev->dma_pool) {
+			pr_err("nvm: could not create dma pool\n");
 			ret = -ENOMEM;
 			goto err_init;
 		}
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 69a47fb2b6e3..8461f5a36d14 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -565,10 +565,10 @@ static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
 	return dma_pool_alloc(pool, mem_flags, dma_handler);
 }
 
-static void nvme_nvm_dev_dma_free(void *pool, void *ppa_list,
+static void nvme_nvm_dev_dma_free(void *pool, void *addr,
 							dma_addr_t dma_handler)
 {
-	dma_pool_free(pool, ppa_list, dma_handler);
+	dma_pool_free(pool, addr, dma_handler);
 }
 
 static struct nvm_dev_ops nvme_nvm_dev_ops = {
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 678df4d4354d..0e8e01956325 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -347,7 +347,7 @@ struct nvm_dev {
 	unsigned max_pages_per_blk;
 
 	unsigned long *lun_map;
-	void *ppalist_pool;
+	void *dma_pool;
 
 	struct nvm_id identity;
 
-- 
cgit v1.2.3


From 976bdfcae32ea10c2c8c2ecaeb0d85873f634dad Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:17 +0200
Subject: lightnvm: remove mgt targets on mgt removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Targets associated with a device manager are not freed on device
removal. They have to be manually removed before shutdown. Make sure
any outstanding targets are freed upon shutdown.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 69 ++++++++++++++++++++++++++++++------------------
 include/linux/lightnvm.h |  1 +
 2 files changed, 44 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 1873a3bc913d..d3af77102e47 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -596,13 +596,52 @@ err_fmtype:
 	return ret;
 }
 
+static void nvm_remove_target(struct nvm_target *t)
+{
+	struct nvm_tgt_type *tt = t->type;
+	struct gendisk *tdisk = t->disk;
+	struct request_queue *q = tdisk->queue;
+
+	lockdep_assert_held(&nvm_lock);
+
+	del_gendisk(tdisk);
+	blk_cleanup_queue(q);
+
+	if (tt->exit)
+		tt->exit(tdisk->private_data);
+
+	put_disk(tdisk);
+
+	list_del(&t->list);
+	kfree(t);
+}
+
+static void nvm_free_mgr(struct nvm_dev *dev)
+{
+	struct nvm_target *tgt, *tmp;
+
+	if (!dev->mt)
+		return;
+
+	down_write(&nvm_lock);
+	list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) {
+		if (tgt->dev != dev)
+			continue;
+
+		nvm_remove_target(tgt);
+	}
+	up_write(&nvm_lock);
+
+	dev->mt->unregister_mgr(dev);
+	dev->mt = NULL;
+}
+
 static void nvm_free(struct nvm_dev *dev)
 {
 	if (!dev)
 		return;
 
-	if (dev->mt)
-		dev->mt->unregister_mgr(dev);
+	nvm_free_mgr(dev);
 
 	kfree(dev->lptbl);
 	kfree(dev->lun_map);
@@ -808,6 +847,7 @@ static int nvm_create_target(struct nvm_dev *dev,
 
 	t->type = tt;
 	t->disk = tdisk;
+	t->dev = dev;
 
 	down_write(&nvm_lock);
 	list_add_tail(&t->list, &nvm_targets);
@@ -823,26 +863,6 @@ err_t:
 	return -ENOMEM;
 }
 
-static void nvm_remove_target(struct nvm_target *t)
-{
-	struct nvm_tgt_type *tt = t->type;
-	struct gendisk *tdisk = t->disk;
-	struct request_queue *q = tdisk->queue;
-
-	lockdep_assert_held(&nvm_lock);
-
-	del_gendisk(tdisk);
-	blk_cleanup_queue(q);
-
-	if (tt->exit)
-		tt->exit(tdisk->private_data);
-
-	put_disk(tdisk);
-
-	list_del(&t->list);
-	kfree(t);
-}
-
 static int __nvm_configure_create(struct nvm_ioctl_create *create)
 {
 	struct nvm_dev *dev;
@@ -1231,10 +1251,7 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
 		return -EINVAL;
 	}
 
-	if (dev->mt) {
-		dev->mt->unregister_mgr(dev);
-		dev->mt = NULL;
-	}
+	nvm_free_mgr(dev);
 
 	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT)
 		return nvm_dev_factory(dev, fact.flags);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 0e8e01956325..cde31ffe2d62 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -200,6 +200,7 @@ struct nvm_id {
 
 struct nvm_target {
 	struct list_head list;
+	struct nvm_dev *dev;
 	struct nvm_tgt_type *type;
 	struct gendisk *disk;
 };
-- 
cgit v1.2.3


From 04a8aa173bd9410849526a80dcf733da9c3e142d Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:18 +0200
Subject: lightnvm: expose gennvm_mark_blk to targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Targets can update a block state when having a reference to an
in-memory virtual block. In the case that a target does not keep the
block metadata in memory, it does not have a way to update this
structure.

Therefore, expose gennvm_mark_blk() through the media managers
->mark_blk() callback and let targets update the state structure through
this callback.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 27 ++++++++++++++-------------
 include/linux/lightnvm.h  |  4 ++++
 2 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 61790aebb8da..33a66d8c039d 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -407,29 +407,28 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
 	spin_unlock(&vlun->lock);
 }
 
-static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa,
-								int type)
+static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 {
 	struct gen_nvm *gn = dev->mp;
 	struct gen_lun *lun;
 	struct nvm_block *blk;
 
 	pr_debug("gennvm: ppa  (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
-			ppa->g.ch, ppa->g.lun, ppa->g.blk, ppa->g.pg, type);
+			ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
 
-	if (unlikely(ppa->g.ch > dev->nr_chnls ||
-					ppa->g.lun > dev->luns_per_chnl ||
-					ppa->g.blk > dev->blks_per_lun)) {
+	if (unlikely(ppa.g.ch > dev->nr_chnls ||
+					ppa.g.lun > dev->luns_per_chnl ||
+					ppa.g.blk > dev->blks_per_lun)) {
 		WARN_ON_ONCE(1);
 		pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
-				ppa->g.ch, dev->nr_chnls,
-				ppa->g.lun, dev->luns_per_chnl,
-				ppa->g.blk, dev->blks_per_lun);
+				ppa.g.ch, dev->nr_chnls,
+				ppa.g.lun, dev->luns_per_chnl,
+				ppa.g.blk, dev->blks_per_lun);
 		return;
 	}
 
-	lun = &gn->luns[ppa->g.lun * ppa->g.ch];
-	blk = &lun->vlun.blocks[ppa->g.blk];
+	lun = &gn->luns[ppa.g.lun * ppa.g.ch];
+	blk = &lun->vlun.blocks[ppa.g.blk];
 
 	/* will be moved to bb list on put_blk from target */
 	blk->state = type;
@@ -448,12 +447,12 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
 
 	/* look up blocks and mark them as bad */
 	if (rqd->nr_pages == 1) {
-		gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD);
+		gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
 		return;
 	}
 
 	while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs)
-		gennvm_blk_set_type(dev, &rqd->ppa_list[bit], NVM_BLK_ST_BAD);
+		gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
 }
 
 static void gennvm_end_io(struct nvm_rq *rqd)
@@ -544,6 +543,8 @@ static struct nvmm_type gennvm = {
 	.submit_io		= gennvm_submit_io,
 	.erase_blk		= gennvm_erase_blk,
 
+	.mark_blk		= gennvm_mark_blk,
+
 	.get_lun		= gennvm_get_lun,
 	.reserve_lun		= gennvm_reserve_lun,
 	.release_lun		= gennvm_release_lun,
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index cde31ffe2d62..3c53911e5758 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -467,6 +467,7 @@ typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
 								unsigned long);
+typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
 typedef int (nvmm_reserve_lun)(struct nvm_dev *, int);
 typedef void (nvmm_release_lun)(struct nvm_dev *, int);
@@ -494,6 +495,9 @@ struct nvmm_type {
 	nvmm_submit_io_fn *submit_io;
 	nvmm_erase_blk_fn *erase_blk;
 
+	/* Bad block mgmt */
+	nvmm_mark_blk_fn *mark_blk;
+
 	/* Configuration management */
 	nvmm_get_lun_fn *get_lun;
 	nvmm_reserve_lun *reserve_lun;
-- 
cgit v1.2.3


From df414b33bb1eb3a0ae52ccd4ecfec9323a4f89dc Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Fri, 6 May 2016 20:03:19 +0200
Subject: lightnvm: add is_cached entry to struct ppa_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A target requires a method to identify PPAs that are either cached in
memory or on disk. This can efficiently be maintained within the PPA.
The target host-side translation table can then lookup a PPA and know
from the PPA if it is cached or on disk. In the case it is cached, it is
the responsibility of the target to maintain this cache.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/lightnvm.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3c53911e5758..3a810d7bd25e 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -18,7 +18,7 @@ enum {
 #define NVM_SEC_BITS (8)
 #define NVM_PL_BITS  (8)
 #define NVM_LUN_BITS (8)
-#define NVM_CH_BITS  (8)
+#define NVM_CH_BITS  (7)
 
 struct ppa_addr {
 	/* Generic structure for all addresses */
@@ -30,8 +30,14 @@ struct ppa_addr {
 			u64 pl		: NVM_PL_BITS;
 			u64 lun		: NVM_LUN_BITS;
 			u64 ch		: NVM_CH_BITS;
+			u64 reserved	: 1;
 		} g;
 
+		struct {
+			u64 line	: 63;
+			u64 is_cached	: 1;
+		} c;
+
 		u64 ppa;
 	};
 };
-- 
cgit v1.2.3


From 6d5be9590b5e15124e3c8b319c8d7ce01abcf07d Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Fri, 6 May 2016 20:03:20 +0200
Subject: lightnvm: rename nr_pages to nr_ppas on nvm_rq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The number of ppas contained on a request is not necessarily the number
of pages that it maps to neither on the target nor on the device side.
In order to avoid confusion, rename nr_pages to nr_ppas since it is what
the variable actually contains.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      | 16 ++++++++--------
 drivers/lightnvm/gennvm.c    |  2 +-
 drivers/lightnvm/rrpc.c      |  6 +++---
 drivers/lightnvm/rrpc.h      |  2 +-
 drivers/lightnvm/sysblk.c    |  2 +-
 drivers/nvme/host/lightnvm.c |  4 ++--
 include/linux/lightnvm.h     |  2 +-
 7 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index d3af77102e47..160c1a6838e1 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -226,8 +226,8 @@ void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	int i;
 
-	if (rqd->nr_pages > 1) {
-		for (i = 0; i < rqd->nr_pages; i++)
+	if (rqd->nr_ppas > 1) {
+		for (i = 0; i < rqd->nr_ppas; i++)
 			rqd->ppa_list[i] = dev_to_generic_addr(dev,
 							rqd->ppa_list[i]);
 	} else {
@@ -240,8 +240,8 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	int i;
 
-	if (rqd->nr_pages > 1) {
-		for (i = 0; i < rqd->nr_pages; i++)
+	if (rqd->nr_ppas > 1) {
+		for (i = 0; i < rqd->nr_ppas; i++)
 			rqd->ppa_list[i] = generic_to_dev_addr(dev,
 							rqd->ppa_list[i]);
 	} else {
@@ -256,13 +256,13 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 	int i, plane_cnt, pl_idx;
 
 	if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
-		rqd->nr_pages = nr_ppas;
+		rqd->nr_ppas = nr_ppas;
 		rqd->ppa_addr = ppas[0];
 
 		return 0;
 	}
 
-	rqd->nr_pages = nr_ppas;
+	rqd->nr_ppas = nr_ppas;
 	rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
 	if (!rqd->ppa_list) {
 		pr_err("nvm: failed to allocate dma memory\n");
@@ -274,7 +274,7 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 			rqd->ppa_list[i] = ppas[i];
 	} else {
 		plane_cnt = dev->plane_mode;
-		rqd->nr_pages *= plane_cnt;
+		rqd->nr_ppas *= plane_cnt;
 
 		for (i = 0; i < nr_ppas; i++) {
 			for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
@@ -395,7 +395,7 @@ int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list,
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	rqd.nr_pages = nr_ppas;
+	rqd.nr_ppas = nr_ppas;
 	if (nr_ppas > 1)
 		rqd.ppa_list = ppa_list;
 	else
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 33a66d8c039d..ec9fb6876e38 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -446,7 +446,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
 	nvm_addr_to_generic_mode(dev, rqd);
 
 	/* look up blocks and mark them as bad */
-	if (rqd->nr_pages == 1) {
+	if (rqd->nr_ppas == 1) {
 		gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
 		return;
 	}
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 48862ead7c99..72aca96f467b 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -695,7 +695,7 @@ static void rrpc_end_io(struct nvm_rq *rqd)
 {
 	struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
 	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
-	uint8_t npages = rqd->nr_pages;
+	uint8_t npages = rqd->nr_ppas;
 	sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
 
 	if (bio_data_dir(rqd->bio) == WRITE)
@@ -883,7 +883,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 	bio_get(bio);
 	rqd->bio = bio;
 	rqd->ins = &rrpc->instance;
-	rqd->nr_pages = nr_pages;
+	rqd->nr_ppas = nr_pages;
 	rrq->flags = flags;
 
 	err = nvm_submit_io(rrpc->dev, rqd);
@@ -892,7 +892,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 		bio_put(bio);
 		if (!(flags & NVM_IOTYPE_GC)) {
 			rrpc_unlock_rq(rrpc, rqd);
-			if (rqd->nr_pages > 1)
+			if (rqd->nr_ppas > 1)
 				nvm_dev_dma_free(rrpc->dev,
 			rqd->ppa_list, rqd->dma_ppa_list);
 		}
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 2653484a3b40..87e84b5fc1cc 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -251,7 +251,7 @@ static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
 static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
 {
 	struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
-	uint8_t pages = rqd->nr_pages;
+	uint8_t pages = rqd->nr_ppas;
 
 	BUG_ON((r->l_start + pages) > rrpc->nr_sects);
 
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index b98ca1998da5..994697ac786e 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -280,7 +280,7 @@ static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
 	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1);
 	nvm_generic_to_addr_mode(dev, &rqd);
 
-	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_pages, type);
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
 	nvm_free_rqd_ppalist(dev, &rqd);
 	if (ret) {
 		pr_err("nvm: sysblk failed bb mark\n");
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 65de1e56e59a..a0af0558354c 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -471,7 +471,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 	c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
 	c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
 	c->ph_rw.control = cpu_to_le16(rqd->flags);
-	c->ph_rw.length = cpu_to_le16(rqd->nr_pages - 1);
+	c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
 
 	if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
 		c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
@@ -542,7 +542,7 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
 	c.erase.opcode = NVM_OP_ERASE;
 	c.erase.nsid = cpu_to_le32(ns->ns_id);
 	c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
-	c.erase.length = cpu_to_le16(rqd->nr_pages - 1);
+	c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
 
 	return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
 }
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3a810d7bd25e..b2991c724640 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -244,7 +244,7 @@ struct nvm_rq {
 	nvm_end_io_fn *end_io;
 
 	uint8_t opcode;
-	uint16_t nr_pages;
+	uint16_t nr_ppas;
 	uint16_t flags;
 
 	u64 ppa_status; /* ppa media status */
-- 
cgit v1.2.3


From 116f7d4a21fe450efc652c4850eb27cda36c9db0 Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Fri, 6 May 2016 20:03:21 +0200
Subject: lightnvm: reserved space calculation incorrect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nvm_dev->max_pages_per_blk variable was removed in favor of the new
nvm->sec_per_blk variable. The ->max_pages_per_blk variable was still
used in rrpc_capacity, reporting the reserved capacity to zero. Replace
with ->sec_per_blk to calculate the reserved area again.

Signed-off-by: Javier González <javier@cnexlabs.com>
Updated patch description. Was "lightnvm: eliminate redundant variable"
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/rrpc.c  | 2 +-
 include/linux/lightnvm.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 72aca96f467b..2103e97a974f 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1264,7 +1264,7 @@ static sector_t rrpc_capacity(void *private)
 	sector_t reserved, provisioned;
 
 	/* cur, gc, and two emergency blocks for each lun */
-	reserved = rrpc->nr_luns * dev->max_pages_per_blk * 4;
+	reserved = rrpc->nr_luns * dev->sec_per_blk * 4;
 	provisioned = rrpc->nr_sects - reserved;
 
 	if (reserved > rrpc->nr_sects) {
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index b2991c724640..ef2c7d2e76c4 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -351,7 +351,6 @@ struct nvm_dev {
 	unsigned long total_blocks;
 	unsigned long total_secs;
 	int nr_luns;
-	unsigned max_pages_per_blk;
 
 	unsigned long *lun_map;
 	void *dma_pool;
-- 
cgit v1.2.3


From db58ba45920255e967cc1d62a430cebd634b5046 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:12 -0700
Subject: bpf: wire in data and data_end for cls_act_bpf

allow cls_bpf and act_bpf programs access skb->data and skb->data_end pointers.
The bpf helpers that change skb->data need to update data_end pointer as well.
The verifier checks that programs always reload data, data_end pointers
after calls to such bpf helpers.
We cannot add 'data_end' pointer to struct qdisc_skb_cb directly,
since it's embedded as-is by infiniband ipoib, so wrapper struct is needed.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 16 ++++++++++++++++
 net/core/filter.c      | 51 ++++++++++++++++++++++++++++++++++++++++++++------
 net/sched/act_bpf.c    |  2 ++
 net/sched/cls_bpf.c    |  2 ++
 4 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43aa1f8855c7..ec1411c89105 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -352,6 +352,22 @@ struct sk_filter {
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
+struct bpf_skb_data_end {
+	struct qdisc_skb_cb qdisc_cb;
+	void *data_end;
+};
+
+/* compute the linear packet data range [data, data_end) which
+ * will be accessed by cls_bpf and act_bpf programs
+ */
+static inline void bpf_compute_data_end(struct sk_buff *skb)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
+	cb->data_end = skb->data + skb_headlen(skb);
+}
+
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
diff --git a/net/core/filter.c b/net/core/filter.c
index 218e5de8c402..71c2a1f473ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1344,6 +1344,21 @@ struct bpf_scratchpad {
 
 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+					unsigned int write_len)
+{
+	int err;
+
+	if (!skb_cloned(skb))
+		return 0;
+	if (skb_clone_writable(skb, write_len))
+		return 0;
+	err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+	if (!err)
+		bpf_compute_data_end(skb);
+	return err;
+}
+
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 {
 	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
@@ -1366,7 +1381,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 */
 	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + len)))
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, sp->buff);
@@ -1444,7 +1459,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1499,7 +1514,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
+	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1699,12 +1714,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	__be16 vlan_proto = (__force __be16) r2;
+	int ret;
 
 	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
 		     vlan_proto != htons(ETH_P_8021AD)))
 		vlan_proto = htons(ETH_P_8021Q);
 
-	return skb_vlan_push(skb, vlan_proto, vlan_tci);
+	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+	bpf_compute_data_end(skb);
+	return ret;
 }
 
 const struct bpf_func_proto bpf_skb_vlan_push_proto = {
@@ -1720,8 +1738,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
 static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	int ret;
 
-	return skb_vlan_pop(skb);
+	ret = skb_vlan_pop(skb);
+	bpf_compute_data_end(skb);
+	return ret;
 }
 
 const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
@@ -2066,8 +2087,12 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type)
 {
-	if (off == offsetof(struct __sk_buff, tc_classid))
+	switch (off) {
+	case offsetof(struct __sk_buff, tc_classid):
+	case offsetof(struct __sk_buff, data):
+	case offsetof(struct __sk_buff, data_end):
 		return false;
+	}
 
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -2215,6 +2240,20 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
 		break;
 
+	case offsetof(struct __sk_buff, data):
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+				      dst_reg, src_reg,
+				      offsetof(struct sk_buff, data));
+		break;
+
+	case offsetof(struct __sk_buff, data_end):
+		ctx_off -= offsetof(struct __sk_buff, data_end);
+		ctx_off += offsetof(struct sk_buff, cb);
+		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
+				      dst_reg, src_reg, ctx_off);
+		break;
+
 	case offsetof(struct __sk_buff, tc_index):
 #ifdef CONFIG_NET_SCHED
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 4fd703362563..c7123e01c2ca 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 425fe6a0eda3..7b342c779da7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
-- 
cgit v1.2.3


From b5a7aef1ef436ec005fef0efe31a676ec5f4ab31 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 4 May 2016 22:05:01 -0700
Subject: fscrypto/f2fs: allow fs-specific key prefix for fs encryption

This patch allows fscrypto to handle a second key prefix given by filesystem.
The main reason is to provide backward compatibility, since previously f2fs
used "f2fs:" as a crypto prefix instead of "fscrypt:".
Later, ext4 should also provide key_prefix() to give "ext4:".

One concern decribed by Ted would be kinda double check overhead of prefixes.
In x86, for example, validate_user_key consumes 8 ms after boot-up, which turns
out derive_key_aes() consumed most of the time to load specific crypto module.
After such the cold miss, it shows almost zero latencies, which treats as a
negligible overhead.
Note that request_key() detects wrong prefix in prior to derive_key_aes() even.

Cc: Ted Tso <tytso@mit.edu>
Cc: stable@vger.kernel.org # v4.6
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/crypto/keyinfo.c      | 120 ++++++++++++++++++++++++++++++-----------------
 fs/f2fs/f2fs.h           |   8 ++++
 fs/f2fs/super.c          |  13 +++++
 include/linux/fscrypto.h |   1 +
 4 files changed, 98 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 06f5aa478bf2..1ac263eddc4e 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -78,6 +78,67 @@ out:
 	return res;
 }
 
+static int validate_user_key(struct fscrypt_info *crypt_info,
+			struct fscrypt_context *ctx, u8 *raw_key,
+			u8 *prefix, int prefix_size)
+{
+	u8 *full_key_descriptor;
+	struct key *keyring_key;
+	struct fscrypt_key *master_key;
+	const struct user_key_payload *ukp;
+	int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
+	int res;
+
+	full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
+	if (!full_key_descriptor)
+		return -ENOMEM;
+
+	memcpy(full_key_descriptor, prefix, prefix_size);
+	sprintf(full_key_descriptor + prefix_size,
+			"%*phN", FS_KEY_DESCRIPTOR_SIZE,
+			ctx->master_key_descriptor);
+	full_key_descriptor[full_key_len - 1] = '\0';
+	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+	kfree(full_key_descriptor);
+	if (IS_ERR(keyring_key))
+		return PTR_ERR(keyring_key);
+
+	if (keyring_key->type != &key_type_logon) {
+		printk_once(KERN_WARNING
+				"%s: key type must be logon\n", __func__);
+		res = -ENOKEY;
+		goto out;
+	}
+	down_read(&keyring_key->sem);
+	ukp = user_key_payload(keyring_key);
+	if (ukp->datalen != sizeof(struct fscrypt_key)) {
+		res = -EINVAL;
+		up_read(&keyring_key->sem);
+		goto out;
+	}
+	master_key = (struct fscrypt_key *)ukp->data;
+	BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+	if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+		printk_once(KERN_WARNING
+				"%s: key size incorrect: %d\n",
+				__func__, master_key->size);
+		res = -ENOKEY;
+		up_read(&keyring_key->sem);
+		goto out;
+	}
+	res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+	up_read(&keyring_key->sem);
+	if (res)
+		goto out;
+
+	crypt_info->ci_keyring_key = keyring_key;
+	return 0;
+out:
+	key_put(keyring_key);
+	return res;
+}
+
 static void put_crypt_info(struct fscrypt_info *ci)
 {
 	if (!ci)
@@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
 int get_crypt_info(struct inode *inode)
 {
 	struct fscrypt_info *crypt_info;
-	u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
-				(FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
-	struct key *keyring_key = NULL;
-	struct fscrypt_key *master_key;
 	struct fscrypt_context ctx;
-	const struct user_key_payload *ukp;
 	struct crypto_skcipher *ctfm;
 	const char *cipher_str;
 	u8 raw_key[FS_MAX_KEY_SIZE];
@@ -167,48 +223,24 @@ retry:
 		memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
 		goto got_key;
 	}
-	memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
-					FS_KEY_DESC_PREFIX_SIZE);
-	sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
-					"%*phN", FS_KEY_DESCRIPTOR_SIZE,
-					ctx.master_key_descriptor);
-	full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
-					(2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
-	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
-	if (IS_ERR(keyring_key)) {
-		res = PTR_ERR(keyring_key);
-		keyring_key = NULL;
-		goto out;
-	}
-	crypt_info->ci_keyring_key = keyring_key;
-	if (keyring_key->type != &key_type_logon) {
-		printk_once(KERN_WARNING
-				"%s: key type must be logon\n", __func__);
-		res = -ENOKEY;
-		goto out;
-	}
-	down_read(&keyring_key->sem);
-	ukp = user_key_payload(keyring_key);
-	if (ukp->datalen != sizeof(struct fscrypt_key)) {
-		res = -EINVAL;
-		up_read(&keyring_key->sem);
-		goto out;
-	}
-	master_key = (struct fscrypt_key *)ukp->data;
-	BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
 
-	if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
-		printk_once(KERN_WARNING
-				"%s: key size incorrect: %d\n",
-				__func__, master_key->size);
-		res = -ENOKEY;
-		up_read(&keyring_key->sem);
+	res = validate_user_key(crypt_info, &ctx, raw_key,
+			FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
+	if (res && inode->i_sb->s_cop->key_prefix) {
+		u8 *prefix = NULL;
+		int prefix_size, res2;
+
+		prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
+		res2 = validate_user_key(crypt_info, &ctx, raw_key,
+							prefix, prefix_size);
+		if (res2) {
+			if (res2 == -ENOKEY)
+				res = -ENOKEY;
+			goto out;
+		}
+	} else if (res) {
 		goto out;
 	}
-	res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
-	up_read(&keyring_key->sem);
-	if (res)
-		goto out;
 got_key:
 	ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
 	if (!ctfm || IS_ERR(ctfm)) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ccf8bf4debfc..dbd277eb9da7 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -711,6 +711,10 @@ enum {
 	MAX_TIME,
 };
 
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#define F2FS_KEY_DESC_PREFIX "f2fs:"
+#define F2FS_KEY_DESC_PREFIX_SIZE 5
+#endif
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
@@ -718,6 +722,10 @@ struct f2fs_sb_info {
 	int valid_super_block;			/* valid super block no */
 	int s_flag;				/* flags for sbi */
 
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
+	u8 key_prefix_size;
+#endif
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
 	struct inode *node_inode;		/* cache node blocks */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8a28f799a24f..28c8992da6f6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -964,6 +964,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
 				ctx, len, NULL);
 }
 
+static int f2fs_key_prefix(struct inode *inode, u8 **key)
+{
+	*key = F2FS_I_SB(inode)->key_prefix;
+	return F2FS_I_SB(inode)->key_prefix_size;
+}
+
 static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
@@ -980,6 +986,7 @@ static unsigned f2fs_max_namelen(struct inode *inode)
 
 static struct fscrypt_operations f2fs_cryptops = {
 	.get_context	= f2fs_get_context,
+	.key_prefix	= f2fs_key_prefix,
 	.set_context	= f2fs_set_context,
 	.is_encrypted	= f2fs_encrypted_inode,
 	.empty_dir	= f2fs_empty_dir,
@@ -1305,6 +1312,12 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 
 	INIT_LIST_HEAD(&sbi->s_list);
 	mutex_init(&sbi->umount_mutex);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
+				F2FS_KEY_DESC_PREFIX_SIZE);
+	sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
+#endif
 }
 
 /*
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 6027f6bbb061..cfa6cde25f8e 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -175,6 +175,7 @@ struct fscrypt_name {
  */
 struct fscrypt_operations {
 	int (*get_context)(struct inode *, void *, size_t);
+	int (*key_prefix)(struct inode *, u8 **);
 	int (*prepare_context)(struct inode *);
 	int (*set_context)(struct inode *, const void *, size_t, void *);
 	int (*dummy_context)(struct inode *);
-- 
cgit v1.2.3


From 824815c4cd13e384ef5a4be725ec4b06e4ad2c2a Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Thu, 31 Mar 2016 16:23:51 -0400
Subject: reservation: add reservation_object_get_excl_rcu()

In the atomic modesetting path, each driver simply wants to grab a ref
to the exclusive fence from a reservation object to store in the incoming
drm_plane_state, without doing the whole RCU dance.  Since each driver
will need to do this, lets make a helper.

v2: rename to _rcu instead of _unlocked to be more consistent

Signed-off-by: Rob Clark <robdclark@gmail.com>
Acked-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 include/linux/reservation.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index 5a0b64cf68b4..49d057655d62 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -120,6 +120,24 @@ reservation_object_get_excl(struct reservation_object *obj)
 					 reservation_object_held(obj));
 }
 
+static inline struct fence *
+reservation_object_get_excl_rcu(struct reservation_object *obj)
+{
+	struct fence *fence;
+	unsigned seq;
+retry:
+	seq = read_seqcount_begin(&obj->seq);
+	rcu_read_lock();
+	fence = rcu_dereference(obj->fence_excl);
+	if (read_seqcount_retry(&obj->seq, seq)) {
+		rcu_read_unlock();
+		goto retry;
+	}
+	fence = fence_get(fence);
+	rcu_read_unlock();
+	return fence;
+}
+
 int reservation_object_reserve_shared(struct reservation_object *obj);
 void reservation_object_add_shared_fence(struct reservation_object *obj,
 					 struct fence *fence);
-- 
cgit v1.2.3


From 43315f31adc2bf3b35e04dcf2372c3bb08014ed1 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 6 May 2016 07:09:07 -0700
Subject: soc: qcom: smd: Introduce compile stubs

Introduce compile stubs for the SMD API, allowing consumers to be
compile tested.

Acked-by: Andy Gross <andy.gross@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/soc/qcom/smd.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index d0cb6d189a0a..46a984f5e3a3 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -45,13 +45,39 @@ struct qcom_smd_driver {
 	int (*callback)(struct qcom_smd_device *, const void *, size_t);
 };
 
+#if IS_ENABLED(CONFIG_QCOM_SMD)
+
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
 void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
+int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
+
+#else
+
+static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv)
+{
+	return -ENXIO;
+}
+
+static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+}
+
+static inline int qcom_smd_send(struct qcom_smd_channel *channel,
+				const void *data, int len)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+	return -ENXIO;
+}
+
+#endif
+
 #define module_qcom_smd_driver(__smd_driver) \
 	module_driver(__smd_driver, qcom_smd_driver_register, \
 		      qcom_smd_driver_unregister)
 
-int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
 #endif
-- 
cgit v1.2.3


From bdabad3e363d825ddf9679dd431cca0b2c30f881 Mon Sep 17 00:00:00 2001
From: Courtney Cavin <courtney.cavin@sonymobile.com>
Date: Fri, 6 May 2016 07:09:08 -0700
Subject: net: Add Qualcomm IPC router

Add an implementation of Qualcomm's IPC router protocol, used to
communicate with service providing remote processors.

Signed-off-by: Courtney Cavin <courtney.cavin@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
[bjorn: Cope with 0 being a valid node id and implement RTM_NEWADDR]
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h    |    4 +-
 include/uapi/linux/qrtr.h |   12 +
 net/Kconfig               |    1 +
 net/Makefile              |    1 +
 net/qrtr/Kconfig          |   24 ++
 net/qrtr/Makefile         |    2 +
 net/qrtr/qrtr.c           | 1007 +++++++++++++++++++++++++++++++++++++++++++++
 net/qrtr/qrtr.h           |   31 ++
 net/qrtr/smd.c            |  117 ++++++
 9 files changed, 1198 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/qrtr.h
 create mode 100644 net/qrtr/Kconfig
 create mode 100644 net/qrtr/Makefile
 create mode 100644 net/qrtr/qrtr.c
 create mode 100644 net/qrtr/qrtr.h
 create mode 100644 net/qrtr/smd.c

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 73bf6c6a833b..b5cc5a6d7011 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -201,8 +201,9 @@ struct ucred {
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
 #define AF_KCM		41	/* Kernel Connection Multiplexor*/
+#define AF_QIPCRTR	42	/* Qualcomm IPC Router          */
 
-#define AF_MAX		42	/* For now.. */
+#define AF_MAX		43	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -249,6 +250,7 @@ struct ucred {
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
 #define PF_KCM		AF_KCM
+#define PF_QIPCRTR	AF_QIPCRTR
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
new file mode 100644
index 000000000000..66c0748d26e2
--- /dev/null
+++ b/include/uapi/linux/qrtr.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_QRTR_H
+#define _LINUX_QRTR_H
+
+#include <linux/socket.h>
+
+struct sockaddr_qrtr {
+	__kernel_sa_family_t sq_family;
+	__u32 sq_node;
+	__u32 sq_port;
+};
+
+#endif /* _LINUX_QRTR_H */
diff --git a/net/Kconfig b/net/Kconfig
index a8934d8c8fda..b841c42e5c9b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -236,6 +236,7 @@ source "net/mpls/Kconfig"
 source "net/hsr/Kconfig"
 source "net/switchdev/Kconfig"
 source "net/l3mdev/Kconfig"
+source "net/qrtr/Kconfig"
 
 config RPS
 	bool
diff --git a/net/Makefile b/net/Makefile
index 81d14119eab5..bdd14553a774 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -78,3 +78,4 @@ endif
 ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
 obj-y				+= l3mdev/
 endif
+obj-$(CONFIG_QRTR)		+= qrtr/
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
new file mode 100644
index 000000000000..673fd1f86ebe
--- /dev/null
+++ b/net/qrtr/Kconfig
@@ -0,0 +1,24 @@
+# Qualcomm IPC Router configuration
+#
+
+config QRTR
+	tristate "Qualcomm IPC Router support"
+	depends on ARCH_QCOM || COMPILE_TEST
+	---help---
+	  Say Y if you intend to use Qualcomm IPC router protocol.  The
+	  protocol is used to communicate with services provided by other
+	  hardware blocks in the system.
+
+	  In order to do service lookups, a userspace daemon is required to
+	  maintain a service listing.
+
+if QRTR
+
+config QRTR_SMD
+	tristate "SMD IPC Router channels"
+	depends on QCOM_SMD || COMPILE_TEST
+	---help---
+	  Say Y here to support SMD based ipcrouter channels.  SMD is the
+	  most common transport for IPC Router.
+
+endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
new file mode 100644
index 000000000000..6c00dc623b7e
--- /dev/null
+++ b/net/qrtr/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_QRTR) := qrtr.o
+obj-$(CONFIG_QRTR_SMD) += smd.o
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
new file mode 100644
index 000000000000..c985ecbe9bd6
--- /dev/null
+++ b/net/qrtr/qrtr.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/qrtr.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+
+#include <net/sock.h>
+
+#include "qrtr.h"
+
+#define QRTR_PROTO_VER 1
+
+/* auto-bind range */
+#define QRTR_MIN_EPH_SOCKET 0x4000
+#define QRTR_MAX_EPH_SOCKET 0x7fff
+
+enum qrtr_pkt_type {
+	QRTR_TYPE_DATA		= 1,
+	QRTR_TYPE_HELLO		= 2,
+	QRTR_TYPE_BYE		= 3,
+	QRTR_TYPE_NEW_SERVER	= 4,
+	QRTR_TYPE_DEL_SERVER	= 5,
+	QRTR_TYPE_DEL_CLIENT	= 6,
+	QRTR_TYPE_RESUME_TX	= 7,
+	QRTR_TYPE_EXIT		= 8,
+	QRTR_TYPE_PING		= 9,
+};
+
+/**
+ * struct qrtr_hdr - (I|R)PCrouter packet header
+ * @version: protocol version
+ * @type: packet type; one of QRTR_TYPE_*
+ * @src_node_id: source node
+ * @src_port_id: source port
+ * @confirm_rx: boolean; whether a resume-tx packet should be send in reply
+ * @size: length of packet, excluding this header
+ * @dst_node_id: destination node
+ * @dst_port_id: destination port
+ */
+struct qrtr_hdr {
+	__le32 version;
+	__le32 type;
+	__le32 src_node_id;
+	__le32 src_port_id;
+	__le32 confirm_rx;
+	__le32 size;
+	__le32 dst_node_id;
+	__le32 dst_port_id;
+} __packed;
+
+#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr)
+#define QRTR_NODE_BCAST ((unsigned int)-1)
+#define QRTR_PORT_CTRL ((unsigned int)-2)
+
+struct qrtr_sock {
+	/* WARNING: sk must be the first member */
+	struct sock sk;
+	struct sockaddr_qrtr us;
+	struct sockaddr_qrtr peer;
+};
+
+static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
+{
+	BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0);
+	return container_of(sk, struct qrtr_sock, sk);
+}
+
+static unsigned int qrtr_local_nid = -1;
+
+/* for node ids */
+static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
+/* broadcast list */
+static LIST_HEAD(qrtr_all_nodes);
+/* lock for qrtr_nodes, qrtr_all_nodes and node reference */
+static DEFINE_MUTEX(qrtr_node_lock);
+
+/* local port allocation management */
+static DEFINE_IDR(qrtr_ports);
+static DEFINE_MUTEX(qrtr_port_lock);
+
+/**
+ * struct qrtr_node - endpoint node
+ * @ep_lock: lock for endpoint management and callbacks
+ * @ep: endpoint
+ * @ref: reference count for node
+ * @nid: node id
+ * @rx_queue: receive queue
+ * @work: scheduled work struct for recv work
+ * @item: list item for broadcast list
+ */
+struct qrtr_node {
+	struct mutex ep_lock;
+	struct qrtr_endpoint *ep;
+	struct kref ref;
+	unsigned int nid;
+
+	struct sk_buff_head rx_queue;
+	struct work_struct work;
+	struct list_head item;
+};
+
+/* Release node resources and free the node.
+ *
+ * Do not call directly, use qrtr_node_release.  To be used with
+ * kref_put_mutex.  As such, the node mutex is expected to be locked on call.
+ */
+static void __qrtr_node_release(struct kref *kref)
+{
+	struct qrtr_node *node = container_of(kref, struct qrtr_node, ref);
+
+	if (node->nid != QRTR_EP_NID_AUTO)
+		radix_tree_delete(&qrtr_nodes, node->nid);
+
+	list_del(&node->item);
+	mutex_unlock(&qrtr_node_lock);
+
+	skb_queue_purge(&node->rx_queue);
+	kfree(node);
+}
+
+/* Increment reference to node. */
+static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node)
+{
+	if (node)
+		kref_get(&node->ref);
+	return node;
+}
+
+/* Decrement reference to node and release as necessary. */
+static void qrtr_node_release(struct qrtr_node *node)
+{
+	if (!node)
+		return;
+	kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock);
+}
+
+/* Pass an outgoing packet socket buffer to the endpoint driver. */
+static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	int rc = -ENODEV;
+
+	mutex_lock(&node->ep_lock);
+	if (node->ep)
+		rc = node->ep->xmit(node->ep, skb);
+	else
+		kfree_skb(skb);
+	mutex_unlock(&node->ep_lock);
+
+	return rc;
+}
+
+/* Lookup node by id.
+ *
+ * callers must release with qrtr_node_release()
+ */
+static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
+{
+	struct qrtr_node *node;
+
+	mutex_lock(&qrtr_node_lock);
+	node = radix_tree_lookup(&qrtr_nodes, nid);
+	node = qrtr_node_acquire(node);
+	mutex_unlock(&qrtr_node_lock);
+
+	return node;
+}
+
+/* Assign node id to node.
+ *
+ * This is mostly useful for automatic node id assignment, based on
+ * the source id in the incoming packet.
+ */
+static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid)
+{
+	if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO)
+		return;
+
+	mutex_lock(&qrtr_node_lock);
+	radix_tree_insert(&qrtr_nodes, nid, node);
+	node->nid = nid;
+	mutex_unlock(&qrtr_node_lock);
+}
+
+/**
+ * qrtr_endpoint_post() - post incoming data
+ * @ep: endpoint handle
+ * @data: data pointer
+ * @len: size of data in bytes
+ *
+ * Return: 0 on success; negative error code on failure
+ */
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
+{
+	struct qrtr_node *node = ep->node;
+	const struct qrtr_hdr *phdr = data;
+	struct sk_buff *skb;
+	unsigned int psize;
+	unsigned int size;
+	unsigned int type;
+	unsigned int ver;
+	unsigned int dst;
+
+	if (len < QRTR_HDR_SIZE || len & 3)
+		return -EINVAL;
+
+	ver = le32_to_cpu(phdr->version);
+	size = le32_to_cpu(phdr->size);
+	type = le32_to_cpu(phdr->type);
+	dst = le32_to_cpu(phdr->dst_port_id);
+
+	psize = (size + 3) & ~3;
+
+	if (ver != QRTR_PROTO_VER)
+		return -EINVAL;
+
+	if (len != psize + QRTR_HDR_SIZE)
+		return -EINVAL;
+
+	if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA)
+		return -EINVAL;
+
+	skb = netdev_alloc_skb(NULL, len);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reset_transport_header(skb);
+	memcpy(skb_put(skb, len), data, len);
+
+	skb_queue_tail(&node->rx_queue, skb);
+	schedule_work(&node->work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_post);
+
+/* Allocate and construct a resume-tx packet. */
+static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
+					    u32 dst_node, u32 port)
+{
+	const int pkt_len = 20;
+	struct qrtr_hdr *hdr;
+	struct sk_buff *skb;
+	u32 *buf;
+
+	skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+	skb_reset_transport_header(skb);
+
+	hdr = (struct qrtr_hdr *)skb_put(skb, QRTR_HDR_SIZE);
+	hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+	hdr->type = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+	hdr->src_node_id = cpu_to_le32(src_node);
+	hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+	hdr->confirm_rx = cpu_to_le32(0);
+	hdr->size = cpu_to_le32(pkt_len);
+	hdr->dst_node_id = cpu_to_le32(dst_node);
+	hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
+
+	buf = (u32 *)skb_put(skb, pkt_len);
+	memset(buf, 0, pkt_len);
+	buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+	buf[1] = cpu_to_le32(src_node);
+	buf[2] = cpu_to_le32(port);
+
+	return skb;
+}
+
+static struct qrtr_sock *qrtr_port_lookup(int port);
+static void qrtr_port_put(struct qrtr_sock *ipc);
+
+/* Handle and route a received packet.
+ *
+ * This will auto-reply with resume-tx packet as necessary.
+ */
+static void qrtr_node_rx_work(struct work_struct *work)
+{
+	struct qrtr_node *node = container_of(work, struct qrtr_node, work);
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&node->rx_queue)) != NULL) {
+		const struct qrtr_hdr *phdr;
+		u32 dst_node, dst_port;
+		struct qrtr_sock *ipc;
+		u32 src_node;
+		int confirm;
+
+		phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+		src_node = le32_to_cpu(phdr->src_node_id);
+		dst_node = le32_to_cpu(phdr->dst_node_id);
+		dst_port = le32_to_cpu(phdr->dst_port_id);
+		confirm = !!phdr->confirm_rx;
+
+		qrtr_node_assign(node, src_node);
+
+		ipc = qrtr_port_lookup(dst_port);
+		if (!ipc) {
+			kfree_skb(skb);
+		} else {
+			if (sock_queue_rcv_skb(&ipc->sk, skb))
+				kfree_skb(skb);
+
+			qrtr_port_put(ipc);
+		}
+
+		if (confirm) {
+			skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port);
+			if (!skb)
+				break;
+			if (qrtr_node_enqueue(node, skb))
+				break;
+		}
+	}
+}
+
+/**
+ * qrtr_endpoint_register() - register a new endpoint
+ * @ep: endpoint to register
+ * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment
+ * Return: 0 on success; negative error code on failure
+ *
+ * The specified endpoint must have the xmit function pointer set on call.
+ */
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid)
+{
+	struct qrtr_node *node;
+
+	if (!ep || !ep->xmit)
+		return -EINVAL;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	INIT_WORK(&node->work, qrtr_node_rx_work);
+	kref_init(&node->ref);
+	mutex_init(&node->ep_lock);
+	skb_queue_head_init(&node->rx_queue);
+	node->nid = QRTR_EP_NID_AUTO;
+	node->ep = ep;
+
+	qrtr_node_assign(node, nid);
+
+	mutex_lock(&qrtr_node_lock);
+	list_add(&node->item, &qrtr_all_nodes);
+	mutex_unlock(&qrtr_node_lock);
+	ep->node = node;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_register);
+
+/**
+ * qrtr_endpoint_unregister - unregister endpoint
+ * @ep: endpoint to unregister
+ */
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
+{
+	struct qrtr_node *node = ep->node;
+
+	mutex_lock(&node->ep_lock);
+	node->ep = NULL;
+	mutex_unlock(&node->ep_lock);
+
+	qrtr_node_release(node);
+	ep->node = NULL;
+}
+EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister);
+
+/* Lookup socket by port.
+ *
+ * Callers must release with qrtr_port_put()
+ */
+static struct qrtr_sock *qrtr_port_lookup(int port)
+{
+	struct qrtr_sock *ipc;
+
+	if (port == QRTR_PORT_CTRL)
+		port = 0;
+
+	mutex_lock(&qrtr_port_lock);
+	ipc = idr_find(&qrtr_ports, port);
+	if (ipc)
+		sock_hold(&ipc->sk);
+	mutex_unlock(&qrtr_port_lock);
+
+	return ipc;
+}
+
+/* Release acquired socket. */
+static void qrtr_port_put(struct qrtr_sock *ipc)
+{
+	sock_put(&ipc->sk);
+}
+
+/* Remove port assignment. */
+static void qrtr_port_remove(struct qrtr_sock *ipc)
+{
+	int port = ipc->us.sq_port;
+
+	if (port == QRTR_PORT_CTRL)
+		port = 0;
+
+	__sock_put(&ipc->sk);
+
+	mutex_lock(&qrtr_port_lock);
+	idr_remove(&qrtr_ports, port);
+	mutex_unlock(&qrtr_port_lock);
+}
+
+/* Assign port number to socket.
+ *
+ * Specify port in the integer pointed to by port, and it will be adjusted
+ * on return as necesssary.
+ *
+ * Port may be:
+ *   0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET]
+ *   <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN
+ *   >QRTR_MIN_EPH_SOCKET: Specified; available to all
+ */
+static int qrtr_port_assign(struct qrtr_sock *ipc, int *port)
+{
+	int rc;
+
+	mutex_lock(&qrtr_port_lock);
+	if (!*port) {
+		rc = idr_alloc(&qrtr_ports, ipc,
+			       QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1,
+			       GFP_ATOMIC);
+		if (rc >= 0)
+			*port = rc;
+	} else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) {
+		rc = -EACCES;
+	} else if (*port == QRTR_PORT_CTRL) {
+		rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC);
+	} else {
+		rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC);
+		if (rc >= 0)
+			*port = rc;
+	}
+	mutex_unlock(&qrtr_port_lock);
+
+	if (rc == -ENOSPC)
+		return -EADDRINUSE;
+	else if (rc < 0)
+		return rc;
+
+	sock_hold(&ipc->sk);
+
+	return 0;
+}
+
+/* Bind socket to address.
+ *
+ * Socket should be locked upon call.
+ */
+static int __qrtr_bind(struct socket *sock,
+		       const struct sockaddr_qrtr *addr, int zapped)
+{
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int port;
+	int rc;
+
+	/* rebinding ok */
+	if (!zapped && addr->sq_port == ipc->us.sq_port)
+		return 0;
+
+	port = addr->sq_port;
+	rc = qrtr_port_assign(ipc, &port);
+	if (rc)
+		return rc;
+
+	/* unbind previous, if any */
+	if (!zapped)
+		qrtr_port_remove(ipc);
+	ipc->us.sq_port = port;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	return 0;
+}
+
+/* Auto bind to an ephemeral port. */
+static int qrtr_autobind(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_qrtr addr;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return 0;
+
+	addr.sq_family = AF_QIPCRTR;
+	addr.sq_node = qrtr_local_nid;
+	addr.sq_port = 0;
+
+	return __qrtr_bind(sock, &addr, 1);
+}
+
+/* Bind socket to specified sockaddr. */
+static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int rc;
+
+	if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+		return -EINVAL;
+
+	if (addr->sq_node != ipc->us.sq_node)
+		return -EINVAL;
+
+	lock_sock(sk);
+	rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED));
+	release_sock(sk);
+
+	return rc;
+}
+
+/* Queue packet to local peer socket. */
+static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	const struct qrtr_hdr *phdr;
+	struct qrtr_sock *ipc;
+
+	phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+
+	ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id));
+	if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	if (sock_queue_rcv_skb(&ipc->sk, skb)) {
+		qrtr_port_put(ipc);
+		kfree_skb(skb);
+		return -ENOSPC;
+	}
+
+	qrtr_port_put(ipc);
+
+	return 0;
+}
+
+/* Queue packet for broadcast. */
+static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb)
+{
+	struct sk_buff *skbn;
+
+	mutex_lock(&qrtr_node_lock);
+	list_for_each_entry(node, &qrtr_all_nodes, item) {
+		skbn = skb_clone(skb, GFP_KERNEL);
+		if (!skbn)
+			break;
+		skb_set_owner_w(skbn, skb->sk);
+		qrtr_node_enqueue(node, skbn);
+	}
+	mutex_unlock(&qrtr_node_lock);
+
+	qrtr_local_enqueue(node, skb);
+
+	return 0;
+}
+
+static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+	int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	struct qrtr_node *node;
+	struct qrtr_hdr *hdr;
+	struct sk_buff *skb;
+	size_t plen;
+	int rc;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT))
+		return -EINVAL;
+
+	if (len > 65535)
+		return -EMSGSIZE;
+
+	lock_sock(sk);
+
+	if (addr) {
+		if (msg->msg_namelen < sizeof(*addr)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+
+		if (addr->sq_family != AF_QIPCRTR) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+
+		rc = qrtr_autobind(sock);
+		if (rc) {
+			release_sock(sk);
+			return rc;
+		}
+	} else if (sk->sk_state == TCP_ESTABLISHED) {
+		addr = &ipc->peer;
+	} else {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	node = NULL;
+	if (addr->sq_node == QRTR_NODE_BCAST) {
+		enqueue_fn = qrtr_bcast_enqueue;
+	} else if (addr->sq_node == ipc->us.sq_node) {
+		enqueue_fn = qrtr_local_enqueue;
+	} else {
+		enqueue_fn = qrtr_node_enqueue;
+		node = qrtr_node_lookup(addr->sq_node);
+		if (!node) {
+			release_sock(sk);
+			return -ECONNRESET;
+		}
+	}
+
+	plen = (len + 3) & ~3;
+	skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE,
+				  msg->msg_flags & MSG_DONTWAIT, &rc);
+	if (!skb)
+		goto out_node;
+
+	skb_reset_transport_header(skb);
+	skb_put(skb, len + QRTR_HDR_SIZE);
+
+	hdr = (struct qrtr_hdr *)skb_transport_header(skb);
+	hdr->version = cpu_to_le32(QRTR_PROTO_VER);
+	hdr->src_node_id = cpu_to_le32(ipc->us.sq_node);
+	hdr->src_port_id = cpu_to_le32(ipc->us.sq_port);
+	hdr->confirm_rx = cpu_to_le32(0);
+	hdr->size = cpu_to_le32(len);
+	hdr->dst_node_id = cpu_to_le32(addr->sq_node);
+	hdr->dst_port_id = cpu_to_le32(addr->sq_port);
+
+	rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE,
+					 &msg->msg_iter, len);
+	if (rc) {
+		kfree_skb(skb);
+		goto out_node;
+	}
+
+	if (plen != len) {
+		skb_pad(skb, plen - len);
+		skb_put(skb, plen - len);
+	}
+
+	if (ipc->us.sq_port == QRTR_PORT_CTRL) {
+		if (len < 4) {
+			rc = -EINVAL;
+			kfree_skb(skb);
+			goto out_node;
+		}
+
+		/* control messages already require the type as 'command' */
+		skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4);
+	} else {
+		hdr->type = cpu_to_le32(QRTR_TYPE_DATA);
+	}
+
+	rc = enqueue_fn(node, skb);
+	if (rc >= 0)
+		rc = len;
+
+out_node:
+	qrtr_node_release(node);
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t size, int flags)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name);
+	const struct qrtr_hdr *phdr;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int copied, rc;
+
+	lock_sock(sk);
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		release_sock(sk);
+		return -EADDRNOTAVAIL;
+	}
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &rc);
+	if (!skb) {
+		release_sock(sk);
+		return rc;
+	}
+
+	phdr = (const struct qrtr_hdr *)skb_transport_header(skb);
+	copied = le32_to_cpu(phdr->size);
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied);
+	if (rc < 0)
+		goto out;
+	rc = copied;
+
+	if (addr) {
+		addr->sq_family = AF_QIPCRTR;
+		addr->sq_node = le32_to_cpu(phdr->src_node_id);
+		addr->sq_port = le32_to_cpu(phdr->src_port_id);
+		msg->msg_namelen = sizeof(*addr);
+	}
+
+out:
+	skb_free_datagram(sk, skb);
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_connect(struct socket *sock, struct sockaddr *saddr,
+			int len, int flags)
+{
+	DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr);
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int rc;
+
+	if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	sk->sk_state = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	rc = qrtr_autobind(sock);
+	if (rc) {
+		release_sock(sk);
+		return rc;
+	}
+
+	ipc->peer = *addr;
+	sock->state = SS_CONNECTED;
+	sk->sk_state = TCP_ESTABLISHED;
+
+	release_sock(sk);
+
+	return 0;
+}
+
+static int qrtr_getname(struct socket *sock, struct sockaddr *saddr,
+			int *len, int peer)
+{
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sockaddr_qrtr qaddr;
+	struct sock *sk = sock->sk;
+
+	lock_sock(sk);
+	if (peer) {
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
+
+		qaddr = ipc->peer;
+	} else {
+		qaddr = ipc->us;
+	}
+	release_sock(sk);
+
+	*len = sizeof(qaddr);
+	qaddr.sq_family = AF_QIPCRTR;
+
+	memcpy(saddr, &qaddr, sizeof(qaddr));
+
+	return 0;
+}
+
+static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	struct sockaddr_qrtr *sq;
+	struct sk_buff *skb;
+	struct ifreq ifr;
+	long len = 0;
+	int rc = 0;
+
+	lock_sock(sk);
+
+	switch (cmd) {
+	case TIOCOUTQ:
+		len = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (len < 0)
+			len = 0;
+		rc = put_user(len, (int __user *)argp);
+		break;
+	case TIOCINQ:
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			len = skb->len - QRTR_HDR_SIZE;
+		rc = put_user(len, (int __user *)argp);
+		break;
+	case SIOCGIFADDR:
+		if (copy_from_user(&ifr, argp, sizeof(ifr))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		sq = (struct sockaddr_qrtr *)&ifr.ifr_addr;
+		*sq = ipc->us;
+		if (copy_to_user(argp, &ifr, sizeof(ifr))) {
+			rc = -EFAULT;
+			break;
+		}
+		break;
+	case SIOCGSTAMP:
+		rc = sock_get_timestamp(sk, argp);
+		break;
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+
+	release_sock(sk);
+
+	return rc;
+}
+
+static int qrtr_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct qrtr_sock *ipc;
+
+	if (!sk)
+		return 0;
+
+	lock_sock(sk);
+
+	ipc = qrtr_sk(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+
+	sock_set_flag(sk, SOCK_DEAD);
+	sock->sk = NULL;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		qrtr_port_remove(ipc);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static const struct proto_ops qrtr_proto_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_QIPCRTR,
+	.bind		= qrtr_bind,
+	.connect	= qrtr_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.listen		= sock_no_listen,
+	.sendmsg	= qrtr_sendmsg,
+	.recvmsg	= qrtr_recvmsg,
+	.getname	= qrtr_getname,
+	.ioctl		= qrtr_ioctl,
+	.poll		= datagram_poll,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.release	= qrtr_release,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+static struct proto qrtr_proto = {
+	.name		= "QIPCRTR",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct qrtr_sock),
+};
+
+static int qrtr_create(struct net *net, struct socket *sock,
+		       int protocol, int kern)
+{
+	struct qrtr_sock *ipc;
+	struct sock *sk;
+
+	if (sock->type != SOCK_DGRAM)
+		return -EPROTOTYPE;
+
+	sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	sock_init_data(sock, sk);
+	sock->ops = &qrtr_proto_ops;
+
+	ipc = qrtr_sk(sk);
+	ipc->us.sq_family = AF_QIPCRTR;
+	ipc->us.sq_node = qrtr_local_nid;
+	ipc->us.sq_port = 0;
+
+	return 0;
+}
+
+static const struct nla_policy qrtr_policy[IFA_MAX + 1] = {
+	[IFA_LOCAL] = { .type = NLA_U32 },
+};
+
+static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[IFA_MAX + 1];
+	struct ifaddrmsg *ifm;
+	int rc;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy);
+	if (rc < 0)
+		return rc;
+
+	ifm = nlmsg_data(nlh);
+	if (!tb[IFA_LOCAL])
+		return -EINVAL;
+
+	qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]);
+	return 0;
+}
+
+static const struct net_proto_family qrtr_family = {
+	.owner	= THIS_MODULE,
+	.family	= AF_QIPCRTR,
+	.create	= qrtr_create,
+};
+
+static int __init qrtr_proto_init(void)
+{
+	int rc;
+
+	rc = proto_register(&qrtr_proto, 1);
+	if (rc)
+		return rc;
+
+	rc = sock_register(&qrtr_family);
+	if (rc) {
+		proto_unregister(&qrtr_proto);
+		return rc;
+	}
+
+	rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL);
+
+	return 0;
+}
+module_init(qrtr_proto_init);
+
+static void __exit qrtr_proto_fini(void)
+{
+	rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR);
+	sock_unregister(qrtr_family.family);
+	proto_unregister(&qrtr_proto);
+}
+module_exit(qrtr_proto_fini);
+
+MODULE_DESCRIPTION("Qualcomm IPC-router driver");
+MODULE_LICENSE("GPL v2");
diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h
new file mode 100644
index 000000000000..2b848718f8fe
--- /dev/null
+++ b/net/qrtr/qrtr.h
@@ -0,0 +1,31 @@
+#ifndef __QRTR_H_
+#define __QRTR_H_
+
+#include <linux/types.h>
+
+struct sk_buff;
+
+/* endpoint node id auto assignment */
+#define QRTR_EP_NID_AUTO (-1)
+
+/**
+ * struct qrtr_endpoint - endpoint handle
+ * @xmit: Callback for outgoing packets
+ *
+ * The socket buffer passed to the xmit function becomes owned by the endpoint
+ * driver.  As such, when the driver is done with the buffer, it should
+ * call kfree_skb() on failure, or consume_skb() on success.
+ */
+struct qrtr_endpoint {
+	int (*xmit)(struct qrtr_endpoint *ep, struct sk_buff *skb);
+	/* private: not for endpoint use */
+	struct qrtr_node *node;
+};
+
+int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid);
+
+void qrtr_endpoint_unregister(struct qrtr_endpoint *ep);
+
+int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len);
+
+#endif
diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c
new file mode 100644
index 000000000000..84ebce73aa23
--- /dev/null
+++ b/net/qrtr/smd.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/soc/qcom/smd.h>
+
+#include "qrtr.h"
+
+struct qrtr_smd_dev {
+	struct qrtr_endpoint ep;
+	struct qcom_smd_channel *channel;
+};
+
+/* from smd to qrtr */
+static int qcom_smd_qrtr_callback(struct qcom_smd_device *sdev,
+				  const void *data, size_t len)
+{
+	struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev);
+	int rc;
+
+	if (!qdev)
+		return -EAGAIN;
+
+	rc = qrtr_endpoint_post(&qdev->ep, data, len);
+	if (rc == -EINVAL) {
+		dev_err(&sdev->dev, "invalid ipcrouter packet\n");
+		/* return 0 to let smd drop the packet */
+		rc = 0;
+	}
+
+	return rc;
+}
+
+/* from qrtr to smd */
+static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+	struct qrtr_smd_dev *qdev = container_of(ep, struct qrtr_smd_dev, ep);
+	int rc;
+
+	rc = skb_linearize(skb);
+	if (rc)
+		goto out;
+
+	rc = qcom_smd_send(qdev->channel, skb->data, skb->len);
+
+out:
+	if (rc)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
+	return rc;
+}
+
+static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev)
+{
+	struct qrtr_smd_dev *qdev;
+	int rc;
+
+	qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL);
+	if (!qdev)
+		return -ENOMEM;
+
+	qdev->channel = sdev->channel;
+	qdev->ep.xmit = qcom_smd_qrtr_send;
+
+	rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO);
+	if (rc)
+		return rc;
+
+	dev_set_drvdata(&sdev->dev, qdev);
+
+	dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n");
+
+	return 0;
+}
+
+static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev)
+{
+	struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev);
+
+	qrtr_endpoint_unregister(&qdev->ep);
+
+	dev_set_drvdata(&sdev->dev, NULL);
+}
+
+static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = {
+	{ "IPCRTR" },
+	{}
+};
+
+static struct qcom_smd_driver qcom_smd_qrtr_driver = {
+	.probe = qcom_smd_qrtr_probe,
+	.remove = qcom_smd_qrtr_remove,
+	.callback = qcom_smd_qrtr_callback,
+	.smd_match_table = qcom_smd_qrtr_smd_match,
+	.driver = {
+		.name = "qcom_smd_qrtr",
+		.owner = THIS_MODULE,
+	},
+};
+
+module_qcom_smd_driver(qcom_smd_qrtr_driver);
+
+MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 0e10d549f6eebd0a26bf075309b6fb947f4c1cb2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 15 Apr 2016 17:44:18 +0300
Subject: mfd: wm8400-core: Delete wm8400_reg_read()

There was a static checker warning in wm8400_reg_read() because we were
returning u16 and that can't hold the negative error codes.  The
function isn't used, so let's just delete it.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/wm8400-core.c          | 21 ---------------------
 include/linux/mfd/wm8400-private.h |  1 -
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c
index 9fd823049f90..81667762c3ad 100644
--- a/drivers/mfd/wm8400-core.c
+++ b/drivers/mfd/wm8400-core.c
@@ -35,27 +35,6 @@ static bool wm8400_volatile(struct device *dev, unsigned int reg)
 	}
 }
 
-/**
- * wm8400_reg_read - Single register read
- *
- * @wm8400: Pointer to wm8400 control structure
- * @reg:    Register to read
- *
- * @return  Read value
- */
-u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg)
-{
-	unsigned int val;
-	int ret;
-
-	ret = regmap_read(wm8400->regmap, reg, &val);
-	if (ret < 0)
-		return ret;
-
-	return val;
-}
-EXPORT_SYMBOL_GPL(wm8400_reg_read);
-
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data)
 {
 	return regmap_bulk_read(wm8400->regmap, reg, data, count);
diff --git a/include/linux/mfd/wm8400-private.h b/include/linux/mfd/wm8400-private.h
index 2de565b94d0c..4ee908f5b834 100644
--- a/include/linux/mfd/wm8400-private.h
+++ b/include/linux/mfd/wm8400-private.h
@@ -923,7 +923,6 @@ struct wm8400 {
 #define WM8400_LINE_CMP_VTHD_SHIFT                   0  /* LINE_CMP_VTHD - [3:0] */
 #define WM8400_LINE_CMP_VTHD_WIDTH                   4  /* LINE_CMP_VTHD - [3:0] */
 
-u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg);
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data);
 
 static inline int wm8400_set_bits(struct wm8400 *wm8400, u8 reg,
-- 
cgit v1.2.3


From bb208f144cf3f59d8f89a09a80efd04389718907 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 21 Mar 2016 20:18:21 +0100
Subject: can: fix handling of unmodifiable configuration options

As described in 'can: m_can: tag current CAN FD controllers as non-ISO'
(6cfda7fbebe) it is possible to define fixed configuration options by
setting the according bit in 'ctrlmode' and clear it in 'ctrlmode_supported'.
This leads to the incovenience that the fixed configuration bits can not be
passed by netlink even when they have the correct values (e.g. non-ISO, FD).

This patch fixes that issue and not only allows fixed set bit values to be set
again but now requires(!) to provide these fixed values at configuration time.
A valid CAN FD configuration consists of a nominal/arbitration bittiming, a
data bittiming and a control mode with CAN_CTRLMODE_FD set - which is now
enforced by a new can_validate() function. This fix additionally removed the
inconsistency that was prohibiting the support of 'CANFD-only' controller
drivers, like the RCar CAN FD.

For this reason a new helper can_set_static_ctrlmode() has been introduced to
provide a proper interface to handle static enabled CAN controller options.

Reported-by: Ramesh Shanmugasundaram <ramesh.shanmugasundaram@bp.renesas.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Reviewed-by: Ramesh Shanmugasundaram  <ramesh.shanmugasundaram@bp.renesas.com>
Cc: <stable@vger.kernel.org> # >= 3.18
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c         | 56 +++++++++++++++++++++++++++++++++++++++----
 drivers/net/can/m_can/m_can.c |  2 +-
 include/linux/can/dev.h       | 22 +++++++++++++++--
 3 files changed, 73 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 141c2a42d7ed..910c12e2638e 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -696,11 +696,17 @@ int can_change_mtu(struct net_device *dev, int new_mtu)
 	/* allow change of MTU according to the CANFD ability of the device */
 	switch (new_mtu) {
 	case CAN_MTU:
+		/* 'CANFD-only' controllers can not switch to CAN_MTU */
+		if (priv->ctrlmode_static & CAN_CTRLMODE_FD)
+			return -EINVAL;
+
 		priv->ctrlmode &= ~CAN_CTRLMODE_FD;
 		break;
 
 	case CANFD_MTU:
-		if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD))
+		/* check for potential CANFD ability */
+		if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD) &&
+		    !(priv->ctrlmode_static & CAN_CTRLMODE_FD))
 			return -EINVAL;
 
 		priv->ctrlmode |= CAN_CTRLMODE_FD;
@@ -782,6 +788,35 @@ static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
 				= { .len = sizeof(struct can_bittiming_const) },
 };
 
+static int can_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	bool is_can_fd = false;
+
+	/* Make sure that valid CAN FD configurations always consist of
+	 * - nominal/arbitration bittiming
+	 * - data bittiming
+	 * - control mode with CAN_CTRLMODE_FD set
+	 */
+
+	if (data[IFLA_CAN_CTRLMODE]) {
+		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+
+		is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD;
+	}
+
+	if (is_can_fd) {
+		if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	if (data[IFLA_CAN_DATA_BITTIMING]) {
+		if (!is_can_fd || !data[IFLA_CAN_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int can_changelink(struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[])
 {
@@ -813,19 +848,31 @@ static int can_changelink(struct net_device *dev,
 
 	if (data[IFLA_CAN_CTRLMODE]) {
 		struct can_ctrlmode *cm;
+		u32 ctrlstatic;
+		u32 maskedflags;
 
 		/* Do not allow changing controller mode while running */
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
 		cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+		ctrlstatic = priv->ctrlmode_static;
+		maskedflags = cm->flags & cm->mask;
+
+		/* check whether provided bits are allowed to be passed */
+		if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic))
+			return -EOPNOTSUPP;
+
+		/* do not check for static fd-non-iso if 'fd' is disabled */
+		if (!(maskedflags & CAN_CTRLMODE_FD))
+			ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO;
 
-		/* check whether changed bits are allowed to be modified */
-		if (cm->mask & ~priv->ctrlmode_supported)
+		/* make sure static options are provided by configuration */
+		if ((maskedflags & ctrlstatic) != ctrlstatic)
 			return -EOPNOTSUPP;
 
 		/* clear bits to be modified and copy the flag values */
 		priv->ctrlmode &= ~cm->mask;
-		priv->ctrlmode |= (cm->flags & cm->mask);
+		priv->ctrlmode |= maskedflags;
 
 		/* CAN_CTRLMODE_FD can only be set when driver supports FD */
 		if (priv->ctrlmode & CAN_CTRLMODE_FD)
@@ -966,6 +1013,7 @@ static struct rtnl_link_ops can_link_ops __read_mostly = {
 	.maxtype	= IFLA_CAN_MAX,
 	.policy		= can_policy,
 	.setup		= can_setup,
+	.validate	= can_validate,
 	.newlink	= can_newlink,
 	.changelink	= can_changelink,
 	.get_size	= can_get_size,
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 39cf911f7a1e..195f15edb32e 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -955,7 +955,7 @@ static struct net_device *alloc_m_can_dev(void)
 	priv->can.do_get_berr_counter = m_can_get_berr_counter;
 
 	/* CAN_CTRLMODE_FD_NON_ISO is fixed with M_CAN IP v3.0.1 */
-	priv->can.ctrlmode = CAN_CTRLMODE_FD_NON_ISO;
+	can_set_static_ctrlmode(dev, CAN_CTRLMODE_FD_NON_ISO);
 
 	/* CAN_CTRLMODE_FD_NON_ISO can not be changed with M_CAN IP v3.0.1 */
 	priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK |
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 735f9f8c4e43..5261751f6bd4 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -40,8 +40,11 @@ struct can_priv {
 	struct can_clock clock;
 
 	enum can_state state;
-	u32 ctrlmode;
-	u32 ctrlmode_supported;
+
+	/* CAN controller features - see include/uapi/linux/can/netlink.h */
+	u32 ctrlmode;		/* current options setting */
+	u32 ctrlmode_supported;	/* options that can be modified by netlink */
+	u32 ctrlmode_static;	/* static enabled options for driver/hardware */
 
 	int restart_ms;
 	struct timer_list restart_timer;
@@ -108,6 +111,21 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb)
 	return skb->len == CANFD_MTU;
 }
 
+/* helper to define static CAN controller features at device creation time */
+static inline void can_set_static_ctrlmode(struct net_device *dev,
+					   u32 static_mode)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	/* alloc_candev() succeeded => netdev_priv() is valid at this point */
+	priv->ctrlmode = static_mode;
+	priv->ctrlmode_static = static_mode;
+
+	/* override MTU which was set by default in can_setup()? */
+	if (static_mode & CAN_CTRLMODE_FD)
+		dev->mtu = CANFD_MTU;
+}
+
 /* get data length from can_dlc with sanitized can_dlc */
 u8 can_dlc2len(u8 can_dlc);
 
-- 
cgit v1.2.3


From fe238e601d2519f259103ab65caea3b077ed7b39 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Fri, 1 Apr 2016 13:45:09 -0400
Subject: NFS: Save struct inode * inside nfs_commit_info to clarify usage of
 i_lock

Commit ea2cf22 created nfs_commit_info and saved &inode->i_lock inside
this NFS specific structure.  This obscures the usage of i_lock.
Instead, save struct inode * so later it's clear the spinlock taken is
i_lock.

Should be no functional change.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/direct.c                        | 10 +++++-----
 fs/nfs/filelayout/filelayout.c         |  4 ++--
 fs/nfs/flexfilelayout/flexfilelayout.c |  4 ++--
 fs/nfs/pnfs_nfs.c                      | 32 ++++++++++++++++----------------
 fs/nfs/write.c                         | 16 ++++++++--------
 include/linux/nfs_xdr.h                |  2 +-
 6 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f79d98ae4c10..7f03163b5364 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -278,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq)
 {
-	cinfo->lock = &dreq->inode->i_lock;
+	cinfo->inode = dreq->inode;
 	cinfo->mds = &dreq->mds_cinfo;
 	cinfo->ds = &dreq->ds_cinfo;
 	cinfo->dreq = dreq;
@@ -635,13 +635,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
 				  struct list_head *list,
 				  struct nfs_commit_info *cinfo)
 {
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 #ifdef CONFIG_NFS_V4_1
 	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
 		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 #endif
 	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 }
 
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -676,13 +676,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		if (!nfs_pageio_add_request(&desc, req)) {
 			nfs_list_remove_request(req);
 			nfs_list_add_request(req, &failed);
-			spin_lock(cinfo.lock);
+			spin_lock(&cinfo.inode->i_lock);
 			dreq->flags = 0;
 			if (desc.pg_error < 0)
 				dreq->error = desc.pg_error;
 			else
 				dreq->error = -EIO;
-			spin_unlock(cinfo.lock);
+			spin_unlock(&cinfo.inode->i_lock);
 		}
 		nfs_release_request(req);
 	}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3384dc8e6683..3e50057eeadf 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
 	}
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	if (cinfo->ds->nbuckets >= size)
 		goto out;
 	for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 	swap(cinfo->ds->buckets, buckets);
 	cinfo->ds->nbuckets = size;
 out:
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	kfree(buckets);
 	return 0;
 }
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0cb1abd535e3..3b398f7b4637 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -745,7 +745,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 	else {
 		int i;
 
-		spin_lock(cinfo->lock);
+		spin_lock(&cinfo->inode->i_lock);
 		if (cinfo->ds->nbuckets != 0)
 			kfree(buckets);
 		else {
@@ -759,7 +759,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 					NFS_INVALID_STABLE_HOW;
 			}
 		}
-		spin_unlock(cinfo->lock);
+		spin_unlock(&cinfo->inode->i_lock);
 		return 0;
 	}
 }
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 4aaed890048f..d2a7c9f7aa94 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
- * Note this must be called holding the inode (/cinfo) lock
+ * Note this must be called holding i_lock
  */
 void
 pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
 		if (!nfs_lock_request(req))
 			continue;
 		kref_get(&req->wb_kref);
-		if (cond_resched_lock(cinfo->lock))
+		if (cond_resched_lock(&cinfo->inode->i_lock))
 			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 	struct list_head *dst = &bucket->committing;
 	int ret;
 
-	lockdep_assert_held(cinfo->lock);
+	lockdep_assert_held(&cinfo->inode->i_lock);
 	ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
 {
 	int i, rv = 0, cnt;
 
-	lockdep_assert_held(cinfo->lock);
+	lockdep_assert_held(&cinfo->inode->i_lock);
 	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
 		cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
 						       cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
 	struct pnfs_layout_segment *freeme;
 	int i;
 
-	lockdep_assert_held(cinfo->lock);
+	lockdep_assert_held(&cinfo->inode->i_lock);
 restart:
 	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
 		if (pnfs_generic_transfer_commit_list(&b->written, dst,
 						      cinfo, 0)) {
 			freeme = b->wlseg;
 			b->wlseg = NULL;
-			spin_unlock(cinfo->lock);
+			spin_unlock(&cinfo->inode->i_lock);
 			pnfs_put_lseg(freeme);
-			spin_lock(cinfo->lock);
+			spin_lock(&cinfo->inode->i_lock);
 			goto restart;
 		}
 	}
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
 	LIST_HEAD(pages);
 	int i;
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	for (i = idx; i < fl_cinfo->nbuckets; i++) {
 		bucket = &fl_cinfo->buckets[i];
 		if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
 		freeme = bucket->clseg;
 		bucket->clseg = NULL;
 		list_splice_init(&bucket->committing, &pages);
-		spin_unlock(cinfo->lock);
+		spin_unlock(&cinfo->inode->i_lock);
 		nfs_retry_commit(&pages, freeme, cinfo, i);
 		pnfs_put_lseg(freeme);
-		spin_lock(cinfo->lock);
+		spin_lock(&cinfo->inode->i_lock);
 	}
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 }
 
 static unsigned int
@@ -238,11 +238,11 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
 	struct pnfs_commit_bucket *bucket;
 
 	bucket = &cinfo->ds->buckets[data->ds_commit_index];
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	list_splice_init(&bucket->committing, pages);
 	data->lseg = bucket->clseg;
 	bucket->clseg = NULL;
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 
 }
 
@@ -874,12 +874,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	struct list_head *list;
 	struct pnfs_commit_bucket *buckets;
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	buckets = cinfo->ds->buckets;
 	list = &buckets[ds_commit_idx].written;
 	if (list_empty(list)) {
 		if (!pnfs_is_valid_lseg(lseg)) {
-			spin_unlock(cinfo->lock);
+			spin_unlock(&cinfo->inode->i_lock);
 			cinfo->completion_ops->resched_write(cinfo, req);
 			return;
 		}
@@ -896,7 +896,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	cinfo->ds->nwritten++;
 
 	nfs_request_add_commit_list_locked(req, list, cinfo);
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f4fd53e5764..9283a96b9fb8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
  */
 void
 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,9 +832,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 void
 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -864,7 +864,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode)
 {
-	cinfo->lock = &inode->i_lock;
+	cinfo->inode = inode;
 	cinfo->mds = &NFS_I(inode)->commit_info;
 	cinfo->ds = pnfs_get_ds_info(inode);
 	cinfo->dreq = NULL;
@@ -967,7 +967,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 	return cinfo->mds->ncommit;
 }
 
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
 int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		     struct nfs_commit_info *cinfo, int max)
@@ -979,7 +979,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		if (!nfs_lock_request(req))
 			continue;
 		kref_get(&req->wb_kref);
-		if (cond_resched_lock(cinfo->lock))
+		if (cond_resched_lock(&cinfo->inode->i_lock))
 			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
@@ -1005,7 +1005,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
 	int ret = 0;
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	if (cinfo->mds->ncommit > 0) {
 		const int max = INT_MAX;
 
@@ -1013,7 +1013,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 					   cinfo, max);
 		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
 	}
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	return ret;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d320906cf13e..cb9982d8f38f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1431,7 +1431,7 @@ struct nfs_commit_completion_ops {
 };
 
 struct nfs_commit_info {
-	spinlock_t			*lock;	/* inode->i_lock */
+	struct inode 			*inode;	/* Needed for inode->i_lock */
 	struct nfs_mds_commit_info	*mds;
 	struct pnfs_ds_commit_info	*ds;
 	struct nfs_direct_req		*dreq;	/* O_DIRECT request */
-- 
cgit v1.2.3


From 3c6e0bc8a14cfc8e1d4ab87f46f77b070c815bf1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 21 Apr 2016 20:51:54 -0400
Subject: sunrpc: plumb gfp_t parm into crcreate operation

We need to be able to call the generic_cred creator from different
contexts. Add a gfp_t parm to the crcreate operation and to
rpcauth_lookup_credcache. For now, we just push the gfp_t parms up
one level to the *_lookup_cred functions.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h    | 4 ++--
 net/sunrpc/auth.c              | 4 ++--
 net/sunrpc/auth_generic.c      | 6 +++---
 net/sunrpc/auth_gss/auth_gss.c | 6 +++---
 net/sunrpc/auth_unix.c         | 6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 6a241a277249..3b616aa7e4d2 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -127,7 +127,7 @@ struct rpc_authops {
 	void			(*destroy)(struct rpc_auth *);
 
 	struct rpc_cred *	(*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
-	struct rpc_cred *	(*crcreate)(struct rpc_auth*, struct auth_cred *, int);
+	struct rpc_cred *	(*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
 	int			(*list_pseudoflavors)(rpc_authflavor_t *, int);
 	rpc_authflavor_t	(*info2flavor)(struct rpcsec_gss_info *);
 	int			(*flavor2info)(rpc_authflavor_t,
@@ -178,7 +178,7 @@ rpc_authflavor_t	rpcauth_get_pseudoflavor(rpc_authflavor_t,
 int			rpcauth_get_gssinfo(rpc_authflavor_t,
 				struct rpcsec_gss_info *);
 int			rpcauth_list_flavors(rpc_authflavor_t *, int);
-struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
+struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t);
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 struct rpc_cred *	rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..e0bb30fd2ed3 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
  */
 struct rpc_cred *
 rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
-		int flags)
+		int flags, gfp_t gfp)
 {
 	LIST_HEAD(free);
 	struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	if (flags & RPCAUTH_LOOKUP_RCU)
 		return ERR_PTR(-ECHILD);
 
-	new = auth->au_ops->crcreate(auth, acred, flags);
+	new = auth->au_ops->crcreate(auth, acred, flags, gfp);
 	if (IS_ERR(new)) {
 		cred = new;
 		goto out;
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..6ed3e3df43e9 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -77,15 +77,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
 static struct rpc_cred *
 generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct generic_cred *gcred;
 
-	gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+	gcred = kmalloc(sizeof(*gcred), gfp);
 	if (gcred == NULL)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8d57..e64ae93d5b4f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
 	struct gss_cred	*cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 		__func__, from_kuid(&init_user_ns, acred->uid),
 		auth->au_flavor);
 
-	if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+	if (!(cred = kzalloc(sizeof(*cred), gfp)))
 		goto out_err;
 
 	rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c22f..9f65452b7cbc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct unx_cred	*cred;
 	unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 			from_kuid(&init_user_ns, acred->uid),
 			from_kgid(&init_user_ns, acred->gid));
 
-	if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+	if (!(cred = kmalloc(sizeof(*cred), gfp)))
 		return ERR_PTR(-ENOMEM);
 
 	rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
-- 
cgit v1.2.3


From c065d229e308ede426a3608cf480c61983b36fb8 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Thu, 21 Apr 2016 20:51:55 -0400
Subject: sunrpc: add rpc_lookup_generic_cred

Add function rpc_lookup_generic_cred, which allows lookups of a generic
credential that's not current_cred().

[jlayton: add gfp_t parm]

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h | 1 +
 net/sunrpc/auth_generic.c   | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 3b616aa7e4d2..16bd8f8fef8c 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -167,6 +167,7 @@ void 			rpc_destroy_authunix(void);
 
 struct rpc_cred *	rpc_lookup_cred(void);
 struct rpc_cred *	rpc_lookup_cred_nonblock(void);
+struct rpc_cred *	rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t);
 struct rpc_cred *	rpc_lookup_machine_cred(const char *service_name);
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 6ed3e3df43e9..54dd3fdead54 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
 struct rpc_cred *rpc_lookup_cred_nonblock(void)
 {
 	return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
-- 
cgit v1.2.3


From 62dbef2ae41393eba2f6853ca174130f2d09c7d3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 21 Apr 2016 20:51:56 -0400
Subject: sunrpc: add a get_rpccred_rcu inline

Sometimes we might have a RCU managed credential pointer and don't want
to use locking to handle it. Add a function that will take a reference
to the cred iff the refcount is not already zero. Callers can dereference
the pointer under the rcu_read_lock and use that function to take a
reference only if the cred is not on its way to destruction.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 16bd8f8fef8c..6f36b2bf3e05 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -206,5 +206,23 @@ struct rpc_cred *	get_rpccred(struct rpc_cred *cred)
 	return cred;
 }
 
+/**
+ * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer
+ * @cred: cred of which to take a reference
+ *
+ * In some cases, we may have a pointer to a credential to which we
+ * want to take a reference, but don't already have one. Because these
+ * objects are freed using RCU, we can access the cr_count while its
+ * on its way to destruction and only take a reference if it's not already
+ * zero.
+ */
+static inline struct rpc_cred *
+get_rpccred_rcu(struct rpc_cred *cred)
+{
+	if (atomic_inc_not_zero(&cred->cr_count))
+		return cred;
+	return NULL;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_AUTH_H */
-- 
cgit v1.2.3


From 3c3e8943ac6f36ca5d18ca61b30634fb560b4ebb Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 7 Apr 2016 18:42:04 +0100
Subject: iommu: remove unused priv field from struct iommu_ops

The priv field from iommu_ops is a hangover from the of_dma_configure
series and isn't actually used. Remove it before it has chance to
spread.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ef7a6ecd8584..8a2570443b80 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -156,7 +156,6 @@ struct iommu_dm_region {
  * @domain_get_windows: Return the number of windows for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @pgsize_bitmap: bitmap of supported page sizes
- * @priv: per-instance data private to the iommu driver
  */
 struct iommu_ops {
 	bool (*capable)(enum iommu_cap);
@@ -198,7 +197,6 @@ struct iommu_ops {
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 
 	unsigned long pgsize_bitmap;
-	void *priv;
 };
 
 #define IOMMU_GROUP_NOTIFY_ADD_DEVICE		1 /* Device added */
-- 
cgit v1.2.3


From 53c92d793395fdab9edbd2f79b084bb6b2e6ae79 Mon Sep 17 00:00:00 2001
From: Robin Murphy <Robin.Murphy@arm.com>
Date: Thu, 7 Apr 2016 18:42:05 +0100
Subject: iommu: of: enforce const-ness of struct iommu_ops

As a set of driver-provided callbacks and static data, there is no
compelling reason for struct iommu_ops to be mutable in core code, so
enforce const-ness throughout.

Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm/include/asm/dma-mapping.h   |  2 +-
 arch/arm/mm/dma-mapping.c            |  6 +++---
 arch/arm64/include/asm/dma-mapping.h |  2 +-
 arch/arm64/mm/dma-mapping.c          |  4 ++--
 drivers/iommu/of_iommu.c             | 14 +++++++-------
 drivers/of/device.c                  |  2 +-
 include/linux/dma-mapping.h          |  2 +-
 include/linux/of_iommu.h             |  8 ++++----
 8 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 6ad1ceda62a5..02283eb2f5b2 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -118,7 +118,7 @@ static inline unsigned long dma_max_pfn(struct device *dev)
 
 #define arch_setup_dma_ops arch_setup_dma_ops
 extern void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			       struct iommu_ops *iommu, bool coherent);
+			       const struct iommu_ops *iommu, bool coherent);
 
 #define arch_teardown_dma_ops arch_teardown_dma_ops
 extern void arch_teardown_dma_ops(struct device *dev);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c941e93048ad..5c2ca062c3fa 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -2215,7 +2215,7 @@ static struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent)
 }
 
 static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    struct iommu_ops *iommu)
+				    const struct iommu_ops *iommu)
 {
 	struct dma_iommu_mapping *mapping;
 
@@ -2253,7 +2253,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev)
 #else
 
 static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    struct iommu_ops *iommu)
+				    const struct iommu_ops *iommu)
 {
 	return false;
 }
@@ -2270,7 +2270,7 @@ static struct dma_map_ops *arm_get_dma_map_ops(bool coherent)
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			struct iommu_ops *iommu, bool coherent)
+			const struct iommu_ops *iommu, bool coherent)
 {
 	struct dma_map_ops *dma_ops;
 
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index ba437f090a74..7dbea6c070ec 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -48,7 +48,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			struct iommu_ops *iommu, bool coherent);
+			const struct iommu_ops *iommu, bool coherent);
 #define arch_setup_dma_ops	arch_setup_dma_ops
 
 #ifdef CONFIG_IOMMU_DMA
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a6e757cbab77..5d36907f9b12 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -979,13 +979,13 @@ void arch_teardown_dma_ops(struct device *dev)
 #else
 
 static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				  struct iommu_ops *iommu)
+				  const struct iommu_ops *iommu)
 { }
 
 #endif  /* CONFIG_IOMMU_DMA */
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			struct iommu_ops *iommu, bool coherent)
+			const struct iommu_ops *iommu, bool coherent)
 {
 	if (!dev->archdata.dma_ops)
 		dev->archdata.dma_ops = &swiotlb_dma_ops;
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 5fea665af99d..af499aea0a1a 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -98,12 +98,12 @@ EXPORT_SYMBOL_GPL(of_get_dma_window);
 struct of_iommu_node {
 	struct list_head list;
 	struct device_node *np;
-	struct iommu_ops *ops;
+	const struct iommu_ops *ops;
 };
 static LIST_HEAD(of_iommu_list);
 static DEFINE_SPINLOCK(of_iommu_lock);
 
-void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops)
+void of_iommu_set_ops(struct device_node *np, const struct iommu_ops *ops)
 {
 	struct of_iommu_node *iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 
@@ -119,10 +119,10 @@ void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops)
 	spin_unlock(&of_iommu_lock);
 }
 
-struct iommu_ops *of_iommu_get_ops(struct device_node *np)
+const struct iommu_ops *of_iommu_get_ops(struct device_node *np)
 {
 	struct of_iommu_node *node;
-	struct iommu_ops *ops = NULL;
+	const struct iommu_ops *ops = NULL;
 
 	spin_lock(&of_iommu_lock);
 	list_for_each_entry(node, &of_iommu_list, list)
@@ -134,12 +134,12 @@ struct iommu_ops *of_iommu_get_ops(struct device_node *np)
 	return ops;
 }
 
-struct iommu_ops *of_iommu_configure(struct device *dev,
-				     struct device_node *master_np)
+const struct iommu_ops *of_iommu_configure(struct device *dev,
+					   struct device_node *master_np)
 {
 	struct of_phandle_args iommu_spec;
 	struct device_node *np;
-	struct iommu_ops *ops = NULL;
+	const struct iommu_ops *ops = NULL;
 	int idx = 0;
 
 	/*
diff --git a/drivers/of/device.c b/drivers/of/device.c
index e5f47cec75f3..fd5cfad7c403 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -88,7 +88,7 @@ void of_dma_configure(struct device *dev, struct device_node *np)
 	int ret;
 	bool coherent;
 	unsigned long offset;
-	struct iommu_ops *iommu;
+	const struct iommu_ops *iommu;
 
 	/*
 	 * Set default coherent_dma_mask to 32 bit.  Drivers are expected to
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 9ea9aba28049..71c1b215ef66 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -514,7 +514,7 @@ extern u64 dma_get_required_mask(struct device *dev);
 
 #ifndef arch_setup_dma_ops
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-				      u64 size, struct iommu_ops *iommu,
+				      u64 size, const struct iommu_ops *iommu,
 				      bool coherent) { }
 #endif
 
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index ffbe4707d4aa..bd02b44902d0 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -12,7 +12,7 @@ extern int of_get_dma_window(struct device_node *dn, const char *prefix,
 			     size_t *size);
 
 extern void of_iommu_init(void);
-extern struct iommu_ops *of_iommu_configure(struct device *dev,
+extern const struct iommu_ops *of_iommu_configure(struct device *dev,
 					struct device_node *master_np);
 
 #else
@@ -25,7 +25,7 @@ static inline int of_get_dma_window(struct device_node *dn, const char *prefix,
 }
 
 static inline void of_iommu_init(void) { }
-static inline struct iommu_ops *of_iommu_configure(struct device *dev,
+static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
 					 struct device_node *master_np)
 {
 	return NULL;
@@ -33,8 +33,8 @@ static inline struct iommu_ops *of_iommu_configure(struct device *dev,
 
 #endif	/* CONFIG_OF_IOMMU */
 
-void of_iommu_set_ops(struct device_node *np, struct iommu_ops *ops);
-struct iommu_ops *of_iommu_get_ops(struct device_node *np);
+void of_iommu_set_ops(struct device_node *np, const struct iommu_ops *ops);
+const struct iommu_ops *of_iommu_get_ops(struct device_node *np);
 
 extern struct of_device_id __iommu_of_table;
 
-- 
cgit v1.2.3


From d16e0faab911cc0e100a1e8e93635b432566608e Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 7 Apr 2016 18:42:06 +0100
Subject: iommu: Allow selecting page sizes per domain

Many IOMMUs support multiple page table formats, meaning that any given
domain may only support a subset of the hardware page sizes presented in
iommu_ops->pgsize_bitmap. There are also certain use-cases where the
creator of a domain may want to control which page sizes are used, for
example to force the use of hugepage mappings to reduce pagetable walk
depth.

To this end, add a per-domain pgsize_bitmap to represent the subset of
page sizes actually in use, to make it possible for domains with
different requirements to coexist.

Signed-off-by: Will Deacon <will.deacon@arm.com>
[rm: hijacked and rebased original patch with new commit message]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/dma-iommu.c       |  2 +-
 drivers/iommu/iommu.c           | 22 ++++++++++++----------
 drivers/iommu/mtk_iommu.c       |  2 +-
 drivers/vfio/vfio_iommu_type1.c |  2 +-
 include/linux/iommu.h           |  3 ++-
 5 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 886cb3a78326..99432999b52f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -94,7 +94,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size
 		return -ENODEV;
 
 	/* Use the smallest supported page size for IOVA granularity */
-	order = __ffs(domain->ops->pgsize_bitmap);
+	order = __ffs(domain->pgsize_bitmap);
 	base_pfn = max_t(unsigned long, 1, base >> order);
 	end_pfn = (base + size - 1) >> order;
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b9df1411c894..ab4d014e3687 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -337,9 +337,9 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
 	if (!domain || domain->type != IOMMU_DOMAIN_DMA)
 		return 0;
 
-	BUG_ON(!domain->ops->pgsize_bitmap);
+	BUG_ON(!domain->pgsize_bitmap);
 
-	pg_size = 1UL << __ffs(domain->ops->pgsize_bitmap);
+	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
 	INIT_LIST_HEAD(&mappings);
 
 	iommu_get_dm_regions(dev, &mappings);
@@ -1073,6 +1073,8 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
 
 	domain->ops  = bus->iommu_ops;
 	domain->type = type;
+	/* Assume all sizes by default; the driver may override this later */
+	domain->pgsize_bitmap  = bus->iommu_ops->pgsize_bitmap;
 
 	return domain;
 }
@@ -1297,7 +1299,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain,
 	pgsize = (1UL << (pgsize_idx + 1)) - 1;
 
 	/* throw away page sizes not supported by the hardware */
-	pgsize &= domain->ops->pgsize_bitmap;
+	pgsize &= domain->pgsize_bitmap;
 
 	/* make sure we're still sane */
 	BUG_ON(!pgsize);
@@ -1319,14 +1321,14 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 	int ret = 0;
 
 	if (unlikely(domain->ops->map == NULL ||
-		     domain->ops->pgsize_bitmap == 0UL))
+		     domain->pgsize_bitmap == 0UL))
 		return -ENODEV;
 
 	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
 		return -EINVAL;
 
 	/* find out the minimum page size supported */
-	min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
 
 	/*
 	 * both the virtual address and the physical one, as well as
@@ -1373,14 +1375,14 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 	unsigned long orig_iova = iova;
 
 	if (unlikely(domain->ops->unmap == NULL ||
-		     domain->ops->pgsize_bitmap == 0UL))
+		     domain->pgsize_bitmap == 0UL))
 		return -ENODEV;
 
 	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
 		return -EINVAL;
 
 	/* find out the minimum page size supported */
-	min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
 
 	/*
 	 * The virtual address, as well as the size of the mapping, must be
@@ -1426,10 +1428,10 @@ size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
 	unsigned int i, min_pagesz;
 	int ret;
 
-	if (unlikely(domain->ops->pgsize_bitmap == 0UL))
+	if (unlikely(domain->pgsize_bitmap == 0UL))
 		return 0;
 
-	min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
+	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
 
 	for_each_sg(sg, s, nents, i) {
 		phys_addr_t phys = page_to_phys(sg_page(s)) + s->offset;
@@ -1510,7 +1512,7 @@ int iommu_domain_get_attr(struct iommu_domain *domain,
 		break;
 	case DOMAIN_ATTR_PAGING:
 		paging  = data;
-		*paging = (domain->ops->pgsize_bitmap != 0UL);
+		*paging = (domain->pgsize_bitmap != 0UL);
 		break;
 	case DOMAIN_ATTR_WINDOWS:
 		count = data;
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 929a66a81b2b..e6b25276cfec 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -264,7 +264,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_data *data)
 	}
 
 	/* Update our support page sizes bitmap */
-	mtk_iommu_ops.pgsize_bitmap = dom->cfg.pgsize_bitmap;
+	dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap;
 
 	writel(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0],
 	       data->base + REG_MMU_PT_BASE_ADDR);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e93cedb..15a65823aad9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -407,7 +407,7 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next)
-		bitmap &= domain->domain->ops->pgsize_bitmap;
+		bitmap &= domain->domain->pgsize_bitmap;
 	mutex_unlock(&iommu->lock);
 
 	/*
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 8a2570443b80..7811294bc0f7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -78,6 +78,7 @@ struct iommu_domain_geometry {
 struct iommu_domain {
 	unsigned type;
 	const struct iommu_ops *ops;
+	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	iommu_fault_handler_t handler;
 	void *handler_token;
 	struct iommu_domain_geometry geometry;
@@ -155,7 +156,7 @@ struct iommu_dm_region {
  * @domain_set_windows: Set the number of windows for a domain
  * @domain_get_windows: Return the number of windows for a domain
  * @of_xlate: add OF master IDs to iommu grouping
- * @pgsize_bitmap: bitmap of supported page sizes
+ * @pgsize_bitmap: bitmap of all possible supported page sizes
  */
 struct iommu_ops {
 	bool (*capable)(enum iommu_cap);
-- 
cgit v1.2.3


From 3b6b7e19e31a816ee02a8d4372cbea9ad7db3784 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 13 Apr 2016 17:29:10 +0100
Subject: iommu/dma: Finish optimising higher-order allocations

Now that we know exactly which page sizes our caller wants to use in the
given domain, we can restrict higher-order allocation attempts to just
those sizes, if any, and avoid wasting any time or effort on other sizes
which offer no benefit. In the same vein, this also lets us accommodate
a minimum order greater than 0 for special cases.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Tested-by: Yong Wu <yong.wu@mediatek.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c |  4 +--
 drivers/iommu/dma-iommu.c   | 60 +++++++++++++++++++++++++++++----------------
 include/linux/dma-iommu.h   |  4 +--
 3 files changed, 43 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 5d36907f9b12..41d19a0fc9c0 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -562,8 +562,8 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 		struct page **pages;
 		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
 
-		pages = iommu_dma_alloc(dev, iosize, gfp, ioprot, handle,
-					flush_page);
+		pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
+					handle, flush_page);
 		if (!pages)
 			return NULL;
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 99432999b52f..ea5a9ebf0f78 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -190,11 +190,15 @@ static void __iommu_dma_free_pages(struct page **pages, int count)
 	kvfree(pages);
 }
 
-static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
+static struct page **__iommu_dma_alloc_pages(unsigned int count,
+		unsigned long order_mask, gfp_t gfp)
 {
 	struct page **pages;
 	unsigned int i = 0, array_size = count * sizeof(*pages);
-	unsigned int order = MAX_ORDER;
+
+	order_mask &= (2U << MAX_ORDER) - 1;
+	if (!order_mask)
+		return NULL;
 
 	if (array_size <= PAGE_SIZE)
 		pages = kzalloc(array_size, GFP_KERNEL);
@@ -208,36 +212,38 @@ static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
 
 	while (count) {
 		struct page *page = NULL;
-		int j;
+		unsigned int order_size;
 
 		/*
 		 * Higher-order allocations are a convenience rather
 		 * than a necessity, hence using __GFP_NORETRY until
-		 * falling back to single-page allocations.
+		 * falling back to minimum-order allocations.
 		 */
-		for (order = min_t(unsigned int, order, __fls(count));
-		     order > 0; order--) {
-			page = alloc_pages(gfp | __GFP_NORETRY, order);
+		for (order_mask &= (2U << __fls(count)) - 1;
+		     order_mask; order_mask &= ~order_size) {
+			unsigned int order = __fls(order_mask);
+
+			order_size = 1U << order;
+			page = alloc_pages((order_mask - order_size) ?
+					   gfp | __GFP_NORETRY : gfp, order);
 			if (!page)
 				continue;
-			if (PageCompound(page)) {
-				if (!split_huge_page(page))
-					break;
-				__free_pages(page, order);
-			} else {
+			if (!order)
+				break;
+			if (!PageCompound(page)) {
 				split_page(page, order);
 				break;
+			} else if (!split_huge_page(page)) {
+				break;
 			}
+			__free_pages(page, order);
 		}
-		if (!page)
-			page = alloc_page(gfp);
 		if (!page) {
 			__iommu_dma_free_pages(pages, i);
 			return NULL;
 		}
-		j = 1 << order;
-		count -= j;
-		while (j--)
+		count -= order_size;
+		while (order_size--)
 			pages[i++] = page++;
 	}
 	return pages;
@@ -267,6 +273,7 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
  *	 attached to an iommu_dma_domain
  * @size: Size of buffer in bytes
  * @gfp: Allocation flags
+ * @attrs: DMA attributes for this allocation
  * @prot: IOMMU mapping flags
  * @handle: Out argument for allocated DMA handle
  * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
@@ -278,8 +285,8 @@ void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
  * Return: Array of struct page pointers describing the buffer,
  *	   or NULL on failure.
  */
-struct page **iommu_dma_alloc(struct device *dev, size_t size,
-		gfp_t gfp, int prot, dma_addr_t *handle,
+struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
+		struct dma_attrs *attrs, int prot, dma_addr_t *handle,
 		void (*flush_page)(struct device *, const void *, phys_addr_t))
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
@@ -288,11 +295,22 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size,
 	struct page **pages;
 	struct sg_table sgt;
 	dma_addr_t dma_addr;
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
 
 	*handle = DMA_ERROR_CODE;
 
-	pages = __iommu_dma_alloc_pages(count, gfp);
+	min_size = alloc_sizes & -alloc_sizes;
+	if (min_size < PAGE_SIZE) {
+		min_size = PAGE_SIZE;
+		alloc_sizes |= PAGE_SIZE;
+	} else {
+		size = ALIGN(size, min_size);
+	}
+	if (dma_get_attr(DMA_ATTR_ALLOC_SINGLE_PAGES, attrs))
+		alloc_sizes = min_size;
+
+	count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	pages = __iommu_dma_alloc_pages(count, alloc_sizes >> PAGE_SHIFT, gfp);
 	if (!pages)
 		return NULL;
 
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index fc481037478a..8443bbb5c071 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -38,8 +38,8 @@ int dma_direction_to_prot(enum dma_data_direction dir, bool coherent);
  * These implement the bulk of the relevant DMA mapping callbacks, but require
  * the arch code to take care of attributes and cache maintenance
  */
-struct page **iommu_dma_alloc(struct device *dev, size_t size,
-		gfp_t gfp, int prot, dma_addr_t *handle,
+struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
+		struct dma_attrs *attrs, int prot, dma_addr_t *handle,
 		void (*flush_page)(struct device *, const void *, phys_addr_t));
 void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
 		dma_addr_t *handle);
-- 
cgit v1.2.3


From 327156c593600e0f08575621c2a56f311d482e7a Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Thu, 28 Apr 2016 15:28:56 +0530
Subject: mfd: max77620: Add core driver for MAX77620/MAX20024

MAX77620/MAX20024 are Power Management IC from the MAXIM.
It supports RTC, multiple GPIOs, multiple DCDC and LDOs,
watchdog, clock etc.

Add MFD drier to provides common support for accessing the
device; additional drivers is developed on respected subsystem
in order to use the functionality of the device.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Mallikarjun Kasoju <mkasoju@nvidia.com>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig          |  15 ++
 drivers/mfd/Makefile         |   1 +
 drivers/mfd/max77620.c       | 590 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/max77620.h | 346 +++++++++++++++++++++++++
 4 files changed, 952 insertions(+)
 create mode 100644 drivers/mfd/max77620.c
 create mode 100644 include/linux/mfd/max77620.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index cf7cbba61971..16d0beffc1cd 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -527,6 +527,21 @@ config MFD_MAX14577
 	  additional drivers must be enabled in order to use the functionality
 	  of the device.
 
+config MFD_MAX77620
+	bool "Maxim Semiconductor MAX77620 and MAX20024 PMIC Support"
+	depends on I2C=y
+	depends on OF
+	select MFD_CORE
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select IRQ_DOMAIN
+	help
+	  Say yes here to add support for Maxim Semiconductor MAX77620 and
+	  MAX20024 which are Power Management IC with General purpose pins,
+	  RTC, regulators, clock generator, watchdog etc. This driver
+	  provides common support for accessing the device; additional drivers
+	  must be enabled in order to use the functionality of the device.
+
 config MFD_MAX77686
 	tristate "Maxim Semiconductor MAX77686/802 PMIC Support"
 	depends on I2C
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 5eaa6465d0a6..921a08dad9b3 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_MFD_DA9063)	+= da9063.o
 obj-$(CONFIG_MFD_DA9150)	+= da9150-core.o
 
 obj-$(CONFIG_MFD_MAX14577)	+= max14577.o
+obj-$(CONFIG_MFD_MAX77620)	+= max77620.o
 obj-$(CONFIG_MFD_MAX77686)	+= max77686.o
 obj-$(CONFIG_MFD_MAX77693)	+= max77693.o
 obj-$(CONFIG_MFD_MAX77843)	+= max77843.o
diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c
new file mode 100644
index 000000000000..199d261990be
--- /dev/null
+++ b/drivers/mfd/max77620.c
@@ -0,0 +1,590 @@
+/*
+ * Maxim MAX77620 MFD Driver
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * Author:
+ *	Laxman Dewangan <ldewangan@nvidia.com>
+ *	Chaitanya Bandi <bandik@nvidia.com>
+ *	Mallikarjun Kasoju <mkasoju@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/****************** Teminology used in driver ********************
+ * Here are some terminology used from datasheet for quick reference:
+ * Flexible Power Sequence (FPS):
+ * The Flexible Power Sequencer (FPS) allows each regulator to power up under
+ * hardware or software control. Additionally, each regulator can power on
+ * independently or among a group of other regulators with an adjustable
+ * power-up and power-down delays (sequencing). GPIO1, GPIO2, and GPIO3 can
+ * be programmed to be part of a sequence allowing external regulators to be
+ * sequenced along with internal regulators. 32KHz clock can be programmed to
+ * be part of a sequence.
+ * There is 3 FPS confguration registers and all resources are configured to
+ * any of these FPS or no FPS.
+ */
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/max77620.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+static struct resource gpio_resources[] = {
+	DEFINE_RES_IRQ(MAX77620_IRQ_TOP_GPIO),
+};
+
+static struct resource power_resources[] = {
+	DEFINE_RES_IRQ(MAX77620_IRQ_LBT_MBATLOW),
+};
+
+static struct resource rtc_resources[] = {
+	DEFINE_RES_IRQ(MAX77620_IRQ_TOP_RTC),
+};
+
+static struct resource thermal_resources[] = {
+	DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM1),
+	DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM2),
+};
+
+static const struct regmap_irq max77620_top_irqs[] = {
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GLBL, 0, MAX77620_IRQ_TOP_GLBL_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_SD, 0, MAX77620_IRQ_TOP_SD_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_LDO, 0, MAX77620_IRQ_TOP_LDO_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GPIO, 0, MAX77620_IRQ_TOP_GPIO_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_RTC, 0, MAX77620_IRQ_TOP_RTC_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_32K, 0, MAX77620_IRQ_TOP_32K_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_TOP_ONOFF, 0, MAX77620_IRQ_TOP_ONOFF_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_LBT_MBATLOW, 1, MAX77620_IRQ_LBM_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM1, 1, MAX77620_IRQ_TJALRM1_MASK),
+	REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM2, 1, MAX77620_IRQ_TJALRM2_MASK),
+};
+
+static const struct mfd_cell max77620_children[] = {
+	{ .name = "max77620-pinctrl", },
+	{ .name = "max77620-clock", },
+	{ .name = "max77620-pmic", },
+	{ .name = "max77620-watchdog", },
+	{
+		.name = "max77620-gpio",
+		.resources = gpio_resources,
+		.num_resources = ARRAY_SIZE(gpio_resources),
+	}, {
+		.name = "max77620-rtc",
+		.resources = rtc_resources,
+		.num_resources = ARRAY_SIZE(rtc_resources),
+	}, {
+		.name = "max77620-power",
+		.resources = power_resources,
+		.num_resources = ARRAY_SIZE(power_resources),
+	}, {
+		.name = "max77620-thermal",
+		.resources = thermal_resources,
+		.num_resources = ARRAY_SIZE(thermal_resources),
+	},
+};
+
+static const struct mfd_cell max20024_children[] = {
+	{ .name = "max20024-pinctrl", },
+	{ .name = "max77620-clock", },
+	{ .name = "max20024-pmic", },
+	{ .name = "max77620-watchdog", },
+	{
+		.name = "max77620-gpio",
+		.resources = gpio_resources,
+		.num_resources = ARRAY_SIZE(gpio_resources),
+	}, {
+		.name = "max77620-rtc",
+		.resources = rtc_resources,
+		.num_resources = ARRAY_SIZE(rtc_resources),
+	}, {
+		.name = "max20024-power",
+		.resources = power_resources,
+		.num_resources = ARRAY_SIZE(power_resources),
+	},
+};
+
+static struct regmap_irq_chip max77620_top_irq_chip = {
+	.name = "max77620-top",
+	.irqs = max77620_top_irqs,
+	.num_irqs = ARRAY_SIZE(max77620_top_irqs),
+	.num_regs = 2,
+	.status_base = MAX77620_REG_IRQTOP,
+	.mask_base = MAX77620_REG_IRQTOPM,
+};
+
+static const struct regmap_range max77620_readable_ranges[] = {
+	regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+};
+
+static const struct regmap_access_table max77620_readable_table = {
+	.yes_ranges = max77620_readable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(max77620_readable_ranges),
+};
+
+static const struct regmap_range max20024_readable_ranges[] = {
+	regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+	regmap_reg_range(MAX20024_REG_MAX_ADD, MAX20024_REG_MAX_ADD),
+};
+
+static const struct regmap_access_table max20024_readable_table = {
+	.yes_ranges = max20024_readable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(max20024_readable_ranges),
+};
+
+static const struct regmap_range max77620_writable_ranges[] = {
+	regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+};
+
+static const struct regmap_access_table max77620_writable_table = {
+	.yes_ranges = max77620_writable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(max77620_writable_ranges),
+};
+
+static const struct regmap_range max77620_cacheable_ranges[] = {
+	regmap_reg_range(MAX77620_REG_SD0_CFG, MAX77620_REG_LDO_CFG3),
+	regmap_reg_range(MAX77620_REG_FPS_CFG0, MAX77620_REG_FPS_SD3),
+};
+
+static const struct regmap_access_table max77620_volatile_table = {
+	.no_ranges = max77620_cacheable_ranges,
+	.n_no_ranges = ARRAY_SIZE(max77620_cacheable_ranges),
+};
+
+static const struct regmap_config max77620_regmap_config = {
+	.name = "power-slave",
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = MAX77620_REG_DVSSD4 + 1,
+	.cache_type = REGCACHE_RBTREE,
+	.rd_table = &max77620_readable_table,
+	.wr_table = &max77620_writable_table,
+	.volatile_table = &max77620_volatile_table,
+};
+
+static const struct regmap_config max20024_regmap_config = {
+	.name = "power-slave",
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = MAX20024_REG_MAX_ADD + 1,
+	.cache_type = REGCACHE_RBTREE,
+	.rd_table = &max20024_readable_table,
+	.wr_table = &max77620_writable_table,
+	.volatile_table = &max77620_volatile_table,
+};
+
+/* max77620_get_fps_period_reg_value:  Get FPS bit field value from
+ *				       requested periods.
+ * MAX77620 supports the FPS period of 40, 80, 160, 320, 540, 1280, 2560
+ * and 5120 microseconds. MAX20024 supports the FPS period of 20, 40, 80,
+ * 160, 320, 540, 1280 and 2560 microseconds.
+ * The FPS register has 3 bits field to set the FPS period as
+ * bits		max77620		max20024
+ * 000		40			20
+ * 001		80			40
+ * :::
+*/
+static int max77620_get_fps_period_reg_value(struct max77620_chip *chip,
+					     int tperiod)
+{
+	int fps_min_period;
+	int i;
+
+	switch (chip->chip_id) {
+	case MAX20024:
+		fps_min_period = MAX20024_FPS_PERIOD_MIN_US;
+		break;
+	case MAX77620:
+		fps_min_period = MAX77620_FPS_PERIOD_MIN_US;
+	default:
+		return -EINVAL;
+	}
+
+	for (i = 0; i < 7; i++) {
+		if (fps_min_period >= tperiod)
+			return i;
+		fps_min_period *= 2;
+	}
+
+	return i;
+}
+
+/* max77620_config_fps: Configure FPS configuration registers
+ *			based on platform specific information.
+ */
+static int max77620_config_fps(struct max77620_chip *chip,
+			       struct device_node *fps_np)
+{
+	struct device *dev = chip->dev;
+	unsigned int mask = 0, config = 0;
+	u32 fps_max_period;
+	u32 param_val;
+	int tperiod, fps_id;
+	int ret;
+	char fps_name[10];
+
+	switch (chip->chip_id) {
+	case MAX20024:
+		fps_max_period = MAX20024_FPS_PERIOD_MAX_US;
+		break;
+	case MAX77620:
+		fps_max_period = MAX77620_FPS_PERIOD_MAX_US;
+	default:
+		return -EINVAL;
+	}
+
+	for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) {
+		sprintf(fps_name, "fps%d", fps_id);
+		if (!strcmp(fps_np->name, fps_name))
+			break;
+	}
+
+	if (fps_id == MAX77620_FPS_COUNT) {
+		dev_err(dev, "FPS node name %s is not valid\n", fps_np->name);
+		return -EINVAL;
+	}
+
+	ret = of_property_read_u32(fps_np, "maxim,shutdown-fps-time-period-us",
+				   &param_val);
+	if (!ret) {
+		mask |= MAX77620_FPS_TIME_PERIOD_MASK;
+		chip->shutdown_fps_period[fps_id] = min(param_val,
+							fps_max_period);
+		tperiod = max77620_get_fps_period_reg_value(chip,
+				chip->shutdown_fps_period[fps_id]);
+		config |= tperiod << MAX77620_FPS_TIME_PERIOD_SHIFT;
+	}
+
+	ret = of_property_read_u32(fps_np, "maxim,suspend-fps-time-period-us",
+				   &param_val);
+	if (!ret)
+		chip->suspend_fps_period[fps_id] = min(param_val,
+						       fps_max_period);
+
+	ret = of_property_read_u32(fps_np, "maxim,fps-event-source",
+				   &param_val);
+	if (!ret) {
+		if (param_val > 2) {
+			dev_err(dev, "FPS%d event-source invalid\n", fps_id);
+			return -EINVAL;
+		}
+		mask |= MAX77620_FPS_EN_SRC_MASK;
+		config |= param_val << MAX77620_FPS_EN_SRC_SHIFT;
+		if (param_val == 2) {
+			mask |= MAX77620_FPS_ENFPS_SW_MASK;
+			config |= MAX77620_FPS_ENFPS_SW;
+		}
+	}
+
+	if (!chip->sleep_enable && !chip->enable_global_lpm) {
+		ret = of_property_read_u32(fps_np,
+				"maxim,device-state-on-disabled-event",
+				&param_val);
+		if (!ret) {
+			if (param_val == 0)
+				chip->sleep_enable = true;
+			else if (param_val == 1)
+				chip->enable_global_lpm = true;
+		}
+	}
+
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 + fps_id,
+				 mask, config);
+	if (ret < 0) {
+		dev_err(dev, "Failed to update FPS CFG: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int max77620_initialise_fps(struct max77620_chip *chip)
+{
+	struct device *dev = chip->dev;
+	struct device_node *fps_np, *fps_child;
+	u8 config;
+	int fps_id;
+	int ret;
+
+	for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) {
+		chip->shutdown_fps_period[fps_id] = -1;
+		chip->suspend_fps_period[fps_id] = -1;
+	}
+
+	fps_np = of_get_child_by_name(dev->of_node, "fps");
+	if (!fps_np)
+		goto skip_fps;
+
+	for_each_child_of_node(fps_np, fps_child) {
+		ret = max77620_config_fps(chip, fps_child);
+		if (ret < 0)
+			return ret;
+	}
+
+	config = chip->enable_global_lpm ? MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0;
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+				 MAX77620_ONOFFCNFG2_SLP_LPM_MSK, config);
+	if (ret < 0) {
+		dev_err(dev, "Failed to update SLP_LPM: %d\n", ret);
+		return ret;
+	}
+
+skip_fps:
+	/* Enable wake on EN0 pin */
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+				 MAX77620_ONOFFCNFG2_WK_EN0,
+				 MAX77620_ONOFFCNFG2_WK_EN0);
+	if (ret < 0) {
+		dev_err(dev, "Failed to update WK_EN0: %d\n", ret);
+		return ret;
+	}
+
+	/* For MAX20024, SLPEN will be POR reset if CLRSE is b11 */
+	if ((chip->chip_id == MAX20024) && chip->sleep_enable) {
+		config = MAX77620_ONOFFCNFG1_SLPEN | MAX20024_ONOFFCNFG1_CLRSE;
+		ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1,
+					 config, config);
+		if (ret < 0) {
+			dev_err(dev, "Failed to update SLPEN: %d\n", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int max77620_read_es_version(struct max77620_chip *chip)
+{
+	unsigned int val;
+	u8 cid_val[6];
+	int i;
+	int ret;
+
+	for (i = MAX77620_REG_CID0; i <= MAX77620_REG_CID5; i++) {
+		ret = regmap_read(chip->rmap, i, &val);
+		if (ret < 0) {
+			dev_err(chip->dev, "Failed to read CID: %d\n", ret);
+			return ret;
+		}
+		dev_dbg(chip->dev, "CID%d: 0x%02x\n",
+			i - MAX77620_REG_CID0, val);
+		cid_val[i - MAX77620_REG_CID0] = val;
+	}
+
+	/* CID4 is OTP Version  and CID5 is ES version */
+	dev_info(chip->dev, "PMIC Version OTP:0x%02X and ES:0x%X\n",
+		 cid_val[4], MAX77620_CID5_DIDM(cid_val[5]));
+
+	return ret;
+}
+
+static int max77620_probe(struct i2c_client *client,
+			  const struct i2c_device_id *id)
+{
+	const struct regmap_config *rmap_config;
+	struct max77620_chip *chip;
+	const struct mfd_cell *mfd_cells;
+	int n_mfd_cells;
+	int ret;
+
+	chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	i2c_set_clientdata(client, chip);
+	chip->dev = &client->dev;
+	chip->irq_base = -1;
+	chip->chip_irq = client->irq;
+	chip->chip_id = (enum max77620_chip_id)id->driver_data;
+
+	switch (chip->chip_id) {
+	case MAX77620:
+		mfd_cells = max77620_children;
+		n_mfd_cells = ARRAY_SIZE(max77620_children);
+		rmap_config = &max77620_regmap_config;
+		break;
+	case MAX20024:
+		mfd_cells = max20024_children;
+		n_mfd_cells = ARRAY_SIZE(max20024_children);
+		rmap_config = &max20024_regmap_config;
+		break;
+	default:
+		dev_err(chip->dev, "ChipID is invalid %d\n", chip->chip_id);
+		return -EINVAL;
+	}
+
+	chip->rmap = devm_regmap_init_i2c(client, rmap_config);
+	if (IS_ERR(chip->rmap)) {
+		ret = PTR_ERR(chip->rmap);
+		dev_err(chip->dev, "Failed to intialise regmap: %d\n", ret);
+		return ret;
+	}
+
+	ret = max77620_read_es_version(chip);
+	if (ret < 0)
+		return ret;
+
+	ret = devm_regmap_add_irq_chip(chip->dev, chip->rmap, client->irq,
+				       IRQF_ONESHOT | IRQF_SHARED,
+				       chip->irq_base, &max77620_top_irq_chip,
+				       &chip->top_irq_data);
+	if (ret < 0) {
+		dev_err(chip->dev, "Failed to add regmap irq: %d\n", ret);
+		return ret;
+	}
+
+	ret = max77620_initialise_fps(chip);
+	if (ret < 0)
+		return ret;
+
+	ret =  devm_mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE,
+				    mfd_cells, n_mfd_cells, NULL, 0,
+				    regmap_irq_get_domain(chip->top_irq_data));
+	if (ret < 0) {
+		dev_err(chip->dev, "Failed to add MFD children: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int max77620_set_fps_period(struct max77620_chip *chip,
+				   int fps_id, int time_period)
+{
+	int period = max77620_get_fps_period_reg_value(chip, time_period);
+	int ret;
+
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 + fps_id,
+				 MAX77620_FPS_TIME_PERIOD_MASK,
+				 period << MAX77620_FPS_TIME_PERIOD_SHIFT);
+	if (ret < 0) {
+		dev_err(chip->dev, "Failed to update FPS period: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int max77620_i2c_suspend(struct device *dev)
+{
+	struct max77620_chip *chip = dev_get_drvdata(dev);
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned int config;
+	int fps;
+	int ret;
+
+	for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) {
+		if (chip->suspend_fps_period[fps] < 0)
+			continue;
+
+		ret = max77620_set_fps_period(chip, fps,
+					      chip->suspend_fps_period[fps]);
+		if (ret < 0)
+			return ret;
+	}
+
+	/*
+	 * For MAX20024: No need to configure SLPEN on suspend as
+	 * it will be configured on Init.
+	 */
+	if (chip->chip_id == MAX20024)
+		goto out;
+
+	config = (chip->sleep_enable) ? MAX77620_ONOFFCNFG1_SLPEN : 0;
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1,
+				 MAX77620_ONOFFCNFG1_SLPEN,
+				 config);
+	if (ret < 0) {
+		dev_err(dev, "Failed to configure sleep in suspend: %d\n", ret);
+		return ret;
+	}
+
+	/* Disable WK_EN0 */
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+				 MAX77620_ONOFFCNFG2_WK_EN0, 0);
+	if (ret < 0) {
+		dev_err(dev, "Failed to configure WK_EN in suspend: %d\n", ret);
+		return ret;
+	}
+
+out:
+	disable_irq(client->irq);
+
+	return 0;
+}
+
+static int max77620_i2c_resume(struct device *dev)
+{
+	struct max77620_chip *chip = dev_get_drvdata(dev);
+	struct i2c_client *client = to_i2c_client(dev);
+	int ret;
+	int fps;
+
+	for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) {
+		if (chip->shutdown_fps_period[fps] < 0)
+			continue;
+
+		ret = max77620_set_fps_period(chip, fps,
+					      chip->shutdown_fps_period[fps]);
+		if (ret < 0)
+			return ret;
+	}
+
+	/*
+	 * For MAX20024: No need to configure WKEN0 on resume as
+	 * it is configured on Init.
+	 */
+	if (chip->chip_id == MAX20024)
+		goto out;
+
+	/* Enable WK_EN0 */
+	ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+				 MAX77620_ONOFFCNFG2_WK_EN0,
+				 MAX77620_ONOFFCNFG2_WK_EN0);
+	if (ret < 0) {
+		dev_err(dev, "Failed to configure WK_EN0 n resume: %d\n", ret);
+		return ret;
+	}
+
+out:
+	enable_irq(client->irq);
+
+	return 0;
+}
+#endif
+
+static const struct i2c_device_id max77620_id[] = {
+	{"max77620", MAX77620},
+	{"max20024", MAX20024},
+	{},
+};
+MODULE_DEVICE_TABLE(i2c, max77620_id);
+
+static const struct dev_pm_ops max77620_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(max77620_i2c_suspend, max77620_i2c_resume)
+};
+
+static struct i2c_driver max77620_driver = {
+	.driver = {
+		.name = "max77620",
+		.pm = &max77620_pm_ops,
+	},
+	.probe = max77620_probe,
+	.id_table = max77620_id,
+};
+
+module_i2c_driver(max77620_driver);
+
+MODULE_DESCRIPTION("MAX77620/MAX20024 Multi Function Device Core Driver");
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_AUTHOR("Chaitanya Bandi <bandik@nvidia.com>");
+MODULE_AUTHOR("Mallikarjun Kasoju <mkasoju@nvidia.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/max77620.h b/include/linux/mfd/max77620.h
new file mode 100644
index 000000000000..3ca0af07fc78
--- /dev/null
+++ b/include/linux/mfd/max77620.h
@@ -0,0 +1,346 @@
+/*
+ * Defining registers address and its bit definitions of MAX77620 and MAX20024
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef _MFD_MAX77620_H_
+#define _MFD_MAX77620_H_
+
+#include <linux/types.h>
+
+/* GLOBAL, PMIC, GPIO, FPS, ONOFFC, CID Registers */
+#define MAX77620_REG_CNFGGLBL1			0x00
+#define MAX77620_REG_CNFGGLBL2			0x01
+#define MAX77620_REG_CNFGGLBL3			0x02
+#define MAX77620_REG_CNFG1_32K			0x03
+#define MAX77620_REG_CNFGBBC			0x04
+#define MAX77620_REG_IRQTOP			0x05
+#define MAX77620_REG_INTLBT			0x06
+#define MAX77620_REG_IRQSD			0x07
+#define MAX77620_REG_IRQ_LVL2_L0_7		0x08
+#define MAX77620_REG_IRQ_LVL2_L8		0x09
+#define MAX77620_REG_IRQ_LVL2_GPIO		0x0A
+#define MAX77620_REG_ONOFFIRQ			0x0B
+#define MAX77620_REG_NVERC			0x0C
+#define MAX77620_REG_IRQTOPM			0x0D
+#define MAX77620_REG_INTENLBT			0x0E
+#define MAX77620_REG_IRQMASKSD			0x0F
+#define MAX77620_REG_IRQ_MSK_L0_7		0x10
+#define MAX77620_REG_IRQ_MSK_L8			0x11
+#define MAX77620_REG_ONOFFIRQM			0x12
+#define MAX77620_REG_STATLBT			0x13
+#define MAX77620_REG_STATSD			0x14
+#define MAX77620_REG_ONOFFSTAT			0x15
+
+/* SD and LDO Registers */
+#define MAX77620_REG_SD0			0x16
+#define MAX77620_REG_SD1			0x17
+#define MAX77620_REG_SD2			0x18
+#define MAX77620_REG_SD3			0x19
+#define MAX77620_REG_SD4			0x1A
+#define MAX77620_REG_DVSSD0			0x1B
+#define MAX77620_REG_DVSSD1			0x1C
+#define MAX77620_REG_SD0_CFG			0x1D
+#define MAX77620_REG_SD1_CFG			0x1E
+#define MAX77620_REG_SD2_CFG			0x1F
+#define MAX77620_REG_SD3_CFG			0x20
+#define MAX77620_REG_SD4_CFG			0x21
+#define MAX77620_REG_SD_CFG2			0x22
+#define MAX77620_REG_LDO0_CFG			0x23
+#define MAX77620_REG_LDO0_CFG2			0x24
+#define MAX77620_REG_LDO1_CFG			0x25
+#define MAX77620_REG_LDO1_CFG2			0x26
+#define MAX77620_REG_LDO2_CFG			0x27
+#define MAX77620_REG_LDO2_CFG2			0x28
+#define MAX77620_REG_LDO3_CFG			0x29
+#define MAX77620_REG_LDO3_CFG2			0x2A
+#define MAX77620_REG_LDO4_CFG			0x2B
+#define MAX77620_REG_LDO4_CFG2			0x2C
+#define MAX77620_REG_LDO5_CFG			0x2D
+#define MAX77620_REG_LDO5_CFG2			0x2E
+#define MAX77620_REG_LDO6_CFG			0x2F
+#define MAX77620_REG_LDO6_CFG2			0x30
+#define MAX77620_REG_LDO7_CFG			0x31
+#define MAX77620_REG_LDO7_CFG2			0x32
+#define MAX77620_REG_LDO8_CFG			0x33
+#define MAX77620_REG_LDO8_CFG2			0x34
+#define MAX77620_REG_LDO_CFG3			0x35
+
+#define MAX77620_LDO_SLEW_RATE_MASK		0x1
+
+/* LDO Configuration 3 */
+#define MAX77620_TRACK4_MASK			BIT(5)
+#define MAX77620_TRACK4_SHIFT			5
+
+/* Voltage */
+#define MAX77620_SDX_VOLT_MASK			0xFF
+#define MAX77620_SD0_VOLT_MASK			0x3F
+#define MAX77620_SD1_VOLT_MASK			0x7F
+#define MAX77620_LDO_VOLT_MASK			0x3F
+
+#define MAX77620_REG_GPIO0			0x36
+#define MAX77620_REG_GPIO1			0x37
+#define MAX77620_REG_GPIO2			0x38
+#define MAX77620_REG_GPIO3			0x39
+#define MAX77620_REG_GPIO4			0x3A
+#define MAX77620_REG_GPIO5			0x3B
+#define MAX77620_REG_GPIO6			0x3C
+#define MAX77620_REG_GPIO7			0x3D
+#define MAX77620_REG_PUE_GPIO			0x3E
+#define MAX77620_REG_PDE_GPIO			0x3F
+#define MAX77620_REG_AME_GPIO			0x40
+#define MAX77620_REG_ONOFFCNFG1			0x41
+#define MAX77620_REG_ONOFFCNFG2			0x42
+
+/* FPS Registers */
+#define MAX77620_REG_FPS_CFG0			0x43
+#define MAX77620_REG_FPS_CFG1			0x44
+#define MAX77620_REG_FPS_CFG2			0x45
+#define MAX77620_REG_FPS_LDO0			0x46
+#define MAX77620_REG_FPS_LDO1			0x47
+#define MAX77620_REG_FPS_LDO2			0x48
+#define MAX77620_REG_FPS_LDO3			0x49
+#define MAX77620_REG_FPS_LDO4			0x4A
+#define MAX77620_REG_FPS_LDO5			0x4B
+#define MAX77620_REG_FPS_LDO6			0x4C
+#define MAX77620_REG_FPS_LDO7			0x4D
+#define MAX77620_REG_FPS_LDO8			0x4E
+#define MAX77620_REG_FPS_SD0			0x4F
+#define MAX77620_REG_FPS_SD1			0x50
+#define MAX77620_REG_FPS_SD2			0x51
+#define MAX77620_REG_FPS_SD3			0x52
+#define MAX77620_REG_FPS_SD4			0x53
+#define MAX77620_REG_FPS_NONE			0
+
+#define MAX77620_FPS_SRC_MASK			0xC0
+#define MAX77620_FPS_SRC_SHIFT			6
+#define MAX77620_FPS_PU_PERIOD_MASK		0x38
+#define MAX77620_FPS_PU_PERIOD_SHIFT		3
+#define MAX77620_FPS_PD_PERIOD_MASK		0x07
+#define MAX77620_FPS_PD_PERIOD_SHIFT		0
+#define MAX77620_FPS_TIME_PERIOD_MASK		0x38
+#define MAX77620_FPS_TIME_PERIOD_SHIFT		3
+#define MAX77620_FPS_EN_SRC_MASK		0x06
+#define MAX77620_FPS_EN_SRC_SHIFT		1
+#define MAX77620_FPS_ENFPS_SW_MASK		0x01
+#define MAX77620_FPS_ENFPS_SW			0x01
+
+/* Minimum and maximum FPS period time (in microseconds) are
+ * different for MAX77620 and Max20024.
+ */
+#define MAX77620_FPS_PERIOD_MIN_US		40
+#define MAX20024_FPS_PERIOD_MIN_US		20
+
+#define MAX77620_FPS_PERIOD_MAX_US		2560
+#define MAX20024_FPS_PERIOD_MAX_US		5120
+
+#define MAX77620_REG_FPS_GPIO1			0x54
+#define MAX77620_REG_FPS_GPIO2			0x55
+#define MAX77620_REG_FPS_GPIO3			0x56
+#define MAX77620_REG_FPS_RSO			0x57
+#define MAX77620_REG_CID0			0x58
+#define MAX77620_REG_CID1			0x59
+#define MAX77620_REG_CID2			0x5A
+#define MAX77620_REG_CID3			0x5B
+#define MAX77620_REG_CID4			0x5C
+#define MAX77620_REG_CID5			0x5D
+
+#define MAX77620_REG_DVSSD4			0x5E
+#define MAX20024_REG_MAX_ADD			0x70
+
+#define MAX77620_CID_DIDM_MASK			0xF0
+#define MAX77620_CID_DIDM_SHIFT			4
+
+/* CNCG2SD */
+#define MAX77620_SD_CNF2_ROVS_EN_SD1		BIT(1)
+#define MAX77620_SD_CNF2_ROVS_EN_SD0		BIT(2)
+
+/* Device Identification Metal */
+#define MAX77620_CID5_DIDM(n)			(((n) >> 4) & 0xF)
+/* Device Indentification OTP */
+#define MAX77620_CID5_DIDO(n)			((n) & 0xF)
+
+/* SD CNFG1 */
+#define MAX77620_SD_SR_MASK			0xC0
+#define MAX77620_SD_SR_SHIFT			6
+#define MAX77620_SD_POWER_MODE_MASK		0x30
+#define MAX77620_SD_POWER_MODE_SHIFT		4
+#define MAX77620_SD_CFG1_ADE_MASK		BIT(3)
+#define MAX77620_SD_CFG1_ADE_DISABLE		0
+#define MAX77620_SD_CFG1_ADE_ENABLE		BIT(3)
+#define MAX77620_SD_FPWM_MASK			0x04
+#define MAX77620_SD_FPWM_SHIFT			2
+#define MAX77620_SD_FSRADE_MASK			0x01
+#define MAX77620_SD_FSRADE_SHIFT		0
+#define MAX77620_SD_CFG1_FPWM_SD_MASK		BIT(2)
+#define MAX77620_SD_CFG1_FPWM_SD_SKIP		0
+#define MAX77620_SD_CFG1_FPWM_SD_FPWM		BIT(2)
+#define MAX77620_SD_CFG1_FSRADE_SD_MASK		BIT(0)
+#define MAX77620_SD_CFG1_FSRADE_SD_DISABLE	0
+#define MAX77620_SD_CFG1_FSRADE_SD_ENABLE	BIT(0)
+
+/* LDO_CNFG2 */
+#define MAX77620_LDO_POWER_MODE_MASK		0xC0
+#define MAX77620_LDO_POWER_MODE_SHIFT		6
+#define MAX77620_LDO_CFG2_ADE_MASK		BIT(1)
+#define MAX77620_LDO_CFG2_ADE_DISABLE		0
+#define MAX77620_LDO_CFG2_ADE_ENABLE		BIT(1)
+#define MAX77620_LDO_CFG2_SS_MASK		BIT(0)
+#define MAX77620_LDO_CFG2_SS_FAST		BIT(0)
+#define MAX77620_LDO_CFG2_SS_SLOW		0
+
+#define MAX77620_IRQ_TOP_GLBL_MASK		BIT(7)
+#define MAX77620_IRQ_TOP_SD_MASK		BIT(6)
+#define MAX77620_IRQ_TOP_LDO_MASK		BIT(5)
+#define MAX77620_IRQ_TOP_GPIO_MASK		BIT(4)
+#define MAX77620_IRQ_TOP_RTC_MASK		BIT(3)
+#define MAX77620_IRQ_TOP_32K_MASK		BIT(2)
+#define MAX77620_IRQ_TOP_ONOFF_MASK		BIT(1)
+
+#define MAX77620_IRQ_LBM_MASK			BIT(3)
+#define MAX77620_IRQ_TJALRM1_MASK		BIT(2)
+#define MAX77620_IRQ_TJALRM2_MASK		BIT(1)
+
+#define MAX77620_PWR_I2C_ADDR			0x3c
+#define MAX77620_RTC_I2C_ADDR			0x68
+
+#define MAX77620_CNFG_GPIO_DRV_MASK		BIT(0)
+#define MAX77620_CNFG_GPIO_DRV_PUSHPULL		BIT(0)
+#define MAX77620_CNFG_GPIO_DRV_OPENDRAIN	0
+#define MAX77620_CNFG_GPIO_DIR_MASK		BIT(1)
+#define MAX77620_CNFG_GPIO_DIR_INPUT		BIT(1)
+#define MAX77620_CNFG_GPIO_DIR_OUTPUT		0
+#define MAX77620_CNFG_GPIO_INPUT_VAL_MASK	BIT(2)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_MASK	BIT(3)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_HIGH	BIT(3)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_LOW	0
+#define MAX77620_CNFG_GPIO_INT_MASK		(0x3 << 4)
+#define MAX77620_CNFG_GPIO_INT_FALLING		BIT(4)
+#define MAX77620_CNFG_GPIO_INT_RISING		BIT(5)
+#define MAX77620_CNFG_GPIO_DBNC_MASK		(0x3 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_None		(0x0 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_8ms		(0x1 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_16ms		(0x2 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_32ms		(0x3 << 6)
+
+#define MAX77620_IRQ_LVL2_GPIO_EDGE0		BIT(0)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE1		BIT(1)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE2		BIT(2)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE3		BIT(3)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE4		BIT(4)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE5		BIT(5)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE6		BIT(6)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE7		BIT(7)
+
+#define MAX77620_CNFG1_32K_OUT0_EN		BIT(2)
+
+#define MAX77620_ONOFFCNFG1_SFT_RST		BIT(7)
+#define MAX77620_ONOFFCNFG1_MRT_MASK		0x38
+#define MAX77620_ONOFFCNFG1_MRT_SHIFT		0x3
+#define MAX77620_ONOFFCNFG1_SLPEN		BIT(2)
+#define MAX77620_ONOFFCNFG1_PWR_OFF		BIT(1)
+#define MAX20024_ONOFFCNFG1_CLRSE		0x18
+
+#define MAX77620_ONOFFCNFG2_SFT_RST_WK		BIT(7)
+#define MAX77620_ONOFFCNFG2_WD_RST_WK		BIT(6)
+#define MAX77620_ONOFFCNFG2_SLP_LPM_MSK		BIT(5)
+#define MAX77620_ONOFFCNFG2_WK_ALARM1		BIT(2)
+#define MAX77620_ONOFFCNFG2_WK_EN0		BIT(0)
+
+#define MAX77620_GLBLM_MASK			BIT(0)
+
+#define MAX77620_WDTC_MASK			0x3
+#define MAX77620_WDTOFFC			BIT(4)
+#define MAX77620_WDTSLPC			BIT(3)
+#define MAX77620_WDTEN				BIT(2)
+
+#define MAX77620_TWD_MASK			0x3
+#define MAX77620_TWD_2s				0x0
+#define MAX77620_TWD_16s			0x1
+#define MAX77620_TWD_64s			0x2
+#define MAX77620_TWD_128s			0x3
+
+#define MAX77620_CNFGGLBL1_LBDAC_EN		BIT(7)
+#define MAX77620_CNFGGLBL1_MPPLD		BIT(6)
+#define MAX77620_CNFGGLBL1_LBHYST		(BIT(5) | BIT(4))
+#define MAX77620_CNFGGLBL1_LBDAC		0x0E
+#define MAX77620_CNFGGLBL1_LBRSTEN		BIT(0)
+
+/* CNFG BBC registers */
+#define MAX77620_CNFGBBC_ENABLE			BIT(0)
+#define MAX77620_CNFGBBC_CURRENT_MASK		0x06
+#define MAX77620_CNFGBBC_CURRENT_SHIFT		1
+#define MAX77620_CNFGBBC_VOLTAGE_MASK		0x18
+#define MAX77620_CNFGBBC_VOLTAGE_SHIFT		3
+#define MAX77620_CNFGBBC_LOW_CURRENT_DISABLE	BIT(5)
+#define MAX77620_CNFGBBC_RESISTOR_MASK		0xC0
+#define MAX77620_CNFGBBC_RESISTOR_SHIFT		6
+
+#define MAX77620_FPS_COUNT			3
+
+/* Interrupts */
+enum {
+	MAX77620_IRQ_TOP_GLBL,		/* Low-Battery */
+	MAX77620_IRQ_TOP_SD,		/* SD power fail */
+	MAX77620_IRQ_TOP_LDO,		/* LDO power fail */
+	MAX77620_IRQ_TOP_GPIO,		/* TOP GPIO internal int to MAX77620 */
+	MAX77620_IRQ_TOP_RTC,		/* RTC */
+	MAX77620_IRQ_TOP_32K,		/* 32kHz oscillator */
+	MAX77620_IRQ_TOP_ONOFF,		/* ON/OFF oscillator */
+	MAX77620_IRQ_LBT_MBATLOW,	/* Thermal alarm status, > 120C */
+	MAX77620_IRQ_LBT_TJALRM1,	/* Thermal alarm status, > 120C */
+	MAX77620_IRQ_LBT_TJALRM2,	/* Thermal alarm status, > 140C */
+};
+
+/* GPIOs */
+enum {
+	MAX77620_GPIO0,
+	MAX77620_GPIO1,
+	MAX77620_GPIO2,
+	MAX77620_GPIO3,
+	MAX77620_GPIO4,
+	MAX77620_GPIO5,
+	MAX77620_GPIO6,
+	MAX77620_GPIO7,
+	MAX77620_GPIO_NR,
+};
+
+/* FPS Source */
+enum max77620_fps_src {
+	MAX77620_FPS_SRC_0,
+	MAX77620_FPS_SRC_1,
+	MAX77620_FPS_SRC_2,
+	MAX77620_FPS_SRC_NONE,
+	MAX77620_FPS_SRC_DEF,
+};
+
+enum max77620_chip_id {
+	MAX77620,
+	MAX20024,
+};
+
+struct max77620_chip {
+	struct device *dev;
+	struct regmap *rmap;
+
+	int chip_irq;
+	int irq_base;
+
+	/* chip id */
+	enum max77620_chip_id chip_id;
+
+	bool sleep_enable;
+	bool enable_global_lpm;
+	int shutdown_fps_period[MAX77620_FPS_COUNT];
+	int suspend_fps_period[MAX77620_FPS_COUNT];
+
+	struct regmap_irq_chip_data *top_irq_data;
+	struct regmap_irq_chip_data *gpio_irq_data;
+};
+
+#endif /* _MFD_MAX77620_H_ */
-- 
cgit v1.2.3


From 884be175351e73c515303118150f195dd611787c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 28 Apr 2016 23:56:31 -0400
Subject: nfs: per-name sillyunlink exclusion

use d_alloc_parallel() for sillyunlink/lookup exclusion and
explicit rwsem (nfs_rmdir() being a writer and nfs_call_unlink() -
a reader) for rmdir/sillyunlink one.

That ought to make lookup/readdir/!O_CREAT atomic_open really
parallel on NFS.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c            |   9 +--
 fs/nfs/inode.c          |   4 +-
 fs/nfs/nfs4proc.c       |   2 +-
 fs/nfs/nfstrace.h       |   2 +-
 fs/nfs/unlink.c         | 193 +++++++++++++-----------------------------------
 include/linux/nfs_fs.h  |  11 +--
 include/linux/nfs_xdr.h |   4 +-
 7 files changed, 61 insertions(+), 164 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cd3c754e3f89..aaf7bd0cbae2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -909,7 +909,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
-	nfs_block_sillyrename(dentry);
 	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (res < 0)
@@ -945,7 +944,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 	} while (!desc->eof);
 out:
-	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
 		res = 0;
 	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
@@ -1398,7 +1396,6 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	trace_nfs_lookup_enter(dir, dentry, flags);
-	nfs_block_sillyrename(parent);
 	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
 	if (error == -ENOENT)
 		goto no_entry;
@@ -1423,7 +1420,6 @@ no_entry:
 	}
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unblock_sillyrename:
-	nfs_unblock_sillyrename(parent);
 	trace_nfs_lookup_exit(dir, dentry, flags, error);
 	nfs4_label_free(label);
 out:
@@ -1535,9 +1531,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	trace_nfs_atomic_open_enter(dir, ctx, open_flags);
-	nfs_block_sillyrename(dentry->d_parent);
 	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
-	nfs_unblock_sillyrename(dentry->d_parent);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
@@ -1781,7 +1775,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	trace_nfs_rmdir_enter(dir, dentry);
 	if (d_really_is_positive(dentry)) {
-		nfs_wait_on_sillyrename(dentry);
+		down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
 		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 		/* Ensure the VFS deletes this inode */
 		switch (error) {
@@ -1791,6 +1785,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 		case -ENOENT:
 			nfs_dentry_handle_enoent(dentry);
 		}
+		up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
 	} else
 		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 	trace_nfs_rmdir_exit(dir, dentry, error);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 738c84a42eb0..52e7d6869e3b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1958,9 +1958,7 @@ static void init_once(void *foo)
 	nfsi->nrequests = 0;
 	nfsi->commit_info.ncommit = 0;
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
-	atomic_set(&nfsi->silly_count, 1);
-	INIT_HLIST_HEAD(&nfsi->silly_list);
-	init_waitqueue_head(&nfsi->waitqueue);
+	init_rwsem(&nfsi->rmdir_sem);
 	nfs4_init_once(nfsi);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7a7ac1dafa02..084e8570da18 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3777,7 +3777,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
 {
-	nfs4_setup_sequence(NFS_SERVER(data->dir),
+	nfs4_setup_sequence(NFS_SB(data->dentry->d_sb),
 			&data->args.seq_args,
 			&data->res.seq_res,
 			task);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 9f80a086b612..0b9e5cc9a747 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -702,7 +702,7 @@ TRACE_EVENT(nfs_sillyrename_unlink,
 		),
 
 		TP_fast_assign(
-			struct inode *dir = data->dir;
+			struct inode *dir = d_inode(data->dentry->d_parent);
 			size_t len = data->args.name.len;
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7d6deaccf89b..1868246f56e6 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -30,45 +30,11 @@
 static void
 nfs_free_unlinkdata(struct nfs_unlinkdata *data)
 {
-	iput(data->dir);
 	put_rpccred(data->cred);
 	kfree(data->args.name.name);
 	kfree(data);
 }
 
-#define NAME_ALLOC_LEN(len)	((len+16) & ~15)
-/**
- * nfs_copy_dname - copy dentry name to data structure
- * @dentry: pointer to dentry
- * @data: nfs_unlinkdata
- */
-static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
-{
-	char		*str;
-	int		len = dentry->d_name.len;
-
-	str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL);
-	if (!str)
-		return -ENOMEM;
-	data->args.name.len = len;
-	data->args.name.name = str;
-	return 0;
-}
-
-static void nfs_free_dname(struct nfs_unlinkdata *data)
-{
-	kfree(data->args.name.name);
-	data->args.name.name = NULL;
-	data->args.name.len = 0;
-}
-
-static void nfs_dec_sillycount(struct inode *dir)
-{
-	struct nfs_inode *nfsi = NFS_I(dir);
-	if (atomic_dec_return(&nfsi->silly_count) == 1)
-		wake_up(&nfsi->waitqueue);
-}
-
 /**
  * nfs_async_unlink_done - Sillydelete post-processing
  * @task: rpc_task of the sillydelete
@@ -78,7 +44,7 @@ static void nfs_dec_sillycount(struct inode *dir)
 static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_unlinkdata *data = calldata;
-	struct inode *dir = data->dir;
+	struct inode *dir = d_inode(data->dentry->d_parent);
 
 	trace_nfs_sillyrename_unlink(data, task->tk_status);
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
@@ -95,17 +61,21 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 static void nfs_async_unlink_release(void *calldata)
 {
 	struct nfs_unlinkdata	*data = calldata;
-	struct super_block *sb = data->dir->i_sb;
+	struct dentry *dentry = data->dentry;
+	struct super_block *sb = dentry->d_sb;
 
-	nfs_dec_sillycount(data->dir);
+	up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+	d_lookup_done(dentry);
 	nfs_free_unlinkdata(data);
+	dput(dentry);
 	nfs_sb_deactive(sb);
 }
 
 static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_unlinkdata *data = calldata;
-	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
+	struct inode *dir = d_inode(data->dentry->d_parent);
+	NFS_PROTO(dir)->unlink_rpc_prepare(task, data);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
@@ -114,7 +84,7 @@ static const struct rpc_call_ops nfs_unlink_ops = {
 	.rpc_call_prepare = nfs_unlink_prepare,
 };
 
-static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
+static void nfs_do_call_unlink(struct nfs_unlinkdata *data)
 {
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
@@ -129,10 +99,31 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		.flags = RPC_TASK_ASYNC,
 	};
 	struct rpc_task *task;
+	struct inode *dir = d_inode(data->dentry->d_parent);
+	nfs_sb_active(dir->i_sb);
+	data->args.fh = NFS_FH(dir);
+	nfs_fattr_init(data->res.dir_attr);
+
+	NFS_PROTO(dir)->unlink_setup(&msg, dir);
+
+	task_setup_data.rpc_client = NFS_CLIENT(dir);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task_async(task);
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
 	struct dentry *alias;
 
-	alias = d_lookup(parent, &data->args.name);
-	if (alias != NULL) {
+	down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+	alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+	if (IS_ERR(alias)) {
+		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+		return 0;
+	}
+	if (!d_in_lookup(alias)) {
 		int ret;
 		void *devname_garbage = NULL;
 
@@ -140,10 +131,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		 * Hey, we raced with lookup... See if we need to transfer
 		 * the sillyrename information to the aliased dentry.
 		 */
-		nfs_free_dname(data);
-		ret = nfs_copy_dname(alias, data);
 		spin_lock(&alias->d_lock);
-		if (ret == 0 && d_really_is_positive(alias) &&
+		if (d_really_is_positive(alias) &&
 		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
 			devname_garbage = alias->d_fsdata;
 			alias->d_fsdata = data;
@@ -152,8 +141,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		} else
 			ret = 0;
 		spin_unlock(&alias->d_lock);
-		nfs_dec_sillycount(dir);
 		dput(alias);
+		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
 		/*
 		 * If we'd displaced old cached devname, free it.  At that
 		 * point dentry is definitely not a root, so we won't need
@@ -162,95 +151,18 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		kfree(devname_garbage);
 		return ret;
 	}
-	data->dir = igrab(dir);
-	if (!data->dir) {
-		nfs_dec_sillycount(dir);
-		return 0;
-	}
-	nfs_sb_active(dir->i_sb);
-	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(data->res.dir_attr);
-
-	NFS_PROTO(dir)->unlink_setup(&msg, dir);
-
-	task_setup_data.rpc_client = NFS_CLIENT(dir);
-	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task_async(task);
+	data->dentry = alias;
+	nfs_do_call_unlink(data);
 	return 1;
 }
 
-static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
-{
-	struct dentry *parent;
-	struct inode *dir;
-	int ret = 0;
-
-
-	parent = dget_parent(dentry);
-	if (parent == NULL)
-		goto out_free;
-	dir = d_inode(parent);
-	/* Non-exclusive lock protects against concurrent lookup() calls */
-	spin_lock(&dir->i_lock);
-	if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
-		/* Deferred delete */
-		hlist_add_head(&data->list, &NFS_I(dir)->silly_list);
-		spin_unlock(&dir->i_lock);
-		ret = 1;
-		goto out_dput;
-	}
-	spin_unlock(&dir->i_lock);
-	ret = nfs_do_call_unlink(parent, dir, data);
-out_dput:
-	dput(parent);
-out_free:
-	return ret;
-}
-
-void nfs_wait_on_sillyrename(struct dentry *dentry)
-{
-	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
-	wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
-}
-
-void nfs_block_sillyrename(struct dentry *dentry)
-{
-	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
-	wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1);
-}
-
-void nfs_unblock_sillyrename(struct dentry *dentry)
-{
-	struct inode *dir = d_inode(dentry);
-	struct nfs_inode *nfsi = NFS_I(dir);
-	struct nfs_unlinkdata *data;
-
-	atomic_inc(&nfsi->silly_count);
-	wake_up(&nfsi->waitqueue);
-	spin_lock(&dir->i_lock);
-	while (!hlist_empty(&nfsi->silly_list)) {
-		if (!atomic_inc_not_zero(&nfsi->silly_count))
-			break;
-		data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list);
-		hlist_del(&data->list);
-		spin_unlock(&dir->i_lock);
-		if (nfs_do_call_unlink(dentry, dir, data) == 0)
-			nfs_free_unlinkdata(data);
-		spin_lock(&dir->i_lock);
-	}
-	spin_unlock(&dir->i_lock);
-}
-
 /**
  * nfs_async_unlink - asynchronous unlinking of a file
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
 static int
-nfs_async_unlink(struct inode *dir, struct dentry *dentry)
+nfs_async_unlink(struct dentry *dentry, struct qstr *name)
 {
 	struct nfs_unlinkdata *data;
 	int status = -ENOMEM;
@@ -259,13 +171,18 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		goto out;
+	data->args.name.name = kstrdup(name->name, GFP_KERNEL);
+	if (!data->args.name.name)
+		goto out_free;
+	data->args.name.len = name->len;
 
 	data->cred = rpc_lookup_cred();
 	if (IS_ERR(data->cred)) {
 		status = PTR_ERR(data->cred);
-		goto out_free;
+		goto out_free_name;
 	}
 	data->res.dir_attr = &data->dir_attr;
+	init_waitqueue_head(&data->wq);
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
@@ -285,6 +202,8 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 out_unlock:
 	spin_unlock(&dentry->d_lock);
 	put_rpccred(data->cred);
+out_free_name:
+	kfree(data->args.name.name);
 out_free:
 	kfree(data);
 out:
@@ -303,17 +222,15 @@ out:
 void
 nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 {
-	struct nfs_unlinkdata	*data = NULL;
+	struct nfs_unlinkdata	*data;
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
-		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
-		data = dentry->d_fsdata;
-		dentry->d_fsdata = NULL;
-	}
+	dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+	data = dentry->d_fsdata;
+	dentry->d_fsdata = NULL;
 	spin_unlock(&dentry->d_lock);
 
-	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+	if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))
 		nfs_free_unlinkdata(data);
 }
 
@@ -560,18 +477,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	/* queue unlink first. Can't do this from rpc_release as it
 	 * has to allocate memory
 	 */
-	error = nfs_async_unlink(dir, dentry);
+	error = nfs_async_unlink(dentry, &sdentry->d_name);
 	if (error)
 		goto out_dput;
 
-	/* populate unlinkdata with the right dname */
-	error = nfs_copy_dname(sdentry,
-				(struct nfs_unlinkdata *)dentry->d_fsdata);
-	if (error) {
-		nfs_cancel_async_unlink(dentry);
-		goto out_dput;
-	}
-
 	/* run the rename task, undo unlink if it fails */
 	task = nfs_async_rename(dir, dir, dentry, sdentry,
 					nfs_complete_sillyrename);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 67300f8e5f2f..fa167f25465d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -163,11 +163,9 @@ struct nfs_inode {
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
 
-	/* Number of in-flight sillydelete RPC calls */
-	atomic_t		silly_count;
-	/* List of deferred sillydelete requests */
-	struct hlist_head	silly_list;
-	wait_queue_head_t	waitqueue;
+	/* Readers: in-flight sillydelete RPC calls */
+	/* Writers: rmdir */
+	struct rw_semaphore	rmdir_sem;
 
 #if IS_ENABLED(CONFIG_NFS_V4)
 	struct nfs4_cached_acl	*nfs4_acl;
@@ -492,9 +490,6 @@ extern void nfs_release_automount_timer(void);
  * linux/fs/nfs/unlink.c
  */
 extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
-extern void nfs_wait_on_sillyrename(struct dentry *dentry);
-extern void nfs_block_sillyrename(struct dentry *dentry);
-extern void nfs_unblock_sillyrename(struct dentry *dentry);
 
 /*
  * linux/fs/nfs/write.c
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d320906cf13e..ee8491dadbf3 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1468,10 +1468,10 @@ struct nfs_pgio_completion_ops {
 };
 
 struct nfs_unlinkdata {
-	struct hlist_node list;
 	struct nfs_removeargs args;
 	struct nfs_removeres res;
-	struct inode *dir;
+	struct dentry *dentry;
+	wait_queue_head_t wq;
 	struct rpc_cred	*cred;
 	struct nfs_fattr dir_attr;
 	long timeout;
-- 
cgit v1.2.3


From 85ad1d13ee9b3db00615ea24b031c15e5ba14fd1 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Tue, 3 May 2016 22:22:13 -0400
Subject: md: set MD_CHANGE_PENDING in a atomic region

Some code waits for a metadata update by:

1. flagging that it is needed (MD_CHANGE_DEVS or MD_CHANGE_CLEAN)
2. setting MD_CHANGE_PENDING and waking the management thread
3. waiting for MD_CHANGE_PENDING to be cleared

If the first two are done without locking, the code in md_update_sb()
which checks if it needs to repeat might test if an update is needed
before step 1, then clear MD_CHANGE_PENDING after step 2, resulting
in the wait returning early.

So make sure all places that set MD_CHANGE_PENDING are atomicial, and
bit_clear_unless (suggested by Neil) is introduced for the purpose.

Cc: Martin Kepplinger <martink@posteo.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: <linux-kernel@vger.kernel.org>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c          | 27 ++++++++++++++-------------
 drivers/md/raid1.c       |  4 ++--
 drivers/md/raid10.c      |  8 ++++----
 drivers/md/raid5-cache.c |  4 ++--
 drivers/md/raid5.c       |  4 ++--
 include/linux/bitops.h   | 16 ++++++++++++++++
 6 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 23c6d732a374..a79462dcd5e1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2295,12 +2295,16 @@ repeat:
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 			force_change = 1;
+		if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+			nospares = 1;
 		ret = md_cluster_ops->metadata_update_start(mddev);
 		/* Has someone else has updated the sb */
 		if (!does_sb_need_changing(mddev)) {
 			if (ret == 0)
 				md_cluster_ops->metadata_update_cancel(mddev);
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+							 BIT(MD_CHANGE_DEVS) |
+							 BIT(MD_CHANGE_CLEAN));
 			return;
 		}
 	}
@@ -2434,15 +2438,11 @@ repeat:
 	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->metadata_update_finish(mddev);
 
-	spin_lock(&mddev->lock);
 	if (mddev->in_sync != sync_req ||
-	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+	    !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+			       BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN)))
 		/* have to write it out again */
-		spin_unlock(&mddev->lock);
 		goto repeat;
-	}
-	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-	spin_unlock(&mddev->lock);
 	wake_up(&mddev->sb_wait);
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -8147,18 +8147,18 @@ void md_do_sync(struct md_thread *thread)
 		}
 	}
  skip:
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-
 	if (mddev_is_clustered(mddev) &&
 	    ret == 0) {
 		/* set CHANGE_PENDING here since maybe another
 		 * update is needed, so other nodes are informed */
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 		md_cluster_ops->resync_finish(mddev);
-	}
+	} else
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8550,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape);
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
+	struct mddev *mddev = rdev->mddev;
 	int rv;
 	if (is_new)
 		s += rdev->new_data_offset;
@@ -8559,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 	if (rv == 0) {
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
-		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
 		return 1;
 	} else
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a7f2b9c9f8a0..c7c8cde0ab21 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 	 * if recovery is running, make sure it aborts.
 	 */
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid1:%s: Operation continuing on %d devices.\n",
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84e24e648165..c7de2a53e625 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		bio->bi_iter.bi_sector < conf->reshape_progress))) {
 		/* Need to update reshape_position in metadata */
 		mddev->reshape_position = conf->reshape_progress;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9531f5f05b93..ac51bc5ecb16 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	 * in_teardown check workaround this issue.
 	 */
 	if (!log->in_teardown) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4d31b235a888..8959e6dd31dd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index defeaac0745f..299e76b59fe9 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word)
 })
 #endif
 
+#ifndef bit_clear_unless
+#define bit_clear_unless(ptr, _clear, _test)	\
+({								\
+	const typeof(*ptr) clear = (_clear), test = (_test);	\
+	typeof(*ptr) old, new;					\
+								\
+	do {							\
+		old = ACCESS_ONCE(*ptr);			\
+		new = old & ~clear;				\
+	} while (!(old & test) &&				\
+		 cmpxchg(ptr, old, new) != old);		\
+								\
+	!(old & test);						\
+})
+#endif
+
 #ifndef find_last_bit
 /**
  * find_last_bit - find the last set bit in a memory region
-- 
cgit v1.2.3


From 661ce1f0c4a69f92ad781d8d2c205c90dd9c5833 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:45 +0200
Subject: libata/libsas: Define ATA_CMD_NCQ_NON_DATA

Define the NCQ NON DATA command and update libsas to handle it
correctly.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 1 +
 drivers/scsi/isci/request.c            | 3 ++-
 drivers/scsi/libsas/sas_ata.c          | 3 ++-
 drivers/scsi/mvsas/mv_sas.c            | 3 ++-
 drivers/scsi/pm8001/pm8001_sas.c       | 3 ++-
 include/linux/ata.h                    | 1 +
 include/trace/events/libata.h          | 1 +
 7 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
index 860c9f847371..bd20c5488768 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
@@ -1632,6 +1632,7 @@ static u8 get_ata_protocol(u8 cmd, int direction)
 	case ATA_CMD_FPDMA_READ:
 	case ATA_CMD_FPDMA_RECV:
 	case ATA_CMD_FPDMA_SEND:
+	case ATA_CMD_NCQ_NON_DATA:
 	return SATA_PROTOCOL_FPDMA;
 
 	case ATA_CMD_ID_ATA:
diff --git a/drivers/scsi/isci/request.c b/drivers/scsi/isci/request.c
index 29456e097a30..b709d2b20880 100644
--- a/drivers/scsi/isci/request.c
+++ b/drivers/scsi/isci/request.c
@@ -3171,7 +3171,8 @@ static enum sci_status isci_request_stp_request_construct(struct isci_request *i
 	if (qc && (qc->tf.command == ATA_CMD_FPDMA_WRITE ||
 		   qc->tf.command == ATA_CMD_FPDMA_READ ||
 		   qc->tf.command == ATA_CMD_FPDMA_RECV ||
-		   qc->tf.command == ATA_CMD_FPDMA_SEND)) {
+		   qc->tf.command == ATA_CMD_FPDMA_SEND ||
+		   qc->tf.command == ATA_CMD_NCQ_NON_DATA)) {
 		fis->sector_count = qc->tag << 3;
 		ireq->tc->type.stp.ncq_tag = qc->tag;
 	}
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index fe1cd2691748..935c43095109 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -207,7 +207,8 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
 	if (qc->tf.command == ATA_CMD_FPDMA_WRITE ||
 	    qc->tf.command == ATA_CMD_FPDMA_READ ||
 	    qc->tf.command == ATA_CMD_FPDMA_RECV ||
-	    qc->tf.command == ATA_CMD_FPDMA_SEND) {
+	    qc->tf.command == ATA_CMD_FPDMA_SEND ||
+	    qc->tf.command == ATA_CMD_NCQ_NON_DATA) {
 		/* Need to zero out the tag libata assigned us */
 		qc->tf.nsect = 0;
 	}
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c
index db3714964c0a..5b9fcff6cd94 100644
--- a/drivers/scsi/mvsas/mv_sas.c
+++ b/drivers/scsi/mvsas/mv_sas.c
@@ -431,7 +431,8 @@ static u32 mvs_get_ncq_tag(struct sas_task *task, u32 *tag)
 		if (qc->tf.command == ATA_CMD_FPDMA_WRITE ||
 		    qc->tf.command == ATA_CMD_FPDMA_READ ||
 		    qc->tf.command == ATA_CMD_FPDMA_RECV ||
-		    qc->tf.command == ATA_CMD_FPDMA_SEND) {
+		    qc->tf.command == ATA_CMD_FPDMA_SEND ||
+		    qc->tf.command == ATA_CMD_NCQ_NON_DATA) {
 			*tag = qc->tag;
 			return 1;
 		}
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 62abd9896625..dc33dfa8f994 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -282,7 +282,8 @@ u32 pm8001_get_ncq_tag(struct sas_task *task, u32 *tag)
 		if (qc->tf.command == ATA_CMD_FPDMA_WRITE ||
 		    qc->tf.command == ATA_CMD_FPDMA_READ ||
 		    qc->tf.command == ATA_CMD_FPDMA_RECV ||
-		    qc->tf.command == ATA_CMD_FPDMA_SEND) {
+		    qc->tf.command == ATA_CMD_FPDMA_SEND ||
+		    qc->tf.command == ATA_CMD_NCQ_NON_DATA) {
 			*tag = qc->tag;
 			return 1;
 		}
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 00aebc4c83ad..b84210a28a00 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -243,6 +243,7 @@ enum {
 	ATA_CMD_WRITE_QUEUED_FUA_EXT = 0x3E,
 	ATA_CMD_FPDMA_READ	= 0x60,
 	ATA_CMD_FPDMA_WRITE	= 0x61,
+	ATA_CMD_NCQ_NON_DATA	= 0x63,
 	ATA_CMD_FPDMA_SEND	= 0x64,
 	ATA_CMD_FPDMA_RECV	= 0x65,
 	ATA_CMD_PIO_READ	= 0x20,
diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h
index 8b0fbd93082c..016860320f6f 100644
--- a/include/trace/events/libata.h
+++ b/include/trace/events/libata.h
@@ -39,6 +39,7 @@
 		 ata_opcode_name(ATA_CMD_WRITE_QUEUED_FUA_EXT), \
 		 ata_opcode_name(ATA_CMD_FPDMA_READ),		\
 		 ata_opcode_name(ATA_CMD_FPDMA_WRITE),		\
+		 ata_opcode_name(ATA_CMD_NCQ_NON_DATA),		\
 		 ata_opcode_name(ATA_CMD_FPDMA_SEND),		\
 		 ata_opcode_name(ATA_CMD_FPDMA_RECV),		\
 		 ata_opcode_name(ATA_CMD_PIO_READ),		\
-- 
cgit v1.2.3


From 5c65d8bb3503beb12864895426a69269c19e6e87 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:47 +0200
Subject: libata: Add command definitions for NCQ Encapsulation for READ LOG
 DMA EXT

ACS-4 defines an NCQ encapsulation for READ LOG DMA EXT.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ata.h    | 5 +++++
 include/linux/libata.h | 7 +++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index b84210a28a00..94ccde5ee83c 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -306,6 +306,9 @@ enum {
 	/* marked obsolete in the ATA/ATAPI-7 spec */
 	ATA_CMD_RESTORE		= 0x10,
 
+	/* Subcmds for ATA_CMD_FPDMA_RECV */
+	ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT = 0x01,
+
 	/* Subcmds for ATA_CMD_FPDMA_SEND */
 	ATA_SUBCMD_FPDMA_SEND_DSM            = 0x00,
 	ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02,
@@ -329,7 +332,9 @@ enum {
 	ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET	= 0x04,
 	ATA_LOG_NCQ_SEND_RECV_DSM_TRIM		= (1 << 0),
 	ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET	= 0x08,
+	ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED  = (1 << 0),
 	ATA_LOG_NCQ_SEND_RECV_WR_LOG_OFFSET	= 0x0C,
+	ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED  = (1 << 0),
 	ATA_LOG_NCQ_SEND_RECV_SIZE		= 0x10,
 
 	/* READ/WRITE LONG (obsolete) */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a418bca0df0d..09ddb5a6f555 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1642,6 +1642,13 @@ static inline bool ata_fpdma_dsm_supported(struct ata_device *dev)
 		 ATA_LOG_NCQ_SEND_RECV_DSM_TRIM);
 }
 
+static inline bool ata_fpdma_read_log_supported(struct ata_device *dev)
+{
+	return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) &&
+		(dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET] &
+		 ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED);
+}
+
 static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
 {
 	qc->tf.ctl |= ATA_NIEN;
-- 
cgit v1.2.3


From fe5af0cc3029d52e31d282f5d53787d308e9695a Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:48 +0200
Subject: libata: Check log page directory before accessing pages

When reading the NCQ Send/Recv log it might actually not
supported, thereby causing irritating messages
'READ LOG DMA EXT failed'.
Instead we should be reading the log directory first to
figure out if the log is actually supported before trying
to access it.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 17 +++++++++++++++++
 include/linux/ata.h       |  1 +
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index fa74b57ee52e..b2bd7c499653 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -66,6 +66,7 @@
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 #include <linux/cdrom.h>
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
@@ -2083,7 +2084,23 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev)
 {
 	struct ata_port *ap = dev->link->ap;
 	unsigned int err_mask;
+	int log_index = ATA_LOG_NCQ_SEND_RECV * 2;
+	u16 log_pages;
 
+	err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY,
+				     0, ap->sector_buf, 1);
+	if (err_mask) {
+		ata_dev_dbg(dev,
+			    "failed to get Log Directory Emask 0x%x\n",
+			    err_mask);
+		return;
+	}
+	log_pages = get_unaligned_le16(&ap->sector_buf[log_index]);
+	if (!log_pages) {
+		ata_dev_warn(dev,
+			     "NCQ Send/Recv Log not supported\n");
+		return;
+	}
 	err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV,
 				     0, ap->sector_buf, 1);
 	if (err_mask) {
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 94ccde5ee83c..b5be5e85d2d3 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -314,6 +314,7 @@ enum {
 	ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02,
 
 	/* READ_LOG_EXT pages */
+	ATA_LOG_DIRECTORY	= 0x0,
 	ATA_LOG_SATA_NCQ	= 0x10,
 	ATA_LOG_NCQ_SEND_RECV	  = 0x13,
 	ATA_LOG_SATA_ID_DEV_DATA  = 0x30,
-- 
cgit v1.2.3


From a57038496422d7d21b7e41ed70d63bf0c6ff6068 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:49 +0200
Subject: libata-trace: decode subcommands

Some commands like FPDMA RECEIVE or NCQ NON DATA can encapsulate
other commands to NCQ transport. So decode the subcmds, too.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-trace.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/ata.h           | 17 +++++++++++++++++
 include/trace/events/libata.h |  7 ++++++-
 3 files changed, 66 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c
index fd30b8c10cf5..99ec1e8cb95d 100644
--- a/drivers/ata/libata-trace.c
+++ b/drivers/ata/libata-trace.c
@@ -149,3 +149,46 @@ libata_trace_parse_qc_flags(struct trace_seq *p, unsigned int qc_flags)
 
 	return ret;
 }
+
+const char *
+libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd,
+			  unsigned char feature, unsigned char hob_nsect)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	switch (cmd) {
+	case ATA_CMD_FPDMA_RECV:
+		switch (hob_nsect & 0x5f) {
+		case ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT:
+			trace_seq_printf(p, " READ_LOG_DMA_EXT");
+			break;
+		}
+		break;
+	case ATA_CMD_FPDMA_SEND:
+		switch (hob_nsect & 0x5f) {
+		case ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT:
+			trace_seq_printf(p, " WRITE_LOG_DMA_EXT");
+			break;
+		case ATA_SUBCMD_FPDMA_SEND_DSM:
+			trace_seq_printf(p, " DATASET_MANAGEMENT");
+			break;
+		}
+		break;
+	case ATA_CMD_NCQ_NON_DATA:
+		switch (feature) {
+		case ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE:
+			trace_seq_printf(p, " ABORT_QUEUE");
+			break;
+		case ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES:
+			trace_seq_printf(p, " SET_FEATURES");
+			break;
+		case ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT:
+			trace_seq_printf(p, " ZERO_EXT");
+			break;
+		}
+		break;
+	}
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
diff --git a/include/linux/ata.h b/include/linux/ata.h
index b5be5e85d2d3..032bb223cd8c 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -313,6 +313,11 @@ enum {
 	ATA_SUBCMD_FPDMA_SEND_DSM            = 0x00,
 	ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02,
 
+	/* Subcmds for ATA_CMD_NCQ_NON_DATA */
+	ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE  = 0x00,
+	ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05,
+	ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT     = 0x06,
+
 	/* READ_LOG_EXT pages */
 	ATA_LOG_DIRECTORY	= 0x0,
 	ATA_LOG_SATA_NCQ	= 0x10,
@@ -338,6 +343,18 @@ enum {
 	ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED  = (1 << 0),
 	ATA_LOG_NCQ_SEND_RECV_SIZE		= 0x10,
 
+	/* NCQ Non-Data log */
+	ATA_LOG_NCQ_NON_DATA_SUBCMDS_OFFSET	= 0x00,
+	ATA_LOG_NCQ_NON_DATA_ABORT_OFFSET	= 0x00,
+	ATA_LOG_NCQ_NON_DATA_ABORT_NCQ		= (1 << 0),
+	ATA_LOG_NCQ_NON_DATA_ABORT_ALL		= (1 << 1),
+	ATA_LOG_NCQ_NON_DATA_ABORT_STREAMING	= (1 << 2),
+	ATA_LOG_NCQ_NON_DATA_ABORT_NON_STREAMING = (1 << 3),
+	ATA_LOG_NCQ_NON_DATA_ABORT_SELECTED	= (1 << 4),
+	ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET	= 0x1C,
+	ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT	= (1 << 0),
+	ATA_LOG_NCQ_NON_DATA_SIZE		= 0x40,
+
 	/* READ/WRITE LONG (obsolete) */
 	ATA_CMD_READ_LONG	= 0x22,
 	ATA_CMD_READ_LONG_ONCE	= 0x23,
diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h
index 016860320f6f..8e77572350f0 100644
--- a/include/trace/events/libata.h
+++ b/include/trace/events/libata.h
@@ -140,6 +140,10 @@ const char *libata_trace_parse_eh_err_mask(struct trace_seq *, unsigned int);
 const char *libata_trace_parse_qc_flags(struct trace_seq *, unsigned int);
 #define __parse_qc_flags(f) libata_trace_parse_qc_flags(p, f)
 
+const char *libata_trace_parse_subcmd(struct trace_seq *, unsigned char,
+				      unsigned char, unsigned char);
+#define __parse_subcmd(c,f,h) libata_trace_parse_subcmd(p, c, f, h)
+
 TRACE_EVENT(ata_qc_issue,
 
 	TP_PROTO(struct ata_queued_cmd *qc),
@@ -186,11 +190,12 @@ TRACE_EVENT(ata_qc_issue,
 		__entry->hob_nsect	= qc->tf.hob_nsect;
 	),
 
-	TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s " \
+	TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s%s " \
 		  " tf=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)",
 		  __entry->ata_port, __entry->ata_dev, __entry->tag,
 		  show_protocol_name(__entry->proto),
 		  show_opcode_name(__entry->cmd),
+		  __parse_subcmd(__entry->cmd, __entry->feature, __entry->hob_nsect),
 		  __entry->cmd, __entry->feature, __entry->nsect,
 		  __entry->lbal, __entry->lbam, __entry->lbah,
 		  __entry->hob_feature, __entry->hob_nsect,
-- 
cgit v1.2.3


From 3a92945b24c7ff46757a3d5d5112bfc62d2e45b2 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:51 +0200
Subject: libata: fixup ZAC device disabling

libata device disabling is ... curious. So add the correct
definitions that we can disable ZAC devices properly.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/libata.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 09ddb5a6f555..92297cd111f6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -192,7 +192,8 @@ enum {
 	ATA_DEV_SEMB		= 7,	/* SEMB */
 	ATA_DEV_SEMB_UNSUP	= 8,	/* SEMB (unsupported) */
 	ATA_DEV_ZAC		= 9,	/* ZAC device */
-	ATA_DEV_NONE		= 10,	/* no device */
+	ATA_DEV_ZAC_UNSUP	= 10,	/* ZAC device (unsupported) */
+	ATA_DEV_NONE		= 11,	/* no device */
 
 	/* struct ata_link flags */
 	ATA_LFLAG_NO_HRST	= (1 << 1), /* avoid hardreset */
@@ -1524,7 +1525,8 @@ static inline unsigned int ata_class_enabled(unsigned int class)
 static inline unsigned int ata_class_disabled(unsigned int class)
 {
 	return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP ||
-		class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP;
+		class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP ||
+		class == ATA_DEV_ZAC_UNSUP;
 }
 
 static inline unsigned int ata_class_absent(unsigned int class)
-- 
cgit v1.2.3


From 28a3fc2295a744a0d2ddf86b2ccdf03fbab123f9 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:52 +0200
Subject: libata: implement ZBC IN translation

ZAC drives implement a 'ZAC Management In' command template,
which maps onto the ZBC IN command.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-eh.c       |   1 +
 drivers/ata/libata-scsi.c     | 157 ++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/libata-trace.c    |  10 +++
 include/linux/ata.h           |  10 ++-
 include/linux/libata.h        |   7 ++
 include/trace/events/libata.h |   1 +
 6 files changed, 185 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index e81661981a09..ee6c572d2a4a 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2482,6 +2482,7 @@ const char *ata_get_cmd_descript(u8 command)
 		{ ATA_CMD_CFA_WRITE_MULT_NE,	"CFA WRITE MULTIPLE WITHOUT ERASE" },
 		{ ATA_CMD_REQ_SENSE_DATA,	"REQUEST SENSE DATA EXT" },
 		{ ATA_CMD_SANITIZE_DEVICE,	"SANITIZE DEVICE" },
+		{ ATA_CMD_ZAC_MGMT_IN,		"ZAC MANAGEMENT IN" },
 		{ ATA_CMD_READ_LONG,		"READ LONG (with retries)" },
 		{ ATA_CMD_READ_LONG_ONCE,	"READ LONG (without retries)" },
 		{ ATA_CMD_WRITE_LONG,		"WRITE LONG (with retries)" },
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 6d78b4b422a4..06d5a62f507d 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3317,6 +3317,160 @@ invalid_opcode:
 	return 1;
 }
 
+/**
+ *	ata_scsi_report_zones_complete - convert ATA output
+ *	@qc: command structure returning the data
+ *
+ *	Convert T-13 little-endian field representation into
+ *	T-10 big-endian field representation.
+ *	What a mess.
+ */
+static void ata_scsi_report_zones_complete(struct ata_queued_cmd *qc)
+{
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	struct sg_mapping_iter miter;
+	unsigned long flags;
+	unsigned int bytes = 0;
+
+	sg_miter_start(&miter, scsi_sglist(scmd), scsi_sg_count(scmd),
+		       SG_MITER_TO_SG | SG_MITER_ATOMIC);
+
+	local_irq_save(flags);
+	while (sg_miter_next(&miter)) {
+		unsigned int offset = 0;
+
+		if (bytes == 0) {
+			char *hdr;
+			u32 list_length;
+			u64 max_lba, opt_lba;
+			u16 same;
+
+			/* Swizzle header */
+			hdr = miter.addr;
+			list_length = get_unaligned_le32(&hdr[0]);
+			same = get_unaligned_le16(&hdr[4]);
+			max_lba = get_unaligned_le64(&hdr[8]);
+			opt_lba = get_unaligned_le64(&hdr[16]);
+			put_unaligned_be32(list_length, &hdr[0]);
+			hdr[4] = same & 0xf;
+			put_unaligned_be64(max_lba, &hdr[8]);
+			put_unaligned_be64(opt_lba, &hdr[16]);
+			offset += 64;
+			bytes += 64;
+		}
+		while (offset < miter.length) {
+			char *rec;
+			u8 cond, type, non_seq, reset;
+			u64 size, start, wp;
+
+			/* Swizzle zone descriptor */
+			rec = miter.addr + offset;
+			type = rec[0] & 0xf;
+			cond = (rec[1] >> 4) & 0xf;
+			non_seq = (rec[1] & 2);
+			reset = (rec[1] & 1);
+			size = get_unaligned_le64(&rec[8]);
+			start = get_unaligned_le64(&rec[16]);
+			wp = get_unaligned_le64(&rec[24]);
+			rec[0] = type;
+			rec[1] = (cond << 4) | non_seq | reset;
+			put_unaligned_be64(size, &rec[8]);
+			put_unaligned_be64(start, &rec[16]);
+			put_unaligned_be64(wp, &rec[24]);
+			WARN_ON(offset + 64 > miter.length);
+			offset += 64;
+			bytes += 64;
+		}
+	}
+	sg_miter_stop(&miter);
+	local_irq_restore(flags);
+
+	ata_scsi_qc_complete(qc);
+}
+
+static unsigned int ata_scsi_zbc_in_xlat(struct ata_queued_cmd *qc)
+{
+	struct ata_taskfile *tf = &qc->tf;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	const u8 *cdb = scmd->cmnd;
+	u16 sect, fp = (u16)-1;
+	u8 sa, options, bp = 0xff;
+	u64 block;
+	u32 n_block;
+
+	if (unlikely(scmd->cmd_len < 16)) {
+		ata_dev_warn(qc->dev, "invalid cdb length %d\n",
+			     scmd->cmd_len);
+		fp = 15;
+		goto invalid_fld;
+	}
+	scsi_16_lba_len(cdb, &block, &n_block);
+	if (n_block != scsi_bufflen(scmd)) {
+		ata_dev_warn(qc->dev, "non-matching transfer count (%d/%d)\n",
+			     n_block, scsi_bufflen(scmd));
+		goto invalid_param_len;
+	}
+	sa = cdb[1] & 0x1f;
+	if (sa != ZI_REPORT_ZONES) {
+		ata_dev_warn(qc->dev, "invalid service action %d\n", sa);
+		fp = 1;
+		goto invalid_fld;
+	}
+	/*
+	 * ZAC allows only for transfers in 512 byte blocks,
+	 * and uses a 16 bit value for the transfer count.
+	 */
+	if ((n_block / 512) > 0xffff || n_block < 512 || (n_block % 512)) {
+		ata_dev_warn(qc->dev, "invalid transfer count %d\n", n_block);
+		goto invalid_param_len;
+	}
+	sect = n_block / 512;
+	options = cdb[14];
+
+	if (ata_ncq_enabled(qc->dev) &&
+	    ata_fpdma_zac_mgmt_in_supported(qc->dev)) {
+		tf->protocol = ATA_PROT_NCQ;
+		tf->command = ATA_CMD_FPDMA_RECV;
+		tf->hob_nsect = ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN & 0x1f;
+		tf->nsect = qc->tag << 3;
+		tf->feature = sect & 0xff;
+		tf->hob_feature = (sect >> 8) & 0xff;
+		tf->auxiliary = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES;
+	} else {
+		tf->command = ATA_CMD_ZAC_MGMT_IN;
+		tf->feature = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES;
+		tf->protocol = ATA_PROT_DMA;
+		tf->hob_feature = options;
+		tf->hob_nsect = (sect >> 8) & 0xff;
+		tf->nsect = sect & 0xff;
+	}
+	tf->device = ATA_LBA;
+	tf->lbah = (block >> 16) & 0xff;
+	tf->lbam = (block >> 8) & 0xff;
+	tf->lbal = block & 0xff;
+	tf->hob_lbah = (block >> 40) & 0xff;
+	tf->hob_lbam = (block >> 32) & 0xff;
+	tf->hob_lbal = (block >> 24) & 0xff;
+
+	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48;
+	qc->flags |= ATA_QCFLAG_RESULT_TF;
+
+	ata_qc_set_pc_nbytes(qc);
+
+	qc->complete_fn = ata_scsi_report_zones_complete;
+
+	return 0;
+
+invalid_fld:
+	ata_scsi_set_invalid_field(qc->dev, scmd, fp, bp);
+	return 1;
+
+invalid_param_len:
+	/* "Parameter list length error" */
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
+	return 1;
+}
+
 /**
  *	ata_mselect_caching - Simulate MODE SELECT for caching info page
  *	@qc: Storage for translated ATA taskfile
@@ -3632,6 +3786,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
 		return ata_scsi_mode_select_xlat;
 		break;
 
+	case ZBC_IN:
+		return ata_scsi_zbc_in_xlat;
+
 	case START_STOP:
 		return ata_scsi_start_stop_xlat;
 	}
diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c
index 99ec1e8cb95d..9caeabd69ccb 100644
--- a/drivers/ata/libata-trace.c
+++ b/drivers/ata/libata-trace.c
@@ -162,6 +162,9 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd,
 		case ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT:
 			trace_seq_printf(p, " READ_LOG_DMA_EXT");
 			break;
+		case ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN:
+			trace_seq_printf(p, " ZAC_MGMT_IN");
+			break;
 		}
 		break;
 	case ATA_CMD_FPDMA_SEND:
@@ -187,6 +190,13 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd,
 			break;
 		}
 		break;
+	case ATA_CMD_ZAC_MGMT_IN:
+		switch (feature) {
+		case ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES:
+			trace_seq_printf(p, " REPORT_ZONES");
+			break;
+		}
+		break;
 	}
 	trace_seq_putc(p, 0);
 
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 032bb223cd8c..255aa0f1c9bc 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -302,12 +302,14 @@ enum {
 	ATA_CMD_CFA_WRITE_MULT_NE = 0xCD,
 	ATA_CMD_REQ_SENSE_DATA  = 0x0B,
 	ATA_CMD_SANITIZE_DEVICE = 0xB4,
+	ATA_CMD_ZAC_MGMT_IN	= 0x4A,
 
 	/* marked obsolete in the ATA/ATAPI-7 spec */
 	ATA_CMD_RESTORE		= 0x10,
 
 	/* Subcmds for ATA_CMD_FPDMA_RECV */
 	ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT = 0x01,
+	ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN    = 0x02,
 
 	/* Subcmds for ATA_CMD_FPDMA_SEND */
 	ATA_SUBCMD_FPDMA_SEND_DSM            = 0x00,
@@ -318,6 +320,9 @@ enum {
 	ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05,
 	ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT     = 0x06,
 
+	/* Subcmds for ATA_CMD_ZAC_MGMT_IN */
+	ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00,
+
 	/* READ_LOG_EXT pages */
 	ATA_LOG_DIRECTORY	= 0x0,
 	ATA_LOG_SATA_NCQ	= 0x10,
@@ -341,7 +346,10 @@ enum {
 	ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED  = (1 << 0),
 	ATA_LOG_NCQ_SEND_RECV_WR_LOG_OFFSET	= 0x0C,
 	ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED  = (1 << 0),
-	ATA_LOG_NCQ_SEND_RECV_SIZE		= 0x10,
+	ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET	= 0x10,
+	ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OUT_SUPPORTED = (1 << 0),
+	ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED = (1 << 1),
+	ATA_LOG_NCQ_SEND_RECV_SIZE		= 0x14,
 
 	/* NCQ Non-Data log */
 	ATA_LOG_NCQ_NON_DATA_SUBCMDS_OFFSET	= 0x00,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 92297cd111f6..c0806b60c4fa 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1651,6 +1651,13 @@ static inline bool ata_fpdma_read_log_supported(struct ata_device *dev)
 		 ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED);
 }
 
+static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev)
+{
+	return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) &&
+		(dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET] &
+		ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED);
+}
+
 static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
 {
 	qc->tf.ctl |= ATA_NIEN;
diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h
index 8e77572350f0..77370a650c15 100644
--- a/include/trace/events/libata.h
+++ b/include/trace/events/libata.h
@@ -98,6 +98,7 @@
 		 ata_opcode_name(ATA_CMD_CFA_WRITE_MULT_NE),	\
 		 ata_opcode_name(ATA_CMD_REQ_SENSE_DATA),	\
 		 ata_opcode_name(ATA_CMD_SANITIZE_DEVICE),	\
+		 ata_opcode_name(ATA_CMD_ZAC_MGMT_IN),		\
 		 ata_opcode_name(ATA_CMD_RESTORE),		\
 		 ata_opcode_name(ATA_CMD_READ_LONG),		\
 		 ata_opcode_name(ATA_CMD_READ_LONG_ONCE),	\
-- 
cgit v1.2.3


From 27708a9579ee069c6e0ebb6e61ac1114ed1d546c Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:53 +0200
Subject: libata: Implement ZBC OUT translation

ZAC drives implement a 'ZAC Management Out' command template,
which maps onto the ZBC OUT command.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-eh.c       |  1 +
 drivers/ata/libata-scsi.c     | 67 +++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/libata-trace.c    | 16 +++++++++++
 include/linux/ata.h           |  7 +++++
 include/trace/events/libata.h |  1 +
 5 files changed, 92 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index ee6c572d2a4a..61dc7a99e89a 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2483,6 +2483,7 @@ const char *ata_get_cmd_descript(u8 command)
 		{ ATA_CMD_REQ_SENSE_DATA,	"REQUEST SENSE DATA EXT" },
 		{ ATA_CMD_SANITIZE_DEVICE,	"SANITIZE DEVICE" },
 		{ ATA_CMD_ZAC_MGMT_IN,		"ZAC MANAGEMENT IN" },
+		{ ATA_CMD_ZAC_MGMT_OUT,		"ZAC MANAGEMENT OUT" },
 		{ ATA_CMD_READ_LONG,		"READ LONG (with retries)" },
 		{ ATA_CMD_READ_LONG_ONCE,	"READ LONG (without retries)" },
 		{ ATA_CMD_WRITE_LONG,		"WRITE LONG (with retries)" },
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 06d5a62f507d..6afd0840ebbe 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3471,6 +3471,70 @@ invalid_param_len:
 	return 1;
 }
 
+static unsigned int ata_scsi_zbc_out_xlat(struct ata_queued_cmd *qc)
+{
+	struct ata_taskfile *tf = &qc->tf;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	struct ata_device *dev = qc->dev;
+	const u8 *cdb = scmd->cmnd;
+	u8 reset_all, sa;
+	u64 block;
+	u32 n_block;
+	u16 fp = (u16)-1;
+
+	if (unlikely(scmd->cmd_len < 16)) {
+		fp = 15;
+		goto invalid_fld;
+	}
+
+	sa = cdb[1] & 0x1f;
+	if ((sa != ZO_CLOSE_ZONE) && (sa != ZO_FINISH_ZONE) &&
+	    (sa != ZO_OPEN_ZONE) && (sa != ZO_RESET_WRITE_POINTER)) {
+		fp = 1;
+		goto invalid_fld;
+	}
+
+	scsi_16_lba_len(cdb, &block, &n_block);
+	if (n_block) {
+		/*
+		 * ZAC MANAGEMENT OUT doesn't define any length
+		 */
+		goto invalid_param_len;
+	}
+	if (block > dev->n_sectors)
+		goto out_of_range;
+
+	reset_all = cdb[14] & 0x1;
+
+	tf->protocol = ATA_PROT_NODATA;
+	tf->command = ATA_CMD_ZAC_MGMT_OUT;
+	tf->feature = sa;
+	tf->hob_feature = reset_all & 0x1;
+
+	tf->lbah = (block >> 16) & 0xff;
+	tf->lbam = (block >> 8) & 0xff;
+	tf->lbal = block & 0xff;
+	tf->hob_lbah = (block >> 40) & 0xff;
+	tf->hob_lbam = (block >> 32) & 0xff;
+	tf->hob_lbal = (block >> 24) & 0xff;
+	tf->device = ATA_LBA;
+	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48;
+
+	return 0;
+
+ invalid_fld:
+	ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
+	return 1;
+ out_of_range:
+	/* "Logical Block Address out of range" */
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x00);
+	return 1;
+invalid_param_len:
+	/* "Parameter list length error" */
+	ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
+	return 1;
+}
+
 /**
  *	ata_mselect_caching - Simulate MODE SELECT for caching info page
  *	@qc: Storage for translated ATA taskfile
@@ -3789,6 +3853,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
 	case ZBC_IN:
 		return ata_scsi_zbc_in_xlat;
 
+	case ZBC_OUT:
+		return ata_scsi_zbc_out_xlat;
+
 	case START_STOP:
 		return ata_scsi_start_stop_xlat;
 	}
diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c
index 9caeabd69ccb..1111ba7db5b3 100644
--- a/drivers/ata/libata-trace.c
+++ b/drivers/ata/libata-trace.c
@@ -197,6 +197,22 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd,
 			break;
 		}
 		break;
+	case ATA_CMD_ZAC_MGMT_OUT:
+		switch (feature) {
+		case ATA_SUBCMD_ZAC_MGMT_OUT_CLOSE_ZONE:
+			trace_seq_printf(p, " CLOSE_ZONE");
+			break;
+		case ATA_SUBCMD_ZAC_MGMT_OUT_FINISH_ZONE:
+			trace_seq_printf(p, " FINISH_ZONE");
+			break;
+		case ATA_SUBCMD_ZAC_MGMT_OUT_OPEN_ZONE:
+			trace_seq_printf(p, " OPEN_ZONE");
+			break;
+		case ATA_SUBCMD_ZAC_MGMT_OUT_RESET_WRITE_POINTER:
+			trace_seq_printf(p, " RESET_WRITE_POINTER");
+			break;
+		}
+		break;
 	}
 	trace_seq_putc(p, 0);
 
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 255aa0f1c9bc..9d7c47075ebc 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -303,6 +303,7 @@ enum {
 	ATA_CMD_REQ_SENSE_DATA  = 0x0B,
 	ATA_CMD_SANITIZE_DEVICE = 0xB4,
 	ATA_CMD_ZAC_MGMT_IN	= 0x4A,
+	ATA_CMD_ZAC_MGMT_OUT	= 0x9F,
 
 	/* marked obsolete in the ATA/ATAPI-7 spec */
 	ATA_CMD_RESTORE		= 0x10,
@@ -323,6 +324,12 @@ enum {
 	/* Subcmds for ATA_CMD_ZAC_MGMT_IN */
 	ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00,
 
+	/* Subcmds for ATA_CMD_ZAC_MGMT_OUT */
+	ATA_SUBCMD_ZAC_MGMT_OUT_CLOSE_ZONE = 0x01,
+	ATA_SUBCMD_ZAC_MGMT_OUT_FINISH_ZONE = 0x02,
+	ATA_SUBCMD_ZAC_MGMT_OUT_OPEN_ZONE = 0x03,
+	ATA_SUBCMD_ZAC_MGMT_OUT_RESET_WRITE_POINTER = 0x04,
+
 	/* READ_LOG_EXT pages */
 	ATA_LOG_DIRECTORY	= 0x0,
 	ATA_LOG_SATA_NCQ	= 0x10,
diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h
index 77370a650c15..75fff8696bae 100644
--- a/include/trace/events/libata.h
+++ b/include/trace/events/libata.h
@@ -99,6 +99,7 @@
 		 ata_opcode_name(ATA_CMD_REQ_SENSE_DATA),	\
 		 ata_opcode_name(ATA_CMD_SANITIZE_DEVICE),	\
 		 ata_opcode_name(ATA_CMD_ZAC_MGMT_IN),		\
+		 ata_opcode_name(ATA_CMD_ZAC_MGMT_OUT),		\
 		 ata_opcode_name(ATA_CMD_RESTORE),		\
 		 ata_opcode_name(ATA_CMD_READ_LONG),		\
 		 ata_opcode_name(ATA_CMD_READ_LONG_ONCE),	\
-- 
cgit v1.2.3


From 284b3b77ea883234dadb2cbf97b145c3c30fe4bd Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:54 +0200
Subject: libata: NCQ encapsulation for ZAC MANAGEMENT OUT

Add NCQ encapsulation for ZAC MANAGEMENT OUT and evaluate
NCQ Non-Data log pages to figure out if NCQ encapsulation
is supported.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c  | 43 ++++++++++++++++++++++++++++++++++++++++---
 drivers/ata/libata-scsi.c  | 18 +++++++++++++-----
 drivers/ata/libata-trace.c |  3 +++
 include/linux/ata.h        |  7 +++++++
 include/linux/libata.h     |  7 +++++++
 5 files changed, 70 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index b2bd7c499653..1528c7cc0089 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2121,6 +2121,40 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev)
 	}
 }
 
+static void ata_dev_config_ncq_non_data(struct ata_device *dev)
+{
+	struct ata_port *ap = dev->link->ap;
+	unsigned int err_mask;
+	int log_index = ATA_LOG_NCQ_NON_DATA * 2;
+	u16 log_pages;
+
+	err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY,
+				     0, ap->sector_buf, 1);
+	if (err_mask) {
+		ata_dev_dbg(dev,
+			    "failed to get Log Directory Emask 0x%x\n",
+			    err_mask);
+		return;
+	}
+	log_pages = get_unaligned_le16(&ap->sector_buf[log_index]);
+	if (!log_pages) {
+		ata_dev_warn(dev,
+			     "NCQ Send/Recv Log not supported\n");
+		return;
+	}
+	err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA,
+				     0, ap->sector_buf, 1);
+	if (err_mask) {
+		ata_dev_dbg(dev,
+			    "failed to get NCQ Non-Data Log Emask 0x%x\n",
+			    err_mask);
+	} else {
+		u8 *cmds = dev->ncq_non_data_cmds;
+
+		memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE);
+	}
+}
+
 static int ata_dev_config_ncq(struct ata_device *dev,
 			       char *desc, size_t desc_sz)
 {
@@ -2165,9 +2199,12 @@ static int ata_dev_config_ncq(struct ata_device *dev,
 		snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth,
 			ddepth, aa_desc);
 
-	if ((ap->flags & ATA_FLAG_FPDMA_AUX) &&
-	    ata_id_has_ncq_send_and_recv(dev->id))
-		ata_dev_config_ncq_send_recv(dev);
+	if ((ap->flags & ATA_FLAG_FPDMA_AUX)) {
+		if (ata_id_has_ncq_send_and_recv(dev->id))
+			ata_dev_config_ncq_send_recv(dev);
+		if (ata_id_has_ncq_non_data(dev->id))
+			ata_dev_config_ncq_non_data(dev);
+	}
 
 	return 0;
 }
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 6afd0840ebbe..43403aa0801f 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3506,11 +3506,19 @@ static unsigned int ata_scsi_zbc_out_xlat(struct ata_queued_cmd *qc)
 
 	reset_all = cdb[14] & 0x1;
 
-	tf->protocol = ATA_PROT_NODATA;
-	tf->command = ATA_CMD_ZAC_MGMT_OUT;
-	tf->feature = sa;
-	tf->hob_feature = reset_all & 0x1;
-
+	if (ata_ncq_enabled(qc->dev) &&
+	    ata_fpdma_zac_mgmt_out_supported(qc->dev)) {
+		tf->protocol = ATA_PROT_NCQ;
+		tf->command = ATA_CMD_NCQ_NON_DATA;
+		tf->hob_nsect = ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT;
+		tf->nsect = qc->tag << 3;
+		tf->auxiliary = sa | (reset_all & 0x1) << 8;
+	} else {
+		tf->protocol = ATA_PROT_NODATA;
+		tf->command = ATA_CMD_ZAC_MGMT_OUT;
+		tf->feature = sa;
+		tf->hob_feature = reset_all & 0x1;
+	}
 	tf->lbah = (block >> 16) & 0xff;
 	tf->lbam = (block >> 8) & 0xff;
 	tf->lbal = block & 0xff;
diff --git a/drivers/ata/libata-trace.c b/drivers/ata/libata-trace.c
index 1111ba7db5b3..f8c550df0615 100644
--- a/drivers/ata/libata-trace.c
+++ b/drivers/ata/libata-trace.c
@@ -188,6 +188,9 @@ libata_trace_parse_subcmd(struct trace_seq *p, unsigned char cmd,
 		case ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT:
 			trace_seq_printf(p, " ZERO_EXT");
 			break;
+		case ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT:
+			trace_seq_printf(p, " ZAC_MGMT_OUT");
+			break;
 		}
 		break;
 	case ATA_CMD_ZAC_MGMT_IN:
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 9d7c47075ebc..e62703201e84 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -320,6 +320,7 @@ enum {
 	ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE  = 0x00,
 	ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05,
 	ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT     = 0x06,
+	ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT = 0x07,
 
 	/* Subcmds for ATA_CMD_ZAC_MGMT_IN */
 	ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00,
@@ -333,6 +334,7 @@ enum {
 	/* READ_LOG_EXT pages */
 	ATA_LOG_DIRECTORY	= 0x0,
 	ATA_LOG_SATA_NCQ	= 0x10,
+	ATA_LOG_NCQ_NON_DATA	  = 0x12,
 	ATA_LOG_NCQ_SEND_RECV	  = 0x13,
 	ATA_LOG_SATA_ID_DEV_DATA  = 0x30,
 	ATA_LOG_SATA_SETTINGS	  = 0x08,
@@ -877,6 +879,11 @@ static inline bool ata_id_has_ncq_send_and_recv(const u16 *id)
 	return id[ATA_ID_SATA_CAPABILITY_2] & BIT(6);
 }
 
+static inline bool ata_id_has_ncq_non_data(const u16 *id)
+{
+	return id[ATA_ID_SATA_CAPABILITY_2] & BIT(5);
+}
+
 static inline bool ata_id_has_trim(const u16 *id)
 {
 	if (ata_id_major_version(id) >= 7 &&
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c0806b60c4fa..0019d4b51b11 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -729,6 +729,7 @@ struct ata_device {
 
 	/* NCQ send and receive log subcommand support */
 	u8			ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE];
+	u8			ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE];
 
 	/* error history */
 	int			spdn_cnt;
@@ -1658,6 +1659,12 @@ static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev)
 		ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED);
 }
 
+static inline bool ata_fpdma_zac_mgmt_out_supported(struct ata_device *dev)
+{
+	return (dev->ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET] &
+		ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT);
+}
+
 static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
 {
 	qc->tf.ctl |= ATA_NIEN;
-- 
cgit v1.2.3


From 856c4663930988118d9f355aad66811dd6df06de Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:55 +0200
Subject: libata: support device-managed ZAC devices

Device-managed ZAC devices just set the zoned capabilities field
in INQUIRY byte 69 (cf ACS-4). This corresponds to the 'zoned'
field in the block device characteristics VPD page.
As this is only defined in SPC-5/SBC-4 we also need to update
the supported SCSI version descriptor.

Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-scsi.c | 19 ++++++++++---------
 include/linux/ata.h       |  5 +++++
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 43403aa0801f..96abd42c9985 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2082,14 +2082,14 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
 		0x00,
 		0xA0,	/* SAM-5 (no version claimed) */
 
-		0x04,
-		0xC0,	/* SBC-3 (no version claimed) */
+		0x06,
+		0x00,	/* SBC-4 (no version claimed) */
 
-		0x04,
-		0x60,	/* SPC-4 (no version claimed) */
+		0x05,
+		0xC0,	/* SPC-5 (no version claimed) */
 
 		0x60,
-		0x20,   /* ZBC (no version claimed) */
+		0x24,   /* ZBC r05 */
 	};
 
 	u8 hdr[] = {
@@ -2109,10 +2109,8 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
 	    (args->dev->link->ap->pflags & ATA_PFLAG_EXTERNAL))
 		hdr[1] |= (1 << 7);
 
-	if (args->dev->class == ATA_DEV_ZAC) {
+	if (args->dev->class == ATA_DEV_ZAC)
 		hdr[0] = TYPE_ZBC;
-		hdr[2] = 0x6; /* ZBC is defined in SPC-4 */
-	}
 
 	memcpy(rbuf, hdr, sizeof(hdr));
 	memcpy(&rbuf[8], "ATA     ", 8);
@@ -2126,7 +2124,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
 	if (rbuf[32] == 0 || rbuf[32] == ' ')
 		memcpy(&rbuf[32], "n/a ", 4);
 
-	if (args->dev->class == ATA_DEV_ZAC)
+	if (ata_id_zoned_cap(args->id) || args->dev->class == ATA_DEV_ZAC)
 		memcpy(rbuf + 58, versions_zbc, sizeof(versions_zbc));
 	else
 		memcpy(rbuf + 58, versions, sizeof(versions));
@@ -2322,12 +2320,15 @@ static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
 {
 	int form_factor = ata_id_form_factor(args->id);
 	int media_rotation_rate = ata_id_rotation_rate(args->id);
+	u8 zoned = ata_id_zoned_cap(args->id);
 
 	rbuf[1] = 0xb1;
 	rbuf[3] = 0x3c;
 	rbuf[4] = media_rotation_rate >> 8;
 	rbuf[5] = media_rotation_rate;
 	rbuf[7] = form_factor;
+	if (zoned)
+		rbuf[8] = (zoned << 4);
 
 	return 0;
 }
diff --git a/include/linux/ata.h b/include/linux/ata.h
index e62703201e84..ac1cb9310dea 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -935,6 +935,11 @@ static inline bool ata_id_is_ssd(const u16 *id)
 	return id[ATA_ID_ROT_SPEED] == 0x01;
 }
 
+static inline u8 ata_id_zoned_cap(const u16 *id)
+{
+	return (id[ATA_ID_ADDITIONAL_SUPP] & 0x3);
+}
+
 static inline bool ata_id_pio_need_iordy(const u16 *id, const u8 pio)
 {
 	/* CF spec. r4.1 Table 22 says no IORDY on PIO5 and PIO6. */
-- 
cgit v1.2.3


From 6d1003ae8db228b74ef61536364cd2a1bd973dd8 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 25 Apr 2016 12:45:56 +0200
Subject: libata: support host-aware and host-managed ZAC devices

Byte 69 bits 0:1 in the IDENTIFY DEVICE data indicate a
host-aware ZAC device.
Host-managed ZAC devices have their own individual signature,
and to not set the bits in the IDENTIFY DEVICE data.
And whenever we detect a ZAC-compatible device we should
be displaying the zoned block characteristics VPD page.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/libata-scsi.c | 38 ++++++++++++++++++-
 include/linux/ata.h       |  1 +
 include/linux/libata.h    |  7 ++++
 4 files changed, 138 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 1528c7cc0089..97f31707b570 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2227,6 +2227,99 @@ static void ata_dev_config_sense_reporting(struct ata_device *dev)
 	}
 }
 
+static void ata_dev_config_zac(struct ata_device *dev)
+{
+	struct ata_port *ap = dev->link->ap;
+	unsigned int err_mask;
+	u8 *identify_buf = ap->sector_buf;
+	int log_index = ATA_LOG_SATA_ID_DEV_DATA * 2, i, found = 0;
+	u16 log_pages;
+
+	dev->zac_zones_optimal_open = U32_MAX;
+	dev->zac_zones_optimal_nonseq = U32_MAX;
+	dev->zac_zones_max_open = U32_MAX;
+
+	/*
+	 * Always set the 'ZAC' flag for Host-managed devices.
+	 */
+	if (dev->class == ATA_DEV_ZAC)
+		dev->flags |= ATA_DFLAG_ZAC;
+	else if (ata_id_zoned_cap(dev->id) == 0x01)
+		/*
+		 * Check for host-aware devices.
+		 */
+		dev->flags |= ATA_DFLAG_ZAC;
+
+	if (!(dev->flags & ATA_DFLAG_ZAC))
+		return;
+
+	/*
+	 * Read Log Directory to figure out if IDENTIFY DEVICE log
+	 * is supported.
+	 */
+	err_mask = ata_read_log_page(dev, ATA_LOG_DIRECTORY,
+				     0, ap->sector_buf, 1);
+	if (err_mask) {
+		ata_dev_info(dev,
+			     "failed to get Log Directory Emask 0x%x\n",
+			     err_mask);
+		return;
+	}
+	log_pages = get_unaligned_le16(&ap->sector_buf[log_index]);
+	if (log_pages == 0) {
+		ata_dev_warn(dev,
+			     "ATA Identify Device Log not supported\n");
+		return;
+	}
+	/*
+	 * Read IDENTIFY DEVICE data log, page 0, to figure out
+	 * if page 9 is supported.
+	 */
+	err_mask = ata_read_log_page(dev, ATA_LOG_SATA_ID_DEV_DATA, 0,
+				     identify_buf, 1);
+	if (err_mask) {
+		ata_dev_info(dev,
+			     "failed to get Device Identify Log Emask 0x%x\n",
+			     err_mask);
+		return;
+	}
+	log_pages = identify_buf[8];
+	for (i = 0; i < log_pages; i++) {
+		if (identify_buf[9 + i] == ATA_LOG_ZONED_INFORMATION) {
+			found++;
+			break;
+		}
+	}
+	if (!found) {
+		ata_dev_warn(dev,
+			     "ATA Zoned Information Log not supported\n");
+		return;
+	}
+
+	/*
+	 * Read IDENTIFY DEVICE data log, page 9 (Zoned-device information)
+	 */
+	err_mask = ata_read_log_page(dev, ATA_LOG_SATA_ID_DEV_DATA,
+				     ATA_LOG_ZONED_INFORMATION,
+				     identify_buf, 1);
+	if (!err_mask) {
+		u64 zoned_cap, opt_open, opt_nonseq, max_open;
+
+		zoned_cap = get_unaligned_le64(&identify_buf[8]);
+		if ((zoned_cap >> 63))
+			dev->zac_zoned_cap = (zoned_cap & 1);
+		opt_open = get_unaligned_le64(&identify_buf[24]);
+		if ((opt_open >> 63))
+			dev->zac_zones_optimal_open = (u32)opt_open;
+		opt_nonseq = get_unaligned_le64(&identify_buf[32]);
+		if ((opt_nonseq >> 63))
+			dev->zac_zones_optimal_nonseq = (u32)opt_nonseq;
+		max_open = get_unaligned_le64(&identify_buf[40]);
+		if ((max_open >> 63))
+			dev->zac_zones_max_open = (u32)max_open;
+	}
+}
+
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
  *	@dev: Target device to configure
@@ -2450,6 +2543,7 @@ int ata_dev_configure(struct ata_device *dev)
 				}
 		}
 		ata_dev_config_sense_reporting(dev);
+		ata_dev_config_zac(dev);
 		dev->cdb_len = 16;
 	}
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 96abd42c9985..b86af1416dce 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2144,6 +2144,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
  */
 static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 {
+	int num_pages;
 	const u8 pages[] = {
 		0x00,	/* page 0x00, this page */
 		0x80,	/* page 0x80, unit serial no page */
@@ -2152,10 +2153,14 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 		0xb0,	/* page 0xb0, block limits page */
 		0xb1,	/* page 0xb1, block device characteristics page */
 		0xb2,	/* page 0xb2, thin provisioning page */
+		0xb6,	/* page 0xb6, zoned block device characteristics */
 	};
 
-	rbuf[3] = sizeof(pages);	/* number of supported VPD pages */
-	memcpy(rbuf + 4, pages, sizeof(pages));
+	num_pages = sizeof(pages);
+	if (!(args->dev->flags & ATA_DFLAG_ZAC))
+		num_pages--;
+	rbuf[3] = num_pages;	/* number of supported VPD pages */
+	memcpy(rbuf + 4, pages, num_pages);
 	return 0;
 }
 
@@ -2343,6 +2348,26 @@ static unsigned int ata_scsiop_inq_b2(struct ata_scsi_args *args, u8 *rbuf)
 	return 0;
 }
 
+static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
+{
+	/*
+	 * zbc-r05 SCSI Zoned Block device characteristics VPD page
+	 */
+	rbuf[1] = 0xb6;
+	rbuf[3] = 0x3C;
+
+	/*
+	 * URSWRZ bit is only meaningful for host-managed ZAC drives
+	 */
+	if (args->dev->zac_zoned_cap & 1)
+		rbuf[4] |= 1;
+	put_unaligned_be32(args->dev->zac_zones_optimal_open, &rbuf[8]);
+	put_unaligned_be32(args->dev->zac_zones_optimal_nonseq, &rbuf[12]);
+	put_unaligned_be32(args->dev->zac_zones_max_open, &rbuf[16]);
+
+	return 0;
+}
+
 /**
  *	ata_scsiop_noop - Command handler that simply returns success.
  *	@args: device IDENTIFY data / SCSI command of interest.
@@ -2661,6 +2686,9 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 				rbuf[14] |= 0x40; /* LBPRZ */
 			}
 		}
+		if (ata_id_zoned_cap(args->id) ||
+		    args->dev->class == ATA_DEV_ZAC)
+			rbuf[12] = (1 << 4); /* RC_BASIS */
 	}
 	return 0;
 }
@@ -4046,6 +4074,12 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
 		case 0xb2:
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
 			break;
+		case 0xb6:
+			if (dev->flags & ATA_DFLAG_ZAC) {
+				ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6);
+				break;
+			}
+			/* Fallthrough */
 		default:
 			ata_scsi_invalid_field(dev, cmd, 2);
 			break;
diff --git a/include/linux/ata.h b/include/linux/ata.h
index ac1cb9310dea..83e2a99866c2 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -338,6 +338,7 @@ enum {
 	ATA_LOG_NCQ_SEND_RECV	  = 0x13,
 	ATA_LOG_SATA_ID_DEV_DATA  = 0x30,
 	ATA_LOG_SATA_SETTINGS	  = 0x08,
+	ATA_LOG_ZONED_INFORMATION = 0x09,
 	ATA_LOG_DEVSLP_OFFSET	  = 0x30,
 	ATA_LOG_DEVSLP_SIZE	  = 0x08,
 	ATA_LOG_DEVSLP_MDAT	  = 0x00,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0019d4b51b11..d15c19e331d1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -181,6 +181,7 @@ enum {
 	ATA_DFLAG_DEVSLP	= (1 << 27), /* device supports Device Sleep */
 	ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */
 	ATA_DFLAG_D_SENSE	= (1 << 29), /* Descriptor sense requested */
+	ATA_DFLAG_ZAC		= (1 << 30), /* ZAC device */
 
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
@@ -731,6 +732,12 @@ struct ata_device {
 	u8			ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE];
 	u8			ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE];
 
+	/* ZAC zone configuration */
+	u32			zac_zoned_cap;
+	u32			zac_zones_optimal_open;
+	u32			zac_zones_optimal_nonseq;
+	u32			zac_zones_max_open;
+
 	/* error history */
 	int			spdn_cnt;
 	/* ering is CLEAR_END, read comment above CLEAR_END */
-- 
cgit v1.2.3


From 59fa0224cfea31dde596e29555de94c961b139f9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:15 -0700
Subject: blk-throttle: don't parse cgroup path if trace isn't enabled

if trace isn't enabled, parsing cgroup path just wastes cpu

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-throttle.c         | 5 ++---
 include/linux/blktrace_api.h | 9 +++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2149a1ddbacf..47a3e540631a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -211,15 +211,14 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  *
  * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
  * throtl_grp; otherwise, just "throtl".
- *
- * TODO: this should be made a function and name formatting should happen
- * after testing whether blktrace is enabled.
  */
 #define throtl_log(sq, fmt, args...)	do {				\
 	struct throtl_grp *__tg = sq_to_tg((sq));			\
 	struct throtl_data *__td = sq_to_td((sq));			\
 									\
 	(void)__td;							\
+	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
+		break;							\
 	if ((__tg)) {							\
 		char __pbuf[128];					\
 									\
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index afc1343df3c7..0f3172b8b225 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -57,6 +57,14 @@ void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 	} while (0)
 #define BLK_TN_MAX_MSG		128
 
+static inline bool blk_trace_note_message_enabled(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+	if (likely(!bt))
+		return false;
+	return bt->act_mask & BLK_TC_NOTIFY;
+}
+
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
 				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
@@ -79,6 +87,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
 # define blk_trace_remove_sysfs(dev)			do { } while (0)
+# define blk_trace_note_message_enabled(q)		(false)
 static inline int blk_trace_init_sysfs(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From 9d9a77cee1ab53dc6419b1ab9da88c4e9342d26a Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Tue, 10 May 2016 00:19:41 +0200
Subject: net: phy: add phy_ethtool_{get|set}_link_ksettings

Ethtool callbacks {get|set}_link_ksettings are often the same, so
we add two generics functions phy_ethtool_{get|set}_link_ksettings
to avoid writing severals times the same function.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Acked-By: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 24 ++++++++++++++++++++++++
 include/linux/phy.h   |  4 ++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 6f221c8c2a7f..603e8db50162 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1347,3 +1347,27 @@ void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)
 		phydev->drv->get_wol(phydev, wol);
 }
 EXPORT_SYMBOL(phy_ethtool_get_wol);
+
+int phy_ethtool_get_link_ksettings(struct net_device *ndev,
+				   struct ethtool_link_ksettings *cmd)
+{
+	struct phy_device *phydev = ndev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_ksettings_get(phydev, cmd);
+}
+EXPORT_SYMBOL(phy_ethtool_get_link_ksettings);
+
+int phy_ethtool_set_link_ksettings(struct net_device *ndev,
+				   const struct ethtool_link_ksettings *cmd)
+{
+	struct phy_device *phydev = ndev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_ksettings_set(phydev, cmd);
+}
+EXPORT_SYMBOL(phy_ethtool_set_link_ksettings);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index be3f83bbdc0b..2d24b283aa2d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -829,6 +829,10 @@ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data);
 int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol);
 void phy_ethtool_get_wol(struct phy_device *phydev,
 			 struct ethtool_wolinfo *wol);
+int phy_ethtool_get_link_ksettings(struct net_device *ndev,
+				   struct ethtool_link_ksettings *cmd);
+int phy_ethtool_set_link_ksettings(struct net_device *ndev,
+				   const struct ethtool_link_ksettings *cmd);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-- 
cgit v1.2.3


From 1dee3f59a8d5e711797dc82628aaf94a64e99922 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 9 May 2016 11:40:20 +0200
Subject: block/drbd: align properly u64 in nl messages

The attribute 0 is never used in drbd, so let's use it as pad attribute
in netlink messages. This minimizes the patch.

Note that this patch is only compile-tested.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/drbd/drbd_nl.c      | 28 ++++++++++++++++------------
 include/linux/genl_magic_struct.h |  7 ++++++-
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 1fd1dccebb6b..0bac9c8246bc 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -3633,14 +3633,15 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 		goto nla_put_failure;
 	if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
 	    nla_put_u32(skb, T_current_state, device->state.i) ||
-	    nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
-	    nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
-	    nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
-	    nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
-	    nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
-	    nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
-	    nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
-	    nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_ed_uuid, device->ed_uuid) ||
+	    nla_put_u64_0pad(skb, T_capacity,
+			     drbd_get_capacity(device->this_bdev)) ||
+	    nla_put_u64_0pad(skb, T_send_cnt, device->send_cnt) ||
+	    nla_put_u64_0pad(skb, T_recv_cnt, device->recv_cnt) ||
+	    nla_put_u64_0pad(skb, T_read_cnt, device->read_cnt) ||
+	    nla_put_u64_0pad(skb, T_writ_cnt, device->writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
 	    nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
 	    nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
 	    nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
@@ -3657,13 +3658,16 @@ static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 			goto nla_put_failure;
 
 		if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
-		    nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
-		    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+		    nla_put_u64_0pad(skb, T_bits_total, drbd_bm_bits(device)) ||
+		    nla_put_u64_0pad(skb, T_bits_oos,
+				     drbd_bm_total_weight(device)))
 			goto nla_put_failure;
 		if (C_SYNC_SOURCE <= device->state.conn &&
 		    C_PAUSED_SYNC_T >= device->state.conn) {
-			if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
-			    nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+			if (nla_put_u64_0pad(skb, T_bits_rs_total,
+					     device->rs_total) ||
+			    nla_put_u64_0pad(skb, T_bits_rs_failed,
+					     device->rs_failed))
 				goto nla_put_failure;
 		}
 	}
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
index eecd19b37001..6270a56e5edc 100644
--- a/include/linux/genl_magic_struct.h
+++ b/include/linux/genl_magic_struct.h
@@ -62,6 +62,11 @@ extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
 
 /* MAGIC helpers							{{{2 */
 
+static inline int nla_put_u64_0pad(struct sk_buff *skb, int attrtype, u64 value)
+{
+	return nla_put_64bit(skb, attrtype, sizeof(u64), &value, 0);
+}
+
 /* possible field types */
 #define __flg_field(attr_nr, attr_flag, name) \
 	__field(attr_nr, attr_flag, name, NLA_U8, char, \
@@ -80,7 +85,7 @@ extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
 			nla_get_u32, nla_put_u32, true)
 #define __u64_field(attr_nr, attr_flag, name)	\
 	__field(attr_nr, attr_flag, name, NLA_U64, __u64, \
-			nla_get_u64, nla_put_u64, false)
+			nla_get_u64, nla_put_u64_0pad, false)
 #define __str_field(attr_nr, attr_flag, name, maxlen) \
 	__array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \
 			nla_strlcpy, nla_put, false)
-- 
cgit v1.2.3


From e5366a266a8cd4cd6b0fe66876462cca2e1c6a89 Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 6 May 2016 08:37:41 -0700
Subject: mtd: spi-nor: support GigaDevice gd25lq64c

Also note the GigaDevice JEDEC ID.

No write-protect support yet, since this flash uses a different status
register layout.

Cc: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Acked-by: Marek Vasut <marex@denx.de>
---
 drivers/mtd/spi-nor/spi-nor.c | 1 +
 include/linux/mtd/spi-nor.h   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 157841dc3e99..c52e45594bfd 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -832,6 +832,7 @@ static const struct flash_info spi_nor_ids[] = {
 	/* GigaDevice */
 	{ "gd25q32", INFO(0xc84016, 0, 64 * 1024,  64, SECT_4K) },
 	{ "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
+	{ "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
 	{ "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) },
 
 	/* Intel/Numonyx -- xxxs33b */
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 3c36113a88e1..7f041bd88b82 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -21,6 +21,7 @@
  * Sometimes these are the same as CFI IDs, but sometimes they aren't.
  */
 #define SNOR_MFR_ATMEL		CFI_MFR_ATMEL
+#define SNOR_MFR_GIGADEVICE	0xc8
 #define SNOR_MFR_INTEL		CFI_MFR_INTEL
 #define SNOR_MFR_MICRON		CFI_MFR_ST /* ST Micro <--> Micron */
 #define SNOR_MFR_MACRONIX	CFI_MFR_MACRONIX
-- 
cgit v1.2.3


From 496aec577b5183716ed9d8bcc853ad9003485fe8 Mon Sep 17 00:00:00 2001
From: Christian Daudt <csd@broadcom.com>
Date: Wed, 4 May 2016 17:55:20 -0700
Subject: brcmfmac: Add 4356 sdio support

This adds support for the 4356-sdio wireless chip.

Signed-off-by: Christian Daudt <csd@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c   | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c   | 4 +++-
 include/linux/mmc/sdio_ids.h                              | 1 +
 4 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 2fc0597f2cd0..c7550dab6a23 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -1098,6 +1098,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356),
 	{ /* end: all zeroes */ }
 };
 MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
index 0e8f2a079907..d3fd6b1db1d9 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
@@ -1333,6 +1333,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub)
 
 	switch (pub->chip) {
 	case BRCM_CC_4354_CHIP_ID:
+	case BRCM_CC_4356_CHIP_ID:
 		/* explicitly check SR engine enable bit */
 		pmu_cc3_mask = BIT(2);
 		/* fall-through */
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 4252fa82b89c..67e69bff2545 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -609,6 +609,7 @@ BRCMF_FW_NVRAM_DEF(4339, "brcmfmac4339-sdio.bin", "brcmfmac4339-sdio.txt");
 BRCMF_FW_NVRAM_DEF(43430, "brcmfmac43430-sdio.bin", "brcmfmac43430-sdio.txt");
 BRCMF_FW_NVRAM_DEF(43455, "brcmfmac43455-sdio.bin", "brcmfmac43455-sdio.txt");
 BRCMF_FW_NVRAM_DEF(4354, "brcmfmac4354-sdio.bin", "brcmfmac4354-sdio.txt");
+BRCMF_FW_NVRAM_DEF(4356, "brcmfmac4356-sdio.bin", "brcmfmac4356-sdio.txt");
 
 static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
 	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143),
@@ -624,7 +625,8 @@ static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
 	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4339_CHIP_ID, 0xFFFFFFFF, 4339),
 	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43430_CHIP_ID, 0xFFFFFFFF, 43430),
 	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4345_CHIP_ID, 0xFFFFFFC0, 43455),
-	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354)
+	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354),
+	BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356)
 };
 
 static void pkt_align(struct sk_buff *p, int len, int align)
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 83430f2ea757..0d126aeb3ec0 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -36,6 +36,7 @@
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
 #define SDIO_DEVICE_ID_BROADCOM_4345		0x4345
 #define SDIO_DEVICE_ID_BROADCOM_4354		0x4354
+#define SDIO_DEVICE_ID_BROADCOM_4356		0x4356
 
 #define SDIO_VENDOR_ID_INTEL			0x0089
 #define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX	0x1402
-- 
cgit v1.2.3


From 5d749d0bbe811c10d9048cde6dfebc761713abfd Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Tue, 8 Mar 2016 09:13:52 -0800
Subject: platform/chrome: cros_ec_dev - Fix security issue

Prevent memory scribble by checking that ioctl buffer size parameters
are sane.
Without this check, on 32 bits system, if .insize = 0xffffffff - 20 and
.outsize the amount to scribble, we would overflow, allocate a small
amounts and be able to write outside of the malloc'ed area.
Adding a hard limit allows argument checking of the ioctl. With the
current EC, it is expected .insize and .outsize to be at around 512 bytes
or less.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 drivers/platform/chrome/cros_ec_dev.c   | 4 ++++
 drivers/platform/chrome/cros_ec_proto.c | 4 ++--
 include/linux/mfd/cros_ec.h             | 6 ++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c
index d45cd254ed1c..187470c8a1f6 100644
--- a/drivers/platform/chrome/cros_ec_dev.c
+++ b/drivers/platform/chrome/cros_ec_dev.c
@@ -137,6 +137,10 @@ static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg)
 	if (copy_from_user(&u_cmd, arg, sizeof(u_cmd)))
 		return -EFAULT;
 
+	if ((u_cmd.outsize > EC_MAX_MSG_BYTES) ||
+	    (u_cmd.insize > EC_MAX_MSG_BYTES))
+		return -EINVAL;
+
 	s_cmd = kmalloc(sizeof(*s_cmd) + max(u_cmd.outsize, u_cmd.insize),
 			GFP_KERNEL);
 	if (!s_cmd)
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 990308ca384f..b6e161f71b26 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -298,8 +298,8 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 			ec_dev->max_response = EC_PROTO2_MAX_PARAM_SIZE;
 			ec_dev->max_passthru = 0;
 			ec_dev->pkt_xfer = NULL;
-			ec_dev->din_size = EC_MSG_BYTES;
-			ec_dev->dout_size = EC_MSG_BYTES;
+			ec_dev->din_size = EC_PROTO2_MSG_BYTES;
+			ec_dev->dout_size = EC_PROTO2_MSG_BYTES;
 		} else {
 			/*
 			 * It's possible for a test to occur too early when
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index a677c2bd485c..64184d27e3cd 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -50,9 +50,11 @@ enum {
 					EC_MSG_TX_TRAILER_BYTES,
 	EC_MSG_RX_PROTO_BYTES	= 3,
 
-	/* Max length of messages */
-	EC_MSG_BYTES		= EC_PROTO2_MAX_PARAM_SIZE +
+	/* Max length of messages for proto 2*/
+	EC_PROTO2_MSG_BYTES		= EC_PROTO2_MAX_PARAM_SIZE +
 					EC_MSG_TX_PROTO_BYTES,
+
+	EC_MAX_MSG_BYTES		= 64 * 1024,
 };
 
 /*
-- 
cgit v1.2.3


From 9b9e3fc4d5a31f6050508f2404369beac4356867 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 18:11:54 +0200
Subject: KVM: remove NULL return path for vcpu ids >= KVM_MAX_VCPUS

Commit c896939f7cff ("KVM: use heuristic for fast VCPU lookup by id") added
a return path that prevents vcpu ids to exceed KVM_MAX_VCPUS. This is a
problem for powerpc where vcpu ids can grow up to 8*KVM_MAX_VCPUS.

This patch simply reverses the logic so that we only try fast path if the
vcpu id can be tried as an index in kvm->vcpus[]. The slow path is not
affected by the change.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad40d44784c7..0a0e00d9c5da 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -447,12 +447,13 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 
 static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 {
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu = NULL;
 	int i;
 
-	if (id < 0 || id >= KVM_MAX_VCPUS)
+	if (id < 0)
 		return NULL;
-	vcpu = kvm_get_vcpu(kvm, id);
+	if (id < KVM_MAX_VCPUS)
+		vcpu = kvm_get_vcpu(kvm, id);
 	if (vcpu && vcpu->vcpu_id == id)
 		return vcpu;
 	kvm_for_each_vcpu(i, vcpu, kvm)
-- 
cgit v1.2.3


From 0b1b1dfd52a67f4f09a18cb82337199bc90ad7fb Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 18:13:37 +0200
Subject: kvm: introduce KVM_MAX_VCPU_ID

The KVM_MAX_VCPUS define provides the maximum number of vCPUs per guest, and
also the upper limit for vCPU ids. This is okay for all archs except PowerPC
which can have higher ids, depending on the cpu/core/thread topology. In the
worst case (single threaded guest, host with 8 threads per core), it limits
the maximum number of vCPUS to KVM_MAX_VCPUS / 8.

This patch separates the vCPU numbering from the total number of vCPUs, with
the introduction of KVM_MAX_VCPU_ID, as the maximal valid value for vCPU ids
plus one.

The corresponding KVM_CAP_MAX_VCPU_ID allows userspace to validate vCPU ids
before passing them to KVM_CREATE_VCPU.

This patch only implements KVM_MAX_VCPU_ID with a specific value for PowerPC.
Other archs continue to return KVM_MAX_VCPUS instead.

Suggested-by: Radim Krcmar <rkrcmar@redhat.com>
Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt   | 10 ++++++++--
 arch/powerpc/include/asm/kvm_host.h |  3 +++
 include/linux/kvm_host.h            |  4 ++++
 include/uapi/linux/kvm.h            |  1 +
 virt/kvm/kvm_main.c                 |  4 +++-
 5 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 8cc857fffcc7..a4482cce4bae 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -199,8 +199,8 @@ Type: vm ioctl
 Parameters: vcpu id (apic id on x86)
 Returns: vcpu fd on success, -1 on error
 
-This API adds a vcpu to a virtual machine.  The vcpu id is a small integer
-in the range [0, max_vcpus).
+This API adds a vcpu to a virtual machine. No more than max_vcpus may be added.
+The vcpu id is an integer in the range [0, max_vcpu_id).
 
 The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of
 the KVM_CHECK_EXTENSION ioctl() at run-time.
@@ -212,6 +212,12 @@ cpus max.
 If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is
 same as the value returned from KVM_CAP_NR_VCPUS.
 
+The maximum possible value for max_vcpu_id can be retrieved using the
+KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time.
+
+If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id
+is the same as the value returned from KVM_CAP_MAX_VCPUS.
+
 On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
 threads in one or more virtual CPU cores.  (This is because the
 hardware requires all the hardware threads in a CPU core to be in the
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..a07645c17818 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -40,6 +40,9 @@
 #define KVM_MAX_VCORES		NR_CPUS
 #define KVM_USER_MEM_SLOTS	512
 
+#include <asm/cputhreads.h>
+#define KVM_MAX_VCPU_ID                (threads_per_subcore * KVM_MAX_VCORES)
+
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0a0e00d9c5da..352889d6e322 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,10 @@
 
 #include <asm/kvm_host.h>
 
+#ifndef KVM_MAX_VCPU_ID
+#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
+#endif
+
 /*
  * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
  * in kvm, other bits are visible for userspace which are defined in
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a7f1f8032ec1..05ebf475104c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -865,6 +865,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SPAPR_TCE_64 125
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
+#define KVM_CAP_MAX_VCPU_ID 128
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4fd482fb9260..ed3d9bb18a56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2272,7 +2272,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	int r;
 	struct kvm_vcpu *vcpu;
 
-	if (id >= KVM_MAX_VCPUS)
+	if (id >= KVM_MAX_VCPU_ID)
 		return -EINVAL;
 
 	vcpu = kvm_arch_vcpu_create(kvm, id);
@@ -2746,6 +2746,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_MULTI_ADDRESS_SPACE:
 		return KVM_ADDRESS_SPACE_NUM;
 #endif
+	case KVM_CAP_MAX_VCPU_ID:
+		return KVM_MAX_VCPU_ID;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From b52f3ed02221252d8ee2c7d756e76fad4a5e84f6 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 5 May 2016 11:58:29 -0600
Subject: irqbypass: Disallow NULL token

A NULL token is meaningless and can only lead to unintended problems.
Error on registration with a NULL token, ignore de-registrations with
a NULL token.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/irqbypass.h |  4 ++--
 virt/lib/irqbypass.c      | 12 +++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index 1551b5b2f4c2..f0f5d2671509 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -34,7 +34,7 @@ struct irq_bypass_consumer;
 /**
  * struct irq_bypass_producer - IRQ bypass producer definition
  * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer
+ * @token: opaque token to match between producer and consumer (non-NULL)
  * @irq: Linux IRQ number for the producer device
  * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
  * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@@ -60,7 +60,7 @@ struct irq_bypass_producer {
 /**
  * struct irq_bypass_consumer - IRQ bypass consumer definition
  * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer
+ * @token: opaque token to match between producer and consumer (non-NULL)
  * @add_producer: Connect the IRQ consumer to an IRQ producer
  * @del_producer: Disconnect the IRQ consumer from an IRQ producer
  * @stop: Perform any quiesce operations necessary prior to add/del (optional)
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index 09a03b5a21ff..52abac4bb6a2 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -89,6 +89,9 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer)
 	struct irq_bypass_producer *tmp;
 	struct irq_bypass_consumer *consumer;
 
+	if (!producer->token)
+		return -EINVAL;
+
 	might_sleep();
 
 	if (!try_module_get(THIS_MODULE))
@@ -136,6 +139,9 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 	struct irq_bypass_producer *tmp;
 	struct irq_bypass_consumer *consumer;
 
+	if (!producer->token)
+		return;
+
 	might_sleep();
 
 	if (!try_module_get(THIS_MODULE))
@@ -177,7 +183,8 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
 	struct irq_bypass_consumer *tmp;
 	struct irq_bypass_producer *producer;
 
-	if (!consumer->add_producer || !consumer->del_producer)
+	if (!consumer->token ||
+	    !consumer->add_producer || !consumer->del_producer)
 		return -EINVAL;
 
 	might_sleep();
@@ -227,6 +234,9 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 	struct irq_bypass_consumer *tmp;
 	struct irq_bypass_producer *producer;
 
+	if (!consumer->token)
+		return;
+
 	might_sleep();
 
 	if (!try_module_get(THIS_MODULE))
-- 
cgit v1.2.3


From 14717e2031862d9aa2512b24a7df42cf68a977ec Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 5 May 2016 11:58:35 -0600
Subject: kvm: Conditionally register IRQ bypass consumer

If we don't support a mechanism for bypassing IRQs, don't register as
a consumer.  This eliminates meaningless dev_info()s when the connect
fails between producer and consumer, such as on AMD systems where
kvm_x86_ops->update_pi_irte is not implemented

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c       | 19 ++++++++-----------
 include/linux/kvm_host.h |  1 +
 virt/kvm/eventfd.c       | 18 ++++++++++--------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 57a62d1e2f5d..6c774cdf553c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8358,19 +8358,21 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 
+bool kvm_arch_has_irq_bypass(void)
+{
+	return kvm_x86_ops->update_pi_irte != NULL;
+}
+
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 				      struct irq_bypass_producer *prod)
 {
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 
-	if (kvm_x86_ops->update_pi_irte) {
-		irqfd->producer = prod;
-		return kvm_x86_ops->update_pi_irte(irqfd->kvm,
-				prod->irq, irqfd->gsi, 1);
-	}
+	irqfd->producer = prod;
 
-	return -EINVAL;
+	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+					   prod->irq, irqfd->gsi, 1);
 }
 
 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
@@ -8380,11 +8382,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 
-	if (!kvm_x86_ops->update_pi_irte) {
-		WARN_ON(irqfd->producer != NULL);
-		return;
-	}
-
 	WARN_ON(irqfd->producer != prod);
 	irqfd->producer = NULL;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 352889d6e322..92a0229044fb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1185,6 +1185,7 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+bool kvm_arch_has_irq_bypass(void);
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 46dbc0a7dfc1..e469b6012471 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -408,15 +408,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 */
 	fdput(f);
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
-	irqfd->consumer.token = (void *)irqfd->eventfd;
-	irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
-	irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
-	irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
-	irqfd->consumer.start = kvm_arch_irq_bypass_start;
-	ret = irq_bypass_register_consumer(&irqfd->consumer);
-	if (ret)
-		pr_info("irq bypass consumer (token %p) registration fails: %d\n",
+	if (kvm_arch_has_irq_bypass()) {
+		irqfd->consumer.token = (void *)irqfd->eventfd;
+		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
+		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
+		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
+		irqfd->consumer.start = kvm_arch_irq_bypass_start;
+		ret = irq_bypass_register_consumer(&irqfd->consumer);
+		if (ret)
+			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
 				irqfd->consumer.token, ret);
+	}
 #endif
 
 	return 0;
-- 
cgit v1.2.3


From 74b20582ac389ee9f18a6fcc0eef244658ce8de0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 10 May 2016 11:19:50 -0700
Subject: net: l3mdev: Add hook in ip and ipv6

Currently the VRF driver uses the rx_handler to switch the skb device
to the VRF device. Switching the dev prior to the ip / ipv6 layer
means the VRF driver has to duplicate IP/IPv6 processing which adds
overhead and makes features such as retaining the ingress device index
more complicated than necessary.

This patch moves the hook to the L3 layer just after the first NF_HOOK
for PRE_ROUTING. This location makes exposing the original ingress device
trivial (next patch) and allows adding other NF_HOOKs to the VRF driver
in the future.

dev_queue_xmit_nit is exported so that the VRF driver can cycle the skb
with the switched device through the packet taps to maintain current
behavior (tcpdump can be used on either the vrf device or the enslaved
devices).

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c         | 189 ++++++++++++++++++++++------------------------
 include/linux/ipv6.h      |  17 ++++-
 include/linux/netdevice.h |   2 +
 include/net/l3mdev.h      |  42 +++++++++++
 include/net/tcp.h         |   4 +-
 net/core/dev.c            |   3 +-
 net/ipv4/ip_input.c       |   7 ++
 net/ipv6/ip6_input.c      |   7 ++
 8 files changed, 170 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index c8db55aa8280..0ea29345eb2e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -42,9 +42,6 @@
 #define DRV_NAME	"vrf"
 #define DRV_VERSION	"1.0"
 
-#define vrf_master_get_rcu(dev) \
-	((struct net_device *)rcu_dereference(dev->rx_handler_data))
-
 struct net_vrf {
 	struct rtable           *rth;
 	struct rt6_info		*rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-/* neighbor handling is done with actual device; do not want
- * to flip skb->dev for those ndisc packets. This really fails
- * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
- * a start.
- */
-#if IS_ENABLED(CONFIG_IPV6)
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	const struct ipv6hdr *ipv6h;
-	struct ipv6hdr _ipv6h;
-	bool rc = true;
-
-	ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
-	if (!ipv6h)
-		goto out;
-
-	if (ipv6h->nexthdr == NEXTHDR_ICMP) {
-		const struct icmp6hdr *icmph;
-		struct icmp6hdr _icmph;
-
-		icmph = skb_header_pointer(skb, sizeof(_ipv6h),
-					   sizeof(_icmph), &_icmph);
-		if (!icmph)
-			goto out;
-
-		switch (icmph->icmp6_type) {
-		case NDISC_ROUTER_SOLICITATION:
-		case NDISC_ROUTER_ADVERTISEMENT:
-		case NDISC_NEIGHBOUR_SOLICITATION:
-		case NDISC_NEIGHBOUR_ADVERTISEMENT:
-		case NDISC_REDIRECT:
-			rc = false;
-			break;
-		}
-	}
-
-out:
-	return rc;
-}
-#else
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	return false;
-}
-#endif
-
-static bool is_ip_rx_frame(struct sk_buff *skb)
-{
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		return true;
-	case htons(ETH_P_IPV6):
-		return check_ipv6_frame(skb);
-	}
-	return false;
-}
-
 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 {
 	vrf_dev->stats.tx_errors++;
 	kfree_skb(skb);
 }
 
-/* note: already called with rcu_read_lock */
-static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
-{
-	struct sk_buff *skb = *pskb;
-
-	if (is_ip_rx_frame(skb)) {
-		struct net_device *dev = vrf_master_get_rcu(skb->dev);
-		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
-
-		u64_stats_update_begin(&dstats->syncp);
-		dstats->rx_pkts++;
-		dstats->rx_bytes += skb->len;
-		u64_stats_update_end(&dstats->syncp);
-
-		skb->dev = dev;
-
-		return RX_HANDLER_ANOTHER;
-	}
-	return RX_HANDLER_PASS;
-}
-
 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 						 struct rtnl_link_stats64 *stats)
 {
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 {
 	int ret;
 
-	/* register the packet handler for slave ports */
-	ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
-	if (ret) {
-		netdev_err(port_dev,
-			   "Device %s failed to register rx_handler\n",
-			   port_dev->name);
-		goto out_fail;
-	}
-
 	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
 	if (ret < 0)
-		goto out_unregister;
+		return ret;
 
 	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
 	cycle_netdev(port_dev);
 
 	return 0;
-
-out_unregister:
-	netdev_rx_handler_unregister(port_dev);
-out_fail:
-	return ret;
 }
 
 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
 	netdev_upper_dev_unlink(port_dev, dev);
 	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
 
-	netdev_rx_handler_unregister(port_dev);
-
 	cycle_netdev(port_dev);
 
 	return 0;
@@ -669,6 +572,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 	return rc;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+static bool ipv6_ndisc_frame(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	bool rc = false;
+
+	if (iph->nexthdr == NEXTHDR_ICMP) {
+		const struct icmp6hdr *icmph;
+		struct icmp6hdr _icmph;
+
+		icmph = skb_header_pointer(skb, sizeof(*iph),
+					   sizeof(_icmph), &_icmph);
+		if (!icmph)
+			goto out;
+
+		switch (icmph->icmp6_type) {
+		case NDISC_ROUTER_SOLICITATION:
+		case NDISC_ROUTER_ADVERTISEMENT:
+		case NDISC_NEIGHBOUR_SOLICITATION:
+		case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		case NDISC_REDIRECT:
+			rc = true;
+			break;
+		}
+	}
+
+out:
+	return rc;
+}
+
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	/* if packet is NDISC keep the ingress interface */
+	if (!ipv6_ndisc_frame(skb)) {
+		skb->dev = vrf_dev;
+		skb->skb_iif = vrf_dev->ifindex;
+
+		skb_push(skb, skb->mac_len);
+		dev_queue_xmit_nit(skb, vrf_dev);
+		skb_pull(skb, skb->mac_len);
+
+		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
+	}
+
+	return skb;
+}
+
+#else
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	return skb;
+}
+#endif
+
+static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb)
+{
+	skb->dev = vrf_dev;
+	skb->skb_iif = vrf_dev->ifindex;
+
+	skb_push(skb, skb->mac_len);
+	dev_queue_xmit_nit(skb, vrf_dev);
+	skb_pull(skb, skb->mac_len);
+
+	return skb;
+}
+
+/* called with rcu lock held */
+static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb,
+				  u16 proto)
+{
+	switch (proto) {
+	case AF_INET:
+		return vrf_ip_rcv(vrf_dev, skb);
+	case AF_INET6:
+		return vrf_ip6_rcv(vrf_dev, skb);
+	}
+
+	return skb;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 					 const struct flowi6 *fl6)
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
 	.l3mdev_fib_table	= vrf_fib_table,
 	.l3mdev_get_rtable	= vrf_get_rtable,
 	.l3mdev_get_saddr	= vrf_get_saddr,
+	.l3mdev_l3_rcv		= vrf_l3_rcv,
 #if IS_ENABLED(CONFIG_IPV6)
 	.l3mdev_get_rt6_dst	= vrf_get_rt6_dst,
 #endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 58d6e158755f..5c91b0b055d4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
 #define IP6SKB_ROUTERALERT	8
 #define IP6SKB_FRAGMENTED      16
 #define IP6SKB_HOPBYHOP        32
+#define IP6SKB_L3SLAVE         64
 };
 
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return flags & IP6SKB_L3SLAVE;
+}
+#else
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return false;
+}
+#endif
+
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
 #define IP6CBMTU(skb)	((struct ip6_mtuinfo *)((skb)->cb))
 
 static inline int inet6_iif(const struct sk_buff *skb)
 {
-	return IP6CB(skb)->iif;
+	bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+
+	return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
 struct tcp6_request_sock {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 63580e6d0df4..c2f5112f08f7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);
 
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+
 extern int		netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 78872bd1dc2c..374388dc01c8 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -25,6 +25,8 @@
 
 struct l3mdev_ops {
 	u32		(*l3mdev_fib_table)(const struct net_device *dev);
+	struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
+					  struct sk_buff *skb, u16 proto);
 
 	/* IPv4 ops */
 	struct rtable *	(*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
 
 struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
 
+static inline
+struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
+{
+	struct net_device *master = NULL;
+
+	if (netif_is_l3_slave(skb->dev))
+		master = netdev_master_upper_dev_get_rcu(skb->dev);
+	else if (netif_is_l3_master(skb->dev))
+		master = skb->dev;
+
+	if (master && master->l3mdev_ops->l3mdev_l3_rcv)
+		skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
+
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET6);
+}
+
 #else
 
 static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
 {
 	return NULL;
 }
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
 #endif
 
 #endif /* _NET_L3MDEV_H_ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c9ab561387c4..0bcc70f4e1fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
  */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
 {
-	return TCP_SKB_CB(skb)->header.h6.iif;
+	bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
+
+	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
 }
 #endif
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c7490339315c..12436d1312ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  *	taps currently in use.
  */
 
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 	struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 
 /**
  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 751c0658e194..37375eedeef9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ed56012005d..f185cbcda114 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip6_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
 		const struct inet6_protocol *ipprot;
 
-- 
cgit v1.2.3


From 7219ab34f184b5d864be38f5ada7cdff1ab5b18a Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 11 May 2016 00:29:14 +0300
Subject: net/mlx5e: CQE compression

CQE compression feature is meant to save PCIe bandwidth by
compressing few CQEs into smaller amount of bytes on PCIe.
CQE compression can be selectively enabled per CQ.  By default
is disabled for now and will be enabled later on.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  10 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_clock.c |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 151 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   8 ++
 include/linux/mlx5/device.h                        |  34 +++++
 6 files changed, 211 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index bfa5daaaf5aa..19f0d8db27ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -157,6 +157,8 @@ struct mlx5e_params {
 	u8  log_rq_size;
 	u16 num_channels;
 	u8  num_tc;
+	bool rx_cqe_compress_admin;
+	bool rx_cqe_compress;
 	u16 rx_cq_moderation_usec;
 	u16 rx_cq_moderation_pkts;
 	u16 tx_cq_moderation_usec;
@@ -202,6 +204,13 @@ struct mlx5e_cq {
 	struct mlx5e_channel      *channel;
 	struct mlx5e_priv         *priv;
 
+	/* cqe decompression */
+	struct mlx5_cqe64          title;
+	struct mlx5_mini_cqe8      mini_arr[MLX5_MINI_CQE_ARRAY_SIZE];
+	u8                         mini_arr_idx;
+	u16                        decmprs_left;
+	u16                        decmprs_wqe_counter;
+
 	/* control */
 	struct mlx5_wq_ctrl        wq_ctrl;
 } ____cacheline_aligned_in_smp;
@@ -616,6 +625,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv);
 void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv);
 int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr);
 int mlx5e_hwstamp_get(struct net_device *dev, struct ifreq *ifr);
+void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val);
 
 int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
 			  u16 vid);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
index 2018eebe1531..847a8f3ac2b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
@@ -93,6 +93,8 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
 	/* RX HW timestamp */
 	switch (config.rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
+		/* Reset CQE compression to Admin default */
+		mlx5e_modify_rx_cqe_compression(priv, priv->params.rx_cqe_compress_admin);
 		break;
 	case HWTSTAMP_FILTER_ALL:
 	case HWTSTAMP_FILTER_SOME:
@@ -108,6 +110,8 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		/* Disable CQE compression */
+		mlx5e_modify_rx_cqe_compression(priv, false);
 		config.rx_filter = HWTSTAMP_FILTER_ALL;
 		break;
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1c70e518b5c5..0ea4c03a7946 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -114,6 +114,8 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 		s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
 		s->rx_mpwqe_frag   += rq_stats->mpwqe_frag;
 		s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
+		s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
+		s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
 
 		for (j = 0; j < priv->params.num_tc; j++) {
 			sq_stats = &priv->channel[i]->sq[j].stats;
@@ -1204,6 +1206,10 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 	}
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
+	if (priv->params.rx_cqe_compress) {
+		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_CSUM);
+		MLX5_SET(cqc, cqc, cqe_comp_en, 1);
+	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 23adfe2fcba9..c8b8d456268f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -42,6 +42,143 @@ static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp)
 	return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL;
 }
 
+static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc,
+				       void *data)
+{
+	u32 ci = cqcc & cq->wq.sz_m1;
+
+	memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64));
+}
+
+static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq,
+					 struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, &cq->title);
+	cq->decmprs_left        = be32_to_cpu(cq->title.byte_cnt);
+	cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter);
+	rq->stats.cqe_compress_blks++;
+}
+
+static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, cq->mini_arr);
+	cq->mini_arr_idx = 0;
+}
+
+static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n)
+{
+	u8 op_own = (cqcc >> cq->wq.log_sz) & 1;
+	u32 wq_sz = 1 << cq->wq.log_sz;
+	u32 ci = cqcc & cq->wq.sz_m1;
+	u32 ci_top = min_t(u32, wq_sz, ci + n);
+
+	for (; ci < ci_top; ci++, n--) {
+		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+		cqe->op_own = op_own;
+	}
+
+	if (unlikely(ci == wq_sz)) {
+		op_own = !op_own;
+		for (ci = 0; ci < n; ci++) {
+			struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+			cqe->op_own = op_own;
+		}
+	}
+}
+
+static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
+					struct mlx5e_cq *cq, u32 cqcc)
+{
+	u16 wqe_cnt_step;
+
+	cq->title.byte_cnt     = cq->mini_arr[cq->mini_arr_idx].byte_cnt;
+	cq->title.check_sum    = cq->mini_arr[cq->mini_arr_idx].checksum;
+	cq->title.op_own      &= 0xf0;
+	cq->title.op_own      |= 0x01 & (cqcc >> cq->wq.log_sz);
+	cq->title.wqe_counter  = cpu_to_be16(cq->decmprs_wqe_counter);
+
+	wqe_cnt_step =
+		rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
+		mpwrq_get_cqe_consumed_strides(&cq->title) : 1;
+	cq->decmprs_wqe_counter =
+		(cq->decmprs_wqe_counter + wqe_cnt_step) & rq->wq.sz_m1;
+}
+
+static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
+						struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_decompress_cqe(rq, cq, cqcc);
+	cq->title.rss_hash_type   = 0;
+	cq->title.rss_hash_result = 0;
+}
+
+static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
+					     struct mlx5e_cq *cq,
+					     int update_owner_only,
+					     int budget_rem)
+{
+	u32 cqcc = cq->wq.cc + update_owner_only;
+	u32 cqe_count;
+	u32 i;
+
+	cqe_count = min_t(u32, cq->decmprs_left, budget_rem);
+
+	for (i = update_owner_only; i < cqe_count;
+	     i++, cq->mini_arr_idx++, cqcc++) {
+		if (unlikely(cq->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE))
+			mlx5e_read_mini_arr_slot(cq, cqcc);
+
+		mlx5e_decompress_cqe_no_hash(rq, cq, cqcc);
+		rq->handle_rx_cqe(rq, &cq->title);
+	}
+	mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
+	cq->wq.cc = cqcc;
+	cq->decmprs_left -= cqe_count;
+	rq->stats.cqe_compress_pkts += cqe_count;
+
+	return cqe_count;
+}
+
+static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
+					      struct mlx5e_cq *cq,
+					      int budget_rem)
+{
+	mlx5e_read_title_slot(rq, cq, cq->wq.cc);
+	mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1);
+	mlx5e_decompress_cqe(rq, cq, cq->wq.cc);
+	rq->handle_rx_cqe(rq, &cq->title);
+	cq->mini_arr_idx++;
+
+	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
+}
+
+void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val)
+{
+	bool was_opened;
+
+	if (!MLX5_CAP_GEN(priv->mdev, cqe_compression))
+		return;
+
+	mutex_lock(&priv->state_lock);
+
+	if (priv->params.rx_cqe_compress == val)
+		goto unlock;
+
+	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	if (was_opened)
+		mlx5e_close_locked(priv->netdev);
+
+	priv->params.rx_cqe_compress = val;
+
+	if (was_opened)
+		mlx5e_open_locked(priv->netdev);
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+}
+
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
 	struct sk_buff *skb;
@@ -738,14 +875,24 @@ mpwrq_cqe_out:
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
-	int work_done;
+	int work_done = 0;
 
-	for (work_done = 0; work_done < budget; work_done++) {
+	if (cq->decmprs_left)
+		work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
+
+	for (; work_done < budget; work_done++) {
 		struct mlx5_cqe64 *cqe = mlx5e_get_cqe(cq);
 
 		if (!cqe)
 			break;
 
+		if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) {
+			work_done +=
+				mlx5e_decompress_cqes_start(rq, cq,
+							    budget - work_done);
+			continue;
+		}
+
 		mlx5_cqwq_pop(&cq->wq);
 
 		rq->handle_rx_cqe(rq, cqe);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 115752b53d85..83bc32b25849 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -72,6 +72,8 @@ struct mlx5e_sw_stats {
 	u64 rx_mpwqe_filler;
 	u64 rx_mpwqe_frag;
 	u64 rx_buff_alloc_err;
+	u64 rx_cqe_compress_blks;
+	u64 rx_cqe_compress_pkts;
 
 	/* Special handling counters */
 	u64 link_down_events;
@@ -101,6 +103,8 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events) },
 };
 
@@ -283,6 +287,8 @@ struct mlx5e_rq_stats {
 	u64 mpwqe_filler;
 	u64 mpwqe_frag;
 	u64 buff_alloc_err;
+	u64 cqe_compress_blks;
+	u64 cqe_compress_pkts;
 };
 
 static const struct counter_desc rq_stats_desc[] = {
@@ -297,6 +303,8 @@ static const struct counter_desc rq_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, mpwqe_frag) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
 };
 
 struct mlx5e_sq_stats {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index ee0d5a937f02..035abdf62cfe 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -685,6 +685,40 @@ struct mlx5_cqe64 {
 	u8		op_own;
 };
 
+struct mlx5_mini_cqe8 {
+	union {
+		__be32 rx_hash_result;
+		struct {
+			__be16 checksum;
+			__be16 rsvd;
+		};
+		struct {
+			__be16 wqe_counter;
+			u8  s_wqe_opcode;
+			u8  reserved;
+		} s_wqe_info;
+	};
+	__be32 byte_cnt;
+};
+
+enum {
+	MLX5_NO_INLINE_DATA,
+	MLX5_INLINE_DATA32_SEG,
+	MLX5_INLINE_DATA64_SEG,
+	MLX5_COMPRESSED,
+};
+
+enum {
+	MLX5_CQE_FORMAT_CSUM = 0x1,
+};
+
+#define MLX5_MINI_CQE_ARRAY_SIZE 8
+
+static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->op_own >> 2) & 0x3;
+}
+
 static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
 {
 	return (cqe->lro_tcppsh_abort_dupack >> 6) & 1;
-- 
cgit v1.2.3


From 37bff2b9c6addf6216c8d04e95be596678e8deff Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:13 +0300
Subject: qed: Add VF->PF channel infrastructure

Communication between VF and PF is based on a dedicated HW channel;
VF will prepare a messge, and by signaling the HW the PF would get a
notification of that message existance. The PF would then copy the
message, process it and DMA an answer back to the VF as a response.

The messages themselves are TLV-based - allowing easier backward/forward
compatibility.

This patch adds the infrastructure of the channel on the PF side -
starting with the arrival of the notification and ending with DMAing
the response back to the VF.

It also adds a dummy-response as reference, as it only lays the
groundwork of the communication; it doesn't really add support of any
actual messages.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h         |   6 +
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h |  27 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h     |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_hw.c      |  44 ++-
 drivers/net/ethernet/qlogic/qed/qed_main.c    |  10 +-
 drivers/net/ethernet/qlogic/qed/qed_spq.c     |  16 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c   | 385 ++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h   |  53 ++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h      |  52 ++++
 include/linux/qed/common_hsi.h                |   5 +
 10 files changed, 585 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 2e067c7e8f38..01f9b6c880bd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -378,6 +378,12 @@ struct qed_hwfn {
 
 	struct qed_simd_fp_handler	simd_proto_handler[64];
 
+#ifdef CONFIG_QED_SRIOV
+	struct workqueue_struct *iov_wq;
+	struct delayed_work iov_task;
+	unsigned long iov_task_flags;
+#endif
+
 	struct z_stream_s		*stream;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index 6aac3f855aa1..f567371fe304 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -182,11 +182,15 @@ enum qed_dmae_address_type_t {
  * used mostly to write a zeroed buffer to destination address
  * using DMA
  */
-#define QED_DMAE_FLAG_RW_REPL_SRC       0x00000001
-#define QED_DMAE_FLAG_COMPLETION_DST    0x00000008
+#define QED_DMAE_FLAG_RW_REPL_SRC	0x00000001
+#define QED_DMAE_FLAG_VF_SRC		0x00000002
+#define QED_DMAE_FLAG_VF_DST		0x00000004
+#define QED_DMAE_FLAG_COMPLETION_DST	0x00000008
 
 struct qed_dmae_params {
-	u32	flags; /* consists of QED_DMAE_FLAG_* values */
+	u32 flags; /* consists of QED_DMAE_FLAG_* values */
+	u8 src_vfid;
+	u8 dst_vfid;
 };
 
 /**
@@ -208,6 +212,23 @@ qed_dmae_host2grc(struct qed_hwfn *p_hwfn,
 		  u32 size_in_dwords,
 		  u32 flags);
 
+/**
+ * @brief qed_dmae_host2host - copy data from to source address
+ * to a destination adress (for SRIOV) using the given ptt
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param source_addr
+ * @param dest_addr
+ * @param size_in_dwords
+ * @param params
+ */
+int qed_dmae_host2host(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt,
+		       dma_addr_t source_addr,
+		       dma_addr_t dest_addr,
+		       u32 size_in_dwords, struct qed_dmae_params *p_params);
+
 /**
  * @brief qed_chain_alloc - Allocate and initialize a chain
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index c4fae71bed11..6ba197c107ed 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -31,7 +31,7 @@ enum common_event_opcode {
 	COMMON_EVENT_PF_STOP,
 	COMMON_EVENT_RESERVED,
 	COMMON_EVENT_RESERVED2,
-	COMMON_EVENT_RESERVED3,
+	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
 	COMMON_EVENT_RESERVED5,
 	COMMON_EVENT_RESERVED6,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index a8cf96c34bef..a9be5a422d2d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -355,8 +355,8 @@ static void qed_dmae_opcode(struct qed_hwfn *p_hwfn,
 			    const u8 is_dst_type_grc,
 			    struct qed_dmae_params *p_params)
 {
+	u16 opcode_b = 0;
 	u32 opcode = 0;
-	u16 opcodeB = 0;
 
 	/* Whether the source is the PCIe or the GRC.
 	 * 0- The source is the PCIe
@@ -398,14 +398,24 @@ static void qed_dmae_opcode(struct qed_hwfn *p_hwfn,
 	opcode |= (DMAE_CMD_DST_ADDR_RESET_MASK <<
 		   DMAE_CMD_DST_ADDR_RESET_SHIFT);
 
-	opcodeB |= (DMAE_CMD_SRC_VF_ID_MASK <<
-		    DMAE_CMD_SRC_VF_ID_SHIFT);
+	/* SRC/DST VFID: all 1's - pf, otherwise VF id */
+	if (p_params->flags & QED_DMAE_FLAG_VF_SRC) {
+		opcode |= 1 << DMAE_CMD_SRC_VF_ID_VALID_SHIFT;
+		opcode_b |= p_params->src_vfid << DMAE_CMD_SRC_VF_ID_SHIFT;
+	} else {
+		opcode_b |= DMAE_CMD_SRC_VF_ID_MASK <<
+			    DMAE_CMD_SRC_VF_ID_SHIFT;
+	}
 
-	opcodeB |= (DMAE_CMD_DST_VF_ID_MASK <<
-		    DMAE_CMD_DST_VF_ID_SHIFT);
+	if (p_params->flags & QED_DMAE_FLAG_VF_DST) {
+		opcode |= 1 << DMAE_CMD_DST_VF_ID_VALID_SHIFT;
+		opcode_b |= p_params->dst_vfid << DMAE_CMD_DST_VF_ID_SHIFT;
+	} else {
+		opcode_b |= DMAE_CMD_DST_VF_ID_MASK << DMAE_CMD_DST_VF_ID_SHIFT;
+	}
 
 	p_hwfn->dmae_info.p_dmae_cmd->opcode = cpu_to_le32(opcode);
-	p_hwfn->dmae_info.p_dmae_cmd->opcode_b = cpu_to_le16(opcodeB);
+	p_hwfn->dmae_info.p_dmae_cmd->opcode_b = cpu_to_le16(opcode_b);
 }
 
 u32 qed_dmae_idx_to_go_cmd(u8 idx)
@@ -753,6 +763,28 @@ int qed_dmae_host2grc(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+int
+qed_dmae_host2host(struct qed_hwfn *p_hwfn,
+		   struct qed_ptt *p_ptt,
+		   dma_addr_t source_addr,
+		   dma_addr_t dest_addr,
+		   u32 size_in_dwords, struct qed_dmae_params *p_params)
+{
+	int rc;
+
+	mutex_lock(&(p_hwfn->dmae_info.mutex));
+
+	rc = qed_dmae_execute_command(p_hwfn, p_ptt, source_addr,
+				      dest_addr,
+				      QED_DMAE_ADDRESS_HOST_PHYS,
+				      QED_DMAE_ADDRESS_HOST_PHYS,
+				      size_in_dwords, p_params);
+
+	mutex_unlock(&(p_hwfn->dmae_info.mutex));
+
+	return rc;
+}
+
 u16 qed_get_qm_pq(struct qed_hwfn *p_hwfn,
 		  enum protocol_type proto,
 		  union qed_qm_pq_params *p_params)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 1b758bdec587..c209ed49deae 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -24,6 +24,7 @@
 #include <linux/qed/qed_if.h>
 
 #include "qed.h"
+#include "qed_sriov.h"
 #include "qed_sp.h"
 #include "qed_dev_api.h"
 #include "qed_mcp.h"
@@ -749,7 +750,10 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	struct qed_mcp_drv_version drv_version;
 	const u8 *data = NULL;
 	struct qed_hwfn *hwfn;
-	int rc;
+	int rc = -EINVAL;
+
+	if (qed_iov_wq_start(cdev))
+		goto err;
 
 	rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
 			      &cdev->pdev->dev);
@@ -826,6 +830,8 @@ err1:
 err:
 	release_firmware(cdev->firmware);
 
+	qed_iov_wq_stop(cdev, false);
+
 	return rc;
 }
 
@@ -842,6 +848,8 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	qed_disable_msix(cdev);
 	qed_nic_reset(cdev);
 
+	qed_iov_wq_stop(cdev, true);
+
 	release_firmware(cdev->firmware);
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index 89469d5aae25..0e439e46fbe9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -27,6 +27,7 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 /***************************************************************************
 * Structures & Definitions
@@ -242,10 +243,17 @@ static int
 qed_async_event_completion(struct qed_hwfn *p_hwfn,
 			   struct event_ring_entry *p_eqe)
 {
-	DP_NOTICE(p_hwfn,
-		  "Unknown Async completion for protocol: %d\n",
-		   p_eqe->protocol_id);
-	return -EINVAL;
+	switch (p_eqe->protocol_id) {
+	case PROTOCOLID_COMMON:
+		return qed_sriov_eqe_event(p_hwfn,
+					   p_eqe->opcode,
+					   p_eqe->echo, &p_eqe->data);
+	default:
+		DP_NOTICE(p_hwfn,
+			  "Unknown Async completion for protocol: %d\n",
+			  p_eqe->protocol_id);
+		return -EINVAL;
+	}
 }
 
 /***************************************************************************
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 685e3fa7bc52..4a6af4264141 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -31,6 +31,26 @@ bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 	return true;
 }
 
+static struct qed_vf_info *qed_iov_get_vf_info(struct qed_hwfn *p_hwfn,
+					       u16 relative_vf_id,
+					       bool b_enabled_only)
+{
+	struct qed_vf_info *vf = NULL;
+
+	if (!p_hwfn->pf_iov_info) {
+		DP_NOTICE(p_hwfn->cdev, "No iov info\n");
+		return NULL;
+	}
+
+	if (qed_iov_is_valid_vfid(p_hwfn, relative_vf_id, b_enabled_only))
+		vf = &p_hwfn->pf_iov_info->vfs_array[relative_vf_id];
+	else
+		DP_ERR(p_hwfn, "qed_iov_get_vf_info: VF[%d] is not enabled\n",
+		       relative_vf_id);
+
+	return vf;
+}
+
 static int qed_iov_pci_cfg_info(struct qed_dev *cdev)
 {
 	struct qed_hw_sriov_info *iov = cdev->p_iov_info;
@@ -349,6 +369,232 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 	return 0;
 }
 
+static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
+{
+	/* Check PF supports sriov */
+	if (!IS_QED_SRIOV(p_hwfn->cdev) || !IS_PF_SRIOV_ALLOC(p_hwfn))
+		return false;
+
+	/* Check VF validity */
+	if (!qed_iov_is_valid_vfid(p_hwfn, vfid, true))
+		return false;
+
+	return true;
+}
+
+static bool qed_iov_tlv_supported(u16 tlvtype)
+{
+	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
+}
+
+/* place a given tlv on the tlv buffer, continuing current tlv list */
+void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length)
+{
+	struct channel_tlv *tl = (struct channel_tlv *)*offset;
+
+	tl->type = type;
+	tl->length = length;
+
+	/* Offset should keep pointing to next TLV (the end of the last) */
+	*offset += length;
+
+	/* Return a pointer to the start of the added tlv */
+	return *offset - length;
+}
+
+/* list the types and lengths of the tlvs on the buffer */
+void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list)
+{
+	u16 i = 1, total_length = 0;
+	struct channel_tlv *tlv;
+
+	do {
+		tlv = (struct channel_tlv *)((u8 *)tlvs_list + total_length);
+
+		/* output tlv */
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "TLV number %d: type %d, length %d\n",
+			   i, tlv->type, tlv->length);
+
+		if (tlv->type == CHANNEL_TLV_LIST_END)
+			return;
+
+		/* Validate entry - protect against malicious VFs */
+		if (!tlv->length) {
+			DP_NOTICE(p_hwfn, "TLV of length 0 found\n");
+			return;
+		}
+
+		total_length += tlv->length;
+
+		if (total_length >= sizeof(struct tlv_buffer_size)) {
+			DP_NOTICE(p_hwfn, "TLV ==> Buffer overflow\n");
+			return;
+		}
+
+		i++;
+	} while (1);
+}
+
+static void qed_iov_send_response(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_vf_info *p_vf,
+				  u16 length, u8 status)
+{
+	struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx;
+	struct qed_dmae_params params;
+	u8 eng_vf_id;
+
+	mbx->reply_virt->default_resp.hdr.status = status;
+
+	qed_dp_tlv_list(p_hwfn, mbx->reply_virt);
+
+	eng_vf_id = p_vf->abs_vf_id;
+
+	memset(&params, 0, sizeof(struct qed_dmae_params));
+	params.flags = QED_DMAE_FLAG_VF_DST;
+	params.dst_vfid = eng_vf_id;
+
+	qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys + sizeof(u64),
+			   mbx->req_virt->first_tlv.reply_address +
+			   sizeof(u64),
+			   (sizeof(union pfvf_tlvs) - sizeof(u64)) / 4,
+			   &params);
+
+	qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys,
+			   mbx->req_virt->first_tlv.reply_address,
+			   sizeof(u64) / 4, &params);
+
+	REG_WR(p_hwfn,
+	       GTT_BAR0_MAP_REG_USDM_RAM +
+	       USTORM_VF_PF_CHANNEL_READY_OFFSET(eng_vf_id), 1);
+}
+
+static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 struct qed_vf_info *vf_info,
+				 u16 type, u16 length, u8 status)
+{
+	struct qed_iov_vf_mbx *mbx = &vf_info->vf_mbx;
+
+	mbx->offset = (u8 *)mbx->reply_virt;
+
+	qed_add_tlv(p_hwfn, &mbx->offset, type, length);
+	qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END,
+		    sizeof(struct channel_list_end_tlv));
+
+	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
+}
+
+static void qed_iov_process_mbx_dummy_resp(struct qed_hwfn *p_hwfn,
+					   struct qed_ptt *p_ptt,
+					   struct qed_vf_info *p_vf)
+{
+	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_NONE,
+			     sizeof(struct pfvf_def_resp_tlv),
+			     PFVF_STATUS_SUCCESS);
+}
+
+static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt, int vfid)
+{
+	struct qed_iov_vf_mbx *mbx;
+	struct qed_vf_info *p_vf;
+	int i;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf)
+		return;
+
+	mbx = &p_vf->vf_mbx;
+
+	/* qed_iov_process_mbx_request */
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "qed_iov_process_mbx_req vfid %d\n", p_vf->abs_vf_id);
+
+	mbx->first_tlv = mbx->req_virt->first_tlv;
+
+	/* check if tlv type is known */
+	if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
+		qed_iov_process_mbx_dummy_resp(p_hwfn, p_ptt, p_vf);
+	} else {
+		/* unknown TLV - this may belong to a VF driver from the future
+		 * - a version written after this PF driver was written, which
+		 * supports features unknown as of yet. Too bad since we don't
+		 * support them. Or this may be because someone wrote a crappy
+		 * VF driver and is sending garbage over the channel.
+		 */
+		DP_ERR(p_hwfn,
+		       "unknown TLV. type %d length %d. first 20 bytes of mailbox buffer:\n",
+		       mbx->first_tlv.tl.type, mbx->first_tlv.tl.length);
+
+		for (i = 0; i < 20; i++) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "%x ",
+				   mbx->req_virt->tlv_buf_size.tlv_buffer[i]);
+		}
+	}
+}
+
+void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid)
+{
+	u64 add_bit = 1ULL << (vfid % 64);
+
+	p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit;
+}
+
+static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn,
+						    u64 *events)
+{
+	u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events;
+
+	memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+	memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH);
+}
+
+static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn,
+			      u16 abs_vfid, struct regpair *vf_msg)
+{
+	u8 min = (u8)p_hwfn->cdev->p_iov_info->first_vf_in_pf;
+	struct qed_vf_info *p_vf;
+
+	if (!qed_iov_pf_sanity_check(p_hwfn, (int)abs_vfid - min)) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV,
+			   "Got a message from VF [abs 0x%08x] that cannot be handled by PF\n",
+			   abs_vfid);
+		return 0;
+	}
+	p_vf = &p_hwfn->pf_iov_info->vfs_array[(u8)abs_vfid - min];
+
+	/* List the physical address of the request so that handler
+	 * could later on copy the message from it.
+	 */
+	p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo;
+
+	/* Mark the event and schedule the workqueue */
+	qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id);
+	qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG);
+
+	return 0;
+}
+
+int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+			u8 opcode, __le16 echo, union event_ring_data *data)
+{
+	switch (opcode) {
+	case COMMON_EVENT_VF_PF_CHANNEL:
+		return qed_sriov_vfpf_msg(p_hwfn, le16_to_cpu(echo),
+					  &data->vf_pf_channel.msg_addr);
+	default:
+		DP_INFO(p_hwfn->cdev, "Unknown sriov eqe event 0x%02x\n",
+			opcode);
+		return -EINVAL;
+	}
+}
+
 u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 {
 	struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
@@ -364,3 +610,142 @@ u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 out:
 	return MAX_NUM_VFS;
 }
+
+static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
+			       int vfid)
+{
+	struct qed_dmae_params params;
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return -EINVAL;
+
+	memset(&params, 0, sizeof(struct qed_dmae_params));
+	params.flags = QED_DMAE_FLAG_VF_SRC | QED_DMAE_FLAG_COMPLETION_DST;
+	params.src_vfid = vf_info->abs_vf_id;
+
+	if (qed_dmae_host2host(p_hwfn, ptt,
+			       vf_info->vf_mbx.pending_req,
+			       vf_info->vf_mbx.req_phys,
+			       sizeof(union vfpf_tlvs) / 4, &params)) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Failed to copy message from VF 0x%02x\n", vfid);
+
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * qed_schedule_iov - schedules IOV task for VF and PF
+ * @hwfn: hardware function pointer
+ * @flag: IOV flag for VF/PF
+ */
+void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag)
+{
+	smp_mb__before_atomic();
+	set_bit(flag, &hwfn->iov_task_flags);
+	smp_mb__after_atomic();
+	DP_VERBOSE(hwfn, QED_MSG_IOV, "Scheduling iov task [Flag: %d]\n", flag);
+	queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, 0);
+}
+
+static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
+{
+	u64 events[QED_VF_ARRAY_LENGTH];
+	struct qed_ptt *ptt;
+	int i;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt) {
+		DP_VERBOSE(hwfn, QED_MSG_IOV,
+			   "Can't acquire PTT; re-scheduling\n");
+		qed_schedule_iov(hwfn, QED_IOV_WQ_MSG_FLAG);
+		return;
+	}
+
+	qed_iov_pf_get_and_clear_pending_events(hwfn, events);
+
+	DP_VERBOSE(hwfn, QED_MSG_IOV,
+		   "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n",
+		   events[0], events[1], events[2]);
+
+	qed_for_each_vf(hwfn, i) {
+		/* Skip VFs with no pending messages */
+		if (!(events[i / 64] & (1ULL << (i % 64))))
+			continue;
+
+		DP_VERBOSE(hwfn, QED_MSG_IOV,
+			   "Handling VF message from VF 0x%02x [Abs 0x%02x]\n",
+			   i, hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+
+		/* Copy VF's message to PF's request buffer for that VF */
+		if (qed_iov_copy_vf_msg(hwfn, ptt, i))
+			continue;
+
+		qed_iov_process_mbx_req(hwfn, ptt, i);
+	}
+
+	qed_ptt_release(hwfn, ptt);
+}
+
+void qed_iov_pf_task(struct work_struct *work)
+{
+	struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn,
+					     iov_task.work);
+
+	if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags))
+		return;
+
+	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
+		qed_handle_vf_msg(hwfn);
+}
+
+void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
+{
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		if (!cdev->hwfns[i].iov_wq)
+			continue;
+
+		if (schedule_first) {
+			qed_schedule_iov(&cdev->hwfns[i],
+					 QED_IOV_WQ_STOP_WQ_FLAG);
+			cancel_delayed_work_sync(&cdev->hwfns[i].iov_task);
+		}
+
+		flush_workqueue(cdev->hwfns[i].iov_wq);
+		destroy_workqueue(cdev->hwfns[i].iov_wq);
+	}
+}
+
+int qed_iov_wq_start(struct qed_dev *cdev)
+{
+	char name[NAME_SIZE];
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		/* PFs needs a dedicated workqueue only if they support IOV. */
+		if (!IS_PF_SRIOV(p_hwfn))
+			continue;
+
+		snprintf(name, NAME_SIZE, "iov-%02x:%02x.%02x",
+			 cdev->pdev->bus->number,
+			 PCI_SLOT(cdev->pdev->devfn), p_hwfn->abs_pf_id);
+
+		p_hwfn->iov_wq = create_singlethread_workqueue(name);
+		if (!p_hwfn->iov_wq) {
+			DP_NOTICE(p_hwfn, "Cannot create iov workqueue\n");
+			return -ENOMEM;
+		}
+
+		INIT_DELAYED_WORK(&p_hwfn->iov_task, qed_iov_pf_task);
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 0ac561916e2c..112216812a12 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -50,6 +50,14 @@ struct qed_iov_vf_mbx {
 	dma_addr_t req_phys;
 	union pfvf_tlvs *reply_virt;
 	dma_addr_t reply_phys;
+
+	/* Address in VF where a pending message is located */
+	dma_addr_t pending_req;
+
+	u8 *offset;
+
+	/* saved VF request header */
+	struct vfpf_first_tlv first_tlv;
 };
 
 enum vf_state {
@@ -96,6 +104,14 @@ struct qed_pf_iov {
 	u32 bulletins_size;
 };
 
+enum qed_iov_wq_flag {
+	QED_IOV_WQ_MSG_FLAG,
+	QED_IOV_WQ_SET_UNICAST_FILTER_FLAG,
+	QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
+	QED_IOV_WQ_STOP_WQ_FLAG,
+	QED_IOV_WQ_FLR_FLAG,
+};
+
 #ifdef CONFIG_QED_SRIOV
 /**
  * @brief - Given a VF index, return index of next [including that] active VF.
@@ -147,6 +163,22 @@ void qed_iov_free(struct qed_hwfn *p_hwfn);
  * @param cdev
  */
 void qed_iov_free_hw_info(struct qed_dev *cdev);
+
+/**
+ * @brief qed_sriov_eqe_event - handle async sriov event arrived on eqe.
+ *
+ * @param p_hwfn
+ * @param opcode
+ * @param echo
+ * @param data
+ */
+int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+			u8 opcode, __le16 echo, union event_ring_data *data);
+
+void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
+int qed_iov_wq_start(struct qed_dev *cdev);
+
+void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -175,6 +207,27 @@ static inline void qed_iov_free(struct qed_hwfn *p_hwfn)
 static inline void qed_iov_free_hw_info(struct qed_dev *cdev)
 {
 }
+
+static inline int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
+				      u8 opcode,
+				      __le16 echo, union event_ring_data *data)
+{
+	return -EINVAL;
+}
+
+static inline void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
+{
+}
+
+static inline int qed_iov_wq_start(struct qed_dev *cdev)
+{
+	return 0;
+}
+
+static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
+				    enum qed_iov_wq_flag flag)
+{
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 52cfa4889d88..f0d8de2be581 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -9,16 +9,62 @@
 #ifndef _QED_VF_H
 #define _QED_VF_H
 
+enum {
+	PFVF_STATUS_WAITING,
+	PFVF_STATUS_SUCCESS,
+	PFVF_STATUS_FAILURE,
+	PFVF_STATUS_NOT_SUPPORTED,
+	PFVF_STATUS_NO_RESOURCE,
+	PFVF_STATUS_FORCED,
+};
+
+/* vf pf channel tlvs */
+/* general tlv header (used for both vf->pf request and pf->vf response) */
+struct channel_tlv {
+	u16 type;
+	u16 length;
+};
+
+/* header of first vf->pf tlv carries the offset used to calculate reponse
+ * buffer address
+ */
+struct vfpf_first_tlv {
+	struct channel_tlv tl;
+	u32 padding;
+	u64 reply_address;
+};
+
+/* header of pf->vf tlvs, carries the status of handling the request */
+struct pfvf_tlv {
+	struct channel_tlv tl;
+	u8 status;
+	u8 padding[3];
+};
+
+/* response tlv used for most tlvs */
+struct pfvf_def_resp_tlv {
+	struct pfvf_tlv hdr;
+};
+
+/* used to terminate and pad a tlv list */
+struct channel_list_end_tlv {
+	struct channel_tlv tl;
+	u8 padding[4];
+};
+
 #define TLV_BUFFER_SIZE                 1024
 struct tlv_buffer_size {
 	u8 tlv_buffer[TLV_BUFFER_SIZE];
 };
 
 union vfpf_tlvs {
+	struct vfpf_first_tlv first_tlv;
+	struct channel_list_end_tlv list_end;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
 union pfvf_tlvs {
+	struct pfvf_def_resp_tlv default_resp;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
@@ -38,4 +84,10 @@ struct qed_bulletin {
 	u32 size;
 };
 
+enum {
+	CHANNEL_TLV_NONE,	/* ends tlv sequence */
+	CHANNEL_TLV_LIST_END,
+	CHANNEL_TLV_MAX
+};
+
 #endif
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 53ecb37ae563..8914d271ba73 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -327,9 +327,14 @@ struct regpair {
 	__le32	hi;
 };
 
+struct vf_pf_channel_eqe_data {
+	struct regpair msg_addr;
+};
+
 /* Event Data Union */
 union event_ring_data {
 	u8				bytes[8];
+	struct vf_pf_channel_eqe_data	vf_pf_channel;
 	struct async_data		async_info;
 };
 
-- 
cgit v1.2.3


From 1408cc1fa48c5450c0dc4b40cbd9718ecb09d1c9 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:14 +0300
Subject: qed: Introduce VFs

This adds the qed VFs for the first time -
The vfs are limited functions, with a very different PCI bar structure
[when compared with PFs] to better impose the related security demands
associated with them.

This patch includes the logic neccesary to allow VFs to successfully probe
[without actually adding the ability to enable iov].
This includes diverging all the flows that would occur as part of the pci
probe of the driver, preventing VF from accessing registers/memories it
can't and instead utilize the VF->PF channel to query the PF for needed
information.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h             |   5 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.c         | 186 +++++++++--
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |   3 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 182 +++++++++--
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  47 ++-
 drivers/net/ethernet/qlogic/qed/qed_hw.c          |  12 +-
 drivers/net/ethernet/qlogic/qed/qed_init_ops.c    |   4 +
 drivers/net/ethernet/qlogic/qed/qed_int.c         | 101 +++++-
 drivers/net/ethernet/qlogic/qed/qed_int.h         |  16 +
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  57 +++-
 drivers/net/ethernet/qlogic/qed/qed_main.c        | 192 +++++++----
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |  83 ++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         |  27 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h    |   6 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |   2 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   8 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c         |   3 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       | 379 +++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |  49 +++
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 357 ++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h          | 229 +++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c      |  13 +-
 include/linux/qed/common_hsi.h                    |  57 ++++
 include/linux/qed/qed_if.h                        |  10 +-
 25 files changed, 1842 insertions(+), 188 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_vf.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index e11a809034ea..a44874562cfd 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -3,4 +3,4 @@ obj-$(CONFIG_QED) := qed.o
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
 	 qed_selftest.o
-qed-$(CONFIG_QED_SRIOV) += qed_sriov.o
+qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 01f9b6c880bd..f9a3576305a1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -311,6 +311,8 @@ struct qed_hwfn {
 	bool				first_on_engine;
 	bool				hw_init_done;
 
+	u8				num_funcs_on_engine;
+
 	/* BAR access */
 	void __iomem			*regview;
 	void __iomem			*doorbells;
@@ -361,6 +363,7 @@ struct qed_hwfn {
 	/* True if the driver requests for the link */
 	bool				b_drv_link_init;
 
+	struct qed_vf_iov		*vf_iov_info;
 	struct qed_pf_iov		*pf_iov_info;
 	struct qed_mcp_info		*mcp_info;
 
@@ -497,6 +500,8 @@ struct qed_dev {
 #define IS_QED_SRIOV(cdev)              (!!(cdev)->p_iov_info)
 
 	unsigned long			tunn_mode;
+
+	bool				b_is_vf;
 	u32				drv_type;
 
 	struct qed_eth_stats		*reset_stats;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index fc767c07a264..ac284c58d8c2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -24,11 +24,13 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 /* Max number of connection types in HW (DQ/CDU etc.) */
 #define MAX_CONN_TYPES		PROTOCOLID_COMMON
 #define NUM_TASK_TYPES		2
 #define NUM_TASK_PF_SEGMENTS	4
+#define NUM_TASK_VF_SEGMENTS	1
 
 /* QM constants */
 #define QM_PQ_ELEMENT_SIZE	4 /* in bytes */
@@ -63,10 +65,12 @@ union conn_context {
 struct qed_conn_type_cfg {
 	u32 cid_count;
 	u32 cid_start;
+	u32 cids_per_vf;
 };
 
 /* ILT Client configuration, Per connection type (protocol) resources. */
 #define ILT_CLI_PF_BLOCKS	(1 + NUM_TASK_PF_SEGMENTS * 2)
+#define ILT_CLI_VF_BLOCKS       (1 + NUM_TASK_VF_SEGMENTS * 2)
 #define CDUC_BLK		(0)
 
 enum ilt_clients {
@@ -97,6 +101,10 @@ struct qed_ilt_client_cfg {
 	/* ILT client blocks for PF */
 	struct qed_ilt_cli_blk pf_blks[ILT_CLI_PF_BLOCKS];
 	u32 pf_total_lines;
+
+	/* ILT client blocks for VFs */
+	struct qed_ilt_cli_blk vf_blks[ILT_CLI_VF_BLOCKS];
+	u32 vf_total_lines;
 };
 
 /* Per Path -
@@ -123,6 +131,11 @@ struct qed_cxt_mngr {
 	/* computed ILT structure */
 	struct qed_ilt_client_cfg	clients[ILT_CLI_MAX];
 
+	/* total number of VFs for this hwfn -
+	 * ALL VFs are symmetric in terms of HW resources
+	 */
+	u32				vf_count;
+
 	/* Acquired CIDs */
 	struct qed_cid_acquired_map	acquired[MAX_CONN_TYPES];
 
@@ -131,37 +144,60 @@ struct qed_cxt_mngr {
 	u32				pf_start_line;
 };
 
-static u32 qed_cxt_cdu_iids(struct qed_cxt_mngr *p_mngr)
-{
-	u32 type, pf_cids = 0;
+/* counts the iids for the CDU/CDUC ILT client configuration */
+struct qed_cdu_iids {
+	u32 pf_cids;
+	u32 per_vf_cids;
+};
 
-	for (type = 0; type < MAX_CONN_TYPES; type++)
-		pf_cids += p_mngr->conn_cfg[type].cid_count;
+static void qed_cxt_cdu_iids(struct qed_cxt_mngr *p_mngr,
+			     struct qed_cdu_iids *iids)
+{
+	u32 type;
 
-	return pf_cids;
+	for (type = 0; type < MAX_CONN_TYPES; type++) {
+		iids->pf_cids += p_mngr->conn_cfg[type].cid_count;
+		iids->per_vf_cids += p_mngr->conn_cfg[type].cids_per_vf;
+	}
 }
 
 static void qed_cxt_qm_iids(struct qed_hwfn *p_hwfn,
 			    struct qed_qm_iids *iids)
 {
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
-	int type;
+	u32 vf_cids = 0, type;
 
-	for (type = 0; type < MAX_CONN_TYPES; type++)
+	for (type = 0; type < MAX_CONN_TYPES; type++) {
 		iids->cids += p_mngr->conn_cfg[type].cid_count;
+		vf_cids += p_mngr->conn_cfg[type].cids_per_vf;
+	}
 
-	DP_VERBOSE(p_hwfn, QED_MSG_ILT, "iids: CIDS %08x\n", iids->cids);
+	iids->vf_cids += vf_cids * p_mngr->vf_count;
+	DP_VERBOSE(p_hwfn, QED_MSG_ILT,
+		   "iids: CIDS %08x vf_cids %08x\n",
+		   iids->cids, iids->vf_cids);
 }
 
 /* set the iids count per protocol */
 static void qed_cxt_set_proto_cid_count(struct qed_hwfn *p_hwfn,
 					enum protocol_type type,
-					u32 cid_count)
+					u32 cid_count, u32 vf_cid_cnt)
 {
 	struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr;
 	struct qed_conn_type_cfg *p_conn = &p_mgr->conn_cfg[type];
 
 	p_conn->cid_count = roundup(cid_count, DQ_RANGE_ALIGN);
+	p_conn->cids_per_vf = roundup(vf_cid_cnt, DQ_RANGE_ALIGN);
+}
+
+u32 qed_cxt_get_proto_cid_count(struct qed_hwfn		*p_hwfn,
+				enum protocol_type	type,
+				u32			*vf_cid)
+{
+	if (vf_cid)
+		*vf_cid = p_hwfn->p_cxt_mngr->conn_cfg[type].cids_per_vf;
+
+	return p_hwfn->p_cxt_mngr->conn_cfg[type].cid_count;
 }
 
 static void qed_ilt_cli_blk_fill(struct qed_ilt_client_cfg *p_cli,
@@ -210,10 +246,12 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
 	struct qed_ilt_client_cfg *p_cli;
 	struct qed_ilt_cli_blk *p_blk;
-	u32 curr_line, total, pf_cids;
+	struct qed_cdu_iids cdu_iids;
 	struct qed_qm_iids qm_iids;
+	u32 curr_line, total, i;
 
 	memset(&qm_iids, 0, sizeof(qm_iids));
+	memset(&cdu_iids, 0, sizeof(cdu_iids));
 
 	p_mngr->pf_start_line = RESC_START(p_hwfn, QED_ILT);
 
@@ -224,14 +262,16 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	/* CDUC */
 	p_cli = &p_mngr->clients[ILT_CLI_CDUC];
 	curr_line = p_mngr->pf_start_line;
+
+	/* CDUC PF */
 	p_cli->pf_total_lines = 0;
 
 	/* get the counters for the CDUC and QM clients  */
-	pf_cids = qed_cxt_cdu_iids(p_mngr);
+	qed_cxt_cdu_iids(p_mngr, &cdu_iids);
 
 	p_blk = &p_cli->pf_blks[CDUC_BLK];
 
-	total = pf_cids * CONN_CXT_SIZE(p_hwfn);
+	total = cdu_iids.pf_cids * CONN_CXT_SIZE(p_hwfn);
 
 	qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line,
 			     total, CONN_CXT_SIZE(p_hwfn));
@@ -239,17 +279,36 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn)
 	qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC);
 	p_cli->pf_total_lines = curr_line - p_blk->start_line;
 
+	/* CDUC VF */
+	p_blk = &p_cli->vf_blks[CDUC_BLK];
+	total = cdu_iids.per_vf_cids * CONN_CXT_SIZE(p_hwfn);
+
+	qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line,
+			     total, CONN_CXT_SIZE(p_hwfn));
+
+	qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC);
+	p_cli->vf_total_lines = curr_line - p_blk->start_line;
+
+	for (i = 1; i < p_mngr->vf_count; i++)
+		qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line,
+				     ILT_CLI_CDUC);
+
 	/* QM */
 	p_cli = &p_mngr->clients[ILT_CLI_QM];
 	p_blk = &p_cli->pf_blks[0];
 
 	qed_cxt_qm_iids(p_hwfn, &qm_iids);
-	total = qed_qm_pf_mem_size(p_hwfn->rel_pf_id, qm_iids.cids, 0, 0,
-				   p_hwfn->qm_info.num_pqs, 0);
-
-	DP_VERBOSE(p_hwfn, QED_MSG_ILT,
-		   "QM ILT Info, (cids=%d, num_pqs=%d, memory_size=%d)\n",
-		   qm_iids.cids, p_hwfn->qm_info.num_pqs, total);
+	total = qed_qm_pf_mem_size(p_hwfn->rel_pf_id, qm_iids.cids,
+				   qm_iids.vf_cids, 0,
+				   p_hwfn->qm_info.num_pqs,
+				   p_hwfn->qm_info.num_vf_pqs);
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_ILT,
+		   "QM ILT Info, (cids=%d, vf_cids=%d, num_pqs=%d, num_vf_pqs=%d, memory_size=%d)\n",
+		   qm_iids.cids,
+		   qm_iids.vf_cids,
+		   p_hwfn->qm_info.num_pqs, p_hwfn->qm_info.num_vf_pqs, total);
 
 	qed_ilt_cli_blk_fill(p_cli, p_blk,
 			     curr_line, total * 0x1000,
@@ -358,7 +417,7 @@ static int qed_ilt_shadow_alloc(struct qed_hwfn *p_hwfn)
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
 	struct qed_ilt_client_cfg *clients = p_mngr->clients;
 	struct qed_ilt_cli_blk *p_blk;
-	u32 size, i, j;
+	u32 size, i, j, k;
 	int rc;
 
 	size = qed_cxt_ilt_shadow_size(clients);
@@ -383,6 +442,16 @@ static int qed_ilt_shadow_alloc(struct qed_hwfn *p_hwfn)
 			if (rc != 0)
 				goto ilt_shadow_fail;
 		}
+		for (k = 0; k < p_mngr->vf_count; k++) {
+			for (j = 0; j < ILT_CLI_VF_BLOCKS; j++) {
+				u32 lines = clients[i].vf_total_lines * k;
+
+				p_blk = &clients[i].vf_blks[j];
+				rc = qed_ilt_blk_alloc(p_hwfn, p_blk, i, lines);
+				if (rc != 0)
+					goto ilt_shadow_fail;
+			}
+		}
 	}
 
 	return 0;
@@ -467,6 +536,9 @@ int qed_cxt_mngr_alloc(struct qed_hwfn *p_hwfn)
 	for (i = 0; i < ILT_CLI_MAX; i++)
 		p_mngr->clients[i].p_size.val = ILT_DEFAULT_HW_P_SIZE;
 
+	if (p_hwfn->cdev->p_iov_info)
+		p_mngr->vf_count = p_hwfn->cdev->p_iov_info->total_vfs;
+
 	/* Set the cxt mangr pointer priori to further allocations */
 	p_hwfn->p_cxt_mngr = p_mngr;
 
@@ -579,8 +651,10 @@ void qed_qm_init_pf(struct qed_hwfn *p_hwfn)
 	params.max_phys_tcs_per_port = qm_info->max_phys_tcs_per_port;
 	params.is_first_pf = p_hwfn->first_on_engine;
 	params.num_pf_cids = iids.cids;
+	params.num_vf_cids = iids.vf_cids;
 	params.start_pq = qm_info->start_pq;
-	params.num_pf_pqs = qm_info->num_pqs;
+	params.num_pf_pqs = qm_info->num_pqs - qm_info->num_vf_pqs;
+	params.num_vf_pqs = qm_info->num_vf_pqs;
 	params.start_vport = qm_info->start_vport;
 	params.num_vports = qm_info->num_vports;
 	params.pf_wfq = qm_info->pf_wfq;
@@ -610,26 +684,55 @@ static int qed_cm_init_pf(struct qed_hwfn *p_hwfn)
 static void qed_dq_init_pf(struct qed_hwfn *p_hwfn)
 {
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
-	u32 dq_pf_max_cid = 0;
+	u32 dq_pf_max_cid = 0, dq_vf_max_cid = 0;
 
 	dq_pf_max_cid += (p_mngr->conn_cfg[0].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_0_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[0].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_0_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[1].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_1_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[1].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_1_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[2].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_2_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[2].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_2_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[3].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_3_RT_OFFSET, dq_pf_max_cid);
 
+	dq_vf_max_cid += (p_mngr->conn_cfg[3].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_3_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[4].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_4_RT_OFFSET, dq_pf_max_cid);
 
-	/* 5 - PF */
+	dq_vf_max_cid += (p_mngr->conn_cfg[4].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_4_RT_OFFSET, dq_vf_max_cid);
+
 	dq_pf_max_cid += (p_mngr->conn_cfg[5].cid_count >> DQ_RANGE_SHIFT);
 	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_5_RT_OFFSET, dq_pf_max_cid);
+
+	dq_vf_max_cid += (p_mngr->conn_cfg[5].cids_per_vf >> DQ_RANGE_SHIFT);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_5_RT_OFFSET, dq_vf_max_cid);
+
+	/* Connection types 6 & 7 are not in use, yet they must be configured
+	 * as the highest possible connection. Not configuring them means the
+	 * defaults will be  used, and with a large number of cids a bug may
+	 * occur, if the defaults will be smaller than dq_pf_max_cid /
+	 * dq_vf_max_cid.
+	 */
+	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_6_RT_OFFSET, dq_pf_max_cid);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_6_RT_OFFSET, dq_vf_max_cid);
+
+	STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_7_RT_OFFSET, dq_pf_max_cid);
+	STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_7_RT_OFFSET, dq_vf_max_cid);
 }
 
 static void qed_ilt_bounds_init(struct qed_hwfn *p_hwfn)
@@ -653,6 +756,38 @@ static void qed_ilt_bounds_init(struct qed_hwfn *p_hwfn)
 	}
 }
 
+static void qed_ilt_vf_bounds_init(struct qed_hwfn *p_hwfn)
+{
+	struct qed_ilt_client_cfg *p_cli;
+	u32 blk_factor;
+
+	/* For simplicty  we set the 'block' to be an ILT page */
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_VF_BASE_RT_OFFSET,
+			     p_iov->first_vf_in_pf);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET,
+			     p_iov->first_vf_in_pf + p_iov->total_vfs);
+	}
+
+	p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC];
+	blk_factor = ilog2(ILT_PAGE_IN_BYTES(p_cli->p_size.val) >> 10);
+	if (p_cli->active) {
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET,
+			     blk_factor);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET,
+			     p_cli->pf_total_lines);
+		STORE_RT_REG(p_hwfn,
+			     PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET,
+			     p_cli->vf_total_lines);
+	}
+}
+
 /* ILT (PSWRQ2) PF */
 static void qed_ilt_init_pf(struct qed_hwfn *p_hwfn)
 {
@@ -662,6 +797,7 @@ static void qed_ilt_init_pf(struct qed_hwfn *p_hwfn)
 	u32 line, rt_offst, i;
 
 	qed_ilt_bounds_init(p_hwfn);
+	qed_ilt_vf_bounds_init(p_hwfn);
 
 	p_mngr = p_hwfn->p_cxt_mngr;
 	p_shdw = p_mngr->ilt_shadow;
@@ -839,10 +975,10 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn)
 	/* Set the number of required CORE connections */
 	u32 core_cids = 1; /* SPQ */
 
-	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids);
+	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids, 0);
 
 	qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_ETH,
-				    p_params->num_cons);
+				    p_params->num_cons, 1);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index c8e1f5e5c42b..078ff3fd7920 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -51,6 +51,9 @@ enum qed_cxt_elem_type {
 	QED_ELEM_TASK
 };
 
+u32 qed_cxt_get_proto_cid_count(struct qed_hwfn *p_hwfn,
+				enum protocol_type type, u32 *vf_cid);
+
 /**
  * @brief qed_cxt_set_pf_params - Set the PF params for cxt init
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 7a359c45360f..362e8db2b374 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -41,10 +41,14 @@ enum BAR_ID {
 static u32 qed_hw_bar_size(struct qed_hwfn	*p_hwfn,
 			   enum BAR_ID		bar_id)
 {
-	u32	bar_reg = (bar_id == BAR_ID_0 ?
-			   PGLUE_B_REG_PF_BAR0_SIZE : PGLUE_B_REG_PF_BAR1_SIZE);
-	u32	val = qed_rd(p_hwfn, p_hwfn->p_main_ptt, bar_reg);
+	u32 bar_reg = (bar_id == BAR_ID_0 ?
+		       PGLUE_B_REG_PF_BAR0_SIZE : PGLUE_B_REG_PF_BAR1_SIZE);
+	u32 val;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 1 << 17;
+
+	val = qed_rd(p_hwfn, p_hwfn->p_main_ptt, bar_reg);
 	if (val)
 		return 1 << (val + 15);
 
@@ -114,6 +118,9 @@ void qed_resc_free(struct qed_dev *cdev)
 {
 	int i;
 
+	if (IS_VF(cdev))
+		return;
+
 	kfree(cdev->fw_data);
 	cdev->fw_data = NULL;
 
@@ -144,14 +151,19 @@ void qed_resc_free(struct qed_dev *cdev)
 
 static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 {
+	u8 num_vports, vf_offset = 0, i, vport_id, num_ports, curr_queue = 0;
 	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
 	struct init_qm_port_params *p_qm_port;
-	u8 num_vports, i, vport_id, num_ports;
 	u16 num_pqs, multi_cos_tcs = 1;
+	u16 num_vfs = 0;
 
+#ifdef CONFIG_QED_SRIOV
+	if (p_hwfn->cdev->p_iov_info)
+		num_vfs = p_hwfn->cdev->p_iov_info->total_vfs;
+#endif
 	memset(qm_info, 0, sizeof(*qm_info));
 
-	num_pqs = multi_cos_tcs + 1; /* The '1' is for pure-LB */
+	num_pqs = multi_cos_tcs + num_vfs + 1;	/* The '1' is for pure-LB */
 	num_vports = (u8)RESC_NUM(p_hwfn, QED_VPORT);
 
 	/* Sanity checking that setup requires legal number of resources */
@@ -187,8 +199,9 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
 
 	/* First init per-TC PQs */
-	for (i = 0; i < multi_cos_tcs; i++) {
-		struct init_qm_pq_params *params = &qm_info->qm_pq_params[i];
+	for (i = 0; i < multi_cos_tcs; i++, curr_queue++) {
+		struct init_qm_pq_params *params =
+		    &qm_info->qm_pq_params[curr_queue];
 
 		params->vport_id = vport_id;
 		params->tc_id = p_hwfn->hw_info.non_offload_tc;
@@ -196,13 +209,26 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	}
 
 	/* Then init pure-LB PQ */
-	qm_info->pure_lb_pq = i;
-	qm_info->qm_pq_params[i].vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
-	qm_info->qm_pq_params[i].tc_id = PURE_LB_TC;
-	qm_info->qm_pq_params[i].wrr_group = 1;
-	i++;
+	qm_info->pure_lb_pq = curr_queue;
+	qm_info->qm_pq_params[curr_queue].vport_id =
+	    (u8) RESC_START(p_hwfn, QED_VPORT);
+	qm_info->qm_pq_params[curr_queue].tc_id = PURE_LB_TC;
+	qm_info->qm_pq_params[curr_queue].wrr_group = 1;
+	curr_queue++;
 
 	qm_info->offload_pq = 0;
+	/* Then init per-VF PQs */
+	vf_offset = curr_queue;
+	for (i = 0; i < num_vfs; i++) {
+		/* First vport is used by the PF */
+		qm_info->qm_pq_params[curr_queue].vport_id = vport_id + i + 1;
+		qm_info->qm_pq_params[curr_queue].tc_id =
+		    p_hwfn->hw_info.non_offload_tc;
+		qm_info->qm_pq_params[curr_queue].wrr_group = 1;
+		curr_queue++;
+	}
+
+	qm_info->vf_queues_offset = vf_offset;
 	qm_info->num_pqs = num_pqs;
 	qm_info->num_vports = num_vports;
 
@@ -220,7 +246,8 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 
 	qm_info->start_pq = (u16)RESC_START(p_hwfn, QED_PQ);
 
-	qm_info->start_vport = (u8)RESC_START(p_hwfn, QED_VPORT);
+	qm_info->num_vf_pqs = num_vfs;
+	qm_info->start_vport = (u8) RESC_START(p_hwfn, QED_VPORT);
 
 	for (i = 0; i < qm_info->num_vports; i++)
 		qm_info->qm_vport_params[i].vport_wfq = 1;
@@ -244,6 +271,9 @@ int qed_resc_alloc(struct qed_dev *cdev)
 	struct qed_eq *p_eq;
 	int i, rc = 0;
 
+	if (IS_VF(cdev))
+		return rc;
+
 	cdev->fw_data = kzalloc(sizeof(*cdev->fw_data), GFP_KERNEL);
 	if (!cdev->fw_data)
 		return -ENOMEM;
@@ -364,6 +394,9 @@ void qed_resc_setup(struct qed_dev *cdev)
 {
 	int i;
 
+	if (IS_VF(cdev))
+		return;
+
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
@@ -508,7 +541,9 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
 	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
 	struct qed_qm_common_rt_init_params params;
 	struct qed_dev *cdev = p_hwfn->cdev;
+	u32 concrete_fid;
 	int rc = 0;
+	u8 vf_id;
 
 	qed_init_cau_rt_data(cdev);
 
@@ -558,6 +593,14 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
 	qed_wr(p_hwfn, p_ptt, 0x20b4,
 	       qed_rd(p_hwfn, p_ptt, 0x20b4) & ~0x10);
 
+	for (vf_id = 0; vf_id < MAX_NUM_VFS_BB; vf_id++) {
+		concrete_fid = qed_vfid_to_concrete(p_hwfn, vf_id);
+		qed_fid_pretend(p_hwfn, p_ptt, (u16) concrete_fid);
+		qed_wr(p_hwfn, p_ptt, CCFC_REG_STRONG_ENABLE_VF, 0x1);
+	}
+	/* pretend to original PF */
+	qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id);
+
 	return rc;
 }
 
@@ -698,13 +741,20 @@ int qed_hw_init(struct qed_dev *cdev,
 	u32 load_code, param;
 	int rc, mfw_rc, i;
 
-	rc = qed_init_fw_data(cdev, bin_fw_data);
-	if (rc != 0)
-		return rc;
+	if (IS_PF(cdev)) {
+		rc = qed_init_fw_data(cdev, bin_fw_data);
+		if (rc != 0)
+			return rc;
+	}
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			p_hwfn->b_int_enabled = 1;
+			continue;
+		}
+
 		/* Enable DMAE in PXP */
 		rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true);
 
@@ -829,6 +879,11 @@ int qed_hw_stop(struct qed_dev *cdev)
 
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Stopping hw/fw\n");
 
+		if (IS_VF(cdev)) {
+			/* To be implemented in a later patch */
+			continue;
+		}
+
 		/* mark the hw as uninitialized... */
 		p_hwfn->hw_init_done = false;
 
@@ -860,15 +915,16 @@ int qed_hw_stop(struct qed_dev *cdev)
 		usleep_range(1000, 2000);
 	}
 
-	/* Disable DMAE in PXP - in CMT, this should only be done for
-	 * first hw-function, and only after all transactions have
-	 * stopped for all active hw-functions.
-	 */
-	t_rc = qed_change_pci_hwfn(&cdev->hwfns[0],
-				   cdev->hwfns[0].p_main_ptt,
-				   false);
-	if (t_rc != 0)
-		rc = t_rc;
+	if (IS_PF(cdev)) {
+		/* Disable DMAE in PXP - in CMT, this should only be done for
+		 * first hw-function, and only after all transactions have
+		 * stopped for all active hw-functions.
+		 */
+		t_rc = qed_change_pci_hwfn(&cdev->hwfns[0],
+					   cdev->hwfns[0].p_main_ptt, false);
+		if (t_rc != 0)
+			rc = t_rc;
+	}
 
 	return rc;
 }
@@ -932,6 +988,11 @@ int qed_hw_reset(struct qed_dev *cdev)
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			/* Will be implemented in a later patch */
+			continue;
+		}
+
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Resetting hw/fw\n");
 
 		/* Check for incorrect states */
@@ -1027,11 +1088,10 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn)
 static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 {
 	u32 *resc_start = p_hwfn->hw_info.resc_start;
+	u8 num_funcs = p_hwfn->num_funcs_on_engine;
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	struct qed_sb_cnt_info sb_cnt_info;
-	int num_funcs, i;
-
-	num_funcs = MAX_NUM_PFS_BB;
+	int i;
 
 	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
 	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
@@ -1238,6 +1298,51 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn,
 	return qed_mcp_fill_shmem_func_info(p_hwfn, p_ptt);
 }
 
+static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 reg_function_hide, tmp, eng_mask;
+	u8 num_funcs;
+
+	num_funcs = MAX_NUM_PFS_BB;
+
+	/* Bit 0 of MISCS_REG_FUNCTION_HIDE indicates whether the bypass values
+	 * in the other bits are selected.
+	 * Bits 1-15 are for functions 1-15, respectively, and their value is
+	 * '0' only for enabled functions (function 0 always exists and
+	 * enabled).
+	 * In case of CMT, only the "even" functions are enabled, and thus the
+	 * number of functions for both hwfns is learnt from the same bits.
+	 */
+	reg_function_hide = qed_rd(p_hwfn, p_ptt, MISCS_REG_FUNCTION_HIDE);
+
+	if (reg_function_hide & 0x1) {
+		if (QED_PATH_ID(p_hwfn) && p_hwfn->cdev->num_hwfns == 1) {
+			num_funcs = 0;
+			eng_mask = 0xaaaa;
+		} else {
+			num_funcs = 1;
+			eng_mask = 0x5554;
+		}
+
+		/* Get the number of the enabled functions on the engine */
+		tmp = (reg_function_hide ^ 0xffffffff) & eng_mask;
+		while (tmp) {
+			if (tmp & 0x1)
+				num_funcs++;
+			tmp >>= 0x1;
+		}
+	}
+
+	p_hwfn->num_funcs_on_engine = num_funcs;
+
+	DP_VERBOSE(p_hwfn,
+		   NETIF_MSG_PROBE,
+		   "PF [rel_id %d, abs_id %d] within the %d enabled functions on the engine\n",
+		   p_hwfn->rel_pf_id,
+		   p_hwfn->abs_pf_id,
+		   p_hwfn->num_funcs_on_engine);
+}
+
 static int
 qed_get_hw_info(struct qed_hwfn *p_hwfn,
 		struct qed_ptt *p_ptt,
@@ -1296,6 +1401,8 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn,
 		p_hwfn->hw_info.personality = protocol;
 	}
 
+	qed_get_num_funcs(p_hwfn, p_ptt);
+
 	qed_hw_get_resc(p_hwfn);
 
 	return rc;
@@ -1361,6 +1468,9 @@ static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn,
 	p_hwfn->regview = p_regview;
 	p_hwfn->doorbells = p_doorbells;
 
+	if (IS_VF(p_hwfn->cdev))
+		return qed_vf_hw_prepare(p_hwfn);
+
 	/* Validate that chip access is feasible */
 	if (REG_RD(p_hwfn, PXP_PF_ME_OPAQUE_ADDR) == 0xffffffff) {
 		DP_ERR(p_hwfn,
@@ -1428,7 +1538,8 @@ int qed_hw_prepare(struct qed_dev *cdev,
 	int rc;
 
 	/* Store the precompiled init data ptrs */
-	qed_init_iro_array(cdev);
+	if (IS_PF(cdev))
+		qed_init_iro_array(cdev);
 
 	/* Initialize the first hwfn - will learn number of hwfns */
 	rc = qed_hw_prepare_single(p_hwfn,
@@ -1460,9 +1571,11 @@ int qed_hw_prepare(struct qed_dev *cdev,
 		 * initiliazed hwfn 0.
 		 */
 		if (rc) {
-			qed_init_free(p_hwfn);
-			qed_mcp_free(p_hwfn);
-			qed_hw_hwfn_free(p_hwfn);
+			if (IS_PF(cdev)) {
+				qed_init_free(p_hwfn);
+				qed_mcp_free(p_hwfn);
+				qed_hw_hwfn_free(p_hwfn);
+			}
 		}
 	}
 
@@ -1476,6 +1589,11 @@ void qed_hw_remove(struct qed_dev *cdev)
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (IS_VF(cdev)) {
+			/* Will be implemented in a later patch */
+			continue;
+		}
+
 		qed_init_free(p_hwfn);
 		qed_hw_hwfn_free(p_hwfn);
 		qed_mcp_free(p_hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 6ba197c107ed..c511106870d0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -29,7 +29,7 @@ struct qed_ptt;
 enum common_event_opcode {
 	COMMON_EVENT_PF_START,
 	COMMON_EVENT_PF_STOP,
-	COMMON_EVENT_RESERVED,
+	COMMON_EVENT_VF_START,
 	COMMON_EVENT_RESERVED2,
 	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
@@ -44,7 +44,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_UNUSED,
 	COMMON_RAMROD_PF_START /* PF Function Start Ramrod */,
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
-	COMMON_RAMROD_RESERVED,
+	COMMON_RAMROD_VF_START,
 	COMMON_RAMROD_RESERVED2,
 	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
@@ -573,6 +573,14 @@ union event_ring_element {
 	struct event_ring_next_addr	next_addr;
 };
 
+struct mstorm_non_trigger_vf_zone {
+	struct eth_mstorm_per_queue_stat eth_queue_stat;
+};
+
+struct mstorm_vf_zone {
+	struct mstorm_non_trigger_vf_zone non_trigger;
+};
+
 enum personality_type {
 	BAD_PERSONALITY_TYP,
 	PERSONALITY_RESERVED,
@@ -671,6 +679,16 @@ enum ports_mode {
 	MAX_PORTS_MODE
 };
 
+struct pstorm_non_trigger_vf_zone {
+	struct eth_pstorm_per_queue_stat eth_queue_stat;
+	struct regpair reserved[2];
+};
+
+struct pstorm_vf_zone {
+	struct pstorm_non_trigger_vf_zone non_trigger;
+	struct regpair reserved[7];
+};
+
 /* Ramrod Header of SPQE */
 struct ramrod_header {
 	__le32	cid /* Slowpath Connection CID */;
@@ -700,6 +718,29 @@ struct tstorm_per_port_stat {
 	struct regpair	preroce_irregular_pkt;
 };
 
+struct ustorm_non_trigger_vf_zone {
+	struct eth_ustorm_per_queue_stat eth_queue_stat;
+	struct regpair vf_pf_msg_addr;
+};
+
+struct ustorm_trigger_vf_zone {
+	u8 vf_pf_msg_valid;
+	u8 reserved[7];
+};
+
+struct ustorm_vf_zone {
+	struct ustorm_non_trigger_vf_zone non_trigger;
+	struct ustorm_trigger_vf_zone trigger;
+};
+
+struct vf_start_ramrod_data {
+	u8 vf_id;
+	u8 enable_flr_ack;
+	__le16 opaque_fid;
+	u8 personality;
+	u8 reserved[3];
+};
+
 struct atten_status_block {
 	__le32	atten_bits;
 	__le32	atten_ack;
@@ -1026,7 +1067,7 @@ enum init_phases {
 	PHASE_ENGINE,
 	PHASE_PORT,
 	PHASE_PF,
-	PHASE_RESERVED,
+	PHASE_VF,
 	PHASE_QM_PF,
 	MAX_INIT_PHASES
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index a9be5a422d2d..0ada7fdb91bc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -23,6 +23,7 @@
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 #define QED_BAR_ACQUIRE_TIMEOUT 1000
 
@@ -236,8 +237,12 @@ static void qed_memcpy_hw(struct qed_hwfn *p_hwfn,
 		quota = min_t(size_t, n - done,
 			      PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE);
 
-		qed_ptt_set_win(p_hwfn, p_ptt, hw_addr + done);
-		hw_offset = qed_ptt_get_bar_addr(p_ptt);
+		if (IS_PF(p_hwfn->cdev)) {
+			qed_ptt_set_win(p_hwfn, p_ptt, hw_addr + done);
+			hw_offset = qed_ptt_get_bar_addr(p_ptt);
+		} else {
+			hw_offset = hw_addr + done;
+		}
 
 		dw_count = quota / 4;
 		host_addr = (u32 *)((u8 *)addr + done);
@@ -808,6 +813,9 @@ u16 qed_get_qm_pq(struct qed_hwfn *p_hwfn,
 		break;
 	case PROTOCOLID_ETH:
 		pq_id = p_params->eth.tc;
+		if (p_params->eth.is_vf)
+			pq_id += p_hwfn->qm_info.vf_queues_offset +
+				 p_params->eth.vf_id;
 		break;
 	default:
 		pq_id = 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
index 3269b3610e03..d358c3bb1308 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
@@ -18,6 +18,7 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
 
 #define QED_INIT_MAX_POLL_COUNT 100
 #define QED_INIT_POLL_PERIOD_US 500
@@ -128,6 +129,9 @@ int qed_init_alloc(struct qed_hwfn *p_hwfn)
 {
 	struct qed_rt_data *rt_data = &p_hwfn->rt_data;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	rt_data->b_valid = kzalloc(sizeof(bool) * RUNTIME_ARRAY_SIZE,
 				   GFP_KERNEL);
 	if (!rt_data->b_valid)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c
index 2017b0121f5f..bbecfa579364 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -26,6 +26,8 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
+#include "qed_vf.h"
 
 struct qed_pi_info {
 	qed_int_comp_cb_t	comp_cb;
@@ -2513,6 +2515,9 @@ void qed_int_cau_conf_pi(struct qed_hwfn *p_hwfn,
 	u32 sb_offset;
 	u32 pi_offset;
 
+	if (IS_VF(p_hwfn->cdev))
+		return;
+
 	sb_offset = igu_sb_id * PIS_PER_SB;
 	memset(&pi_entry, 0, sizeof(struct cau_pi_entry));
 
@@ -2542,8 +2547,9 @@ void qed_int_sb_setup(struct qed_hwfn *p_hwfn,
 	sb_info->sb_ack = 0;
 	memset(sb_info->sb_virt, 0, sizeof(*sb_info->sb_virt));
 
-	qed_int_cau_conf_sb(p_hwfn, p_ptt, sb_info->sb_phys,
-			    sb_info->igu_sb_id, 0, 0);
+	if (IS_PF(p_hwfn->cdev))
+		qed_int_cau_conf_sb(p_hwfn, p_ptt, sb_info->sb_phys,
+				    sb_info->igu_sb_id, 0, 0);
 }
 
 /**
@@ -2563,8 +2569,10 @@ static u16 qed_get_igu_sb_id(struct qed_hwfn *p_hwfn,
 	/* Assuming continuous set of IGU SBs dedicated for given PF */
 	if (sb_id == QED_SP_SB_ID)
 		igu_sb_id = p_hwfn->hw_info.p_igu_info->igu_dsb_id;
-	else
+	else if (IS_PF(p_hwfn->cdev))
 		igu_sb_id = sb_id + p_hwfn->hw_info.p_igu_info->igu_base_sb;
+	else
+		igu_sb_id = qed_vf_get_igu_sb_id(p_hwfn, sb_id);
 
 	DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, "SB [%s] index is 0x%04x\n",
 		   (sb_id == QED_SP_SB_ID) ? "DSB" : "non-DSB", igu_sb_id);
@@ -2594,9 +2602,16 @@ int qed_int_sb_init(struct qed_hwfn *p_hwfn,
 	/* The igu address will hold the absolute address that needs to be
 	 * written to for a specific status block
 	 */
-	sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
-					  GTT_BAR0_MAP_REG_IGU_CMD +
-					  (sb_info->igu_sb_id << 3);
+	if (IS_PF(p_hwfn->cdev)) {
+		sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
+						  GTT_BAR0_MAP_REG_IGU_CMD +
+						  (sb_info->igu_sb_id << 3);
+	} else {
+		sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview +
+						  PXP_VF_BAR0_START_IGU +
+						  ((IGU_CMD_INT_ACK_BASE +
+						    sb_info->igu_sb_id) << 3);
+	}
 
 	sb_info->flags |= QED_SB_INFO_INIT;
 
@@ -2783,6 +2798,9 @@ void qed_int_igu_disable_int(struct qed_hwfn *p_hwfn,
 {
 	p_hwfn->b_int_enabled = 0;
 
+	if (IS_VF(p_hwfn->cdev))
+		return;
+
 	qed_wr(p_hwfn, p_ptt, IGU_REG_PF_CONFIGURATION, 0);
 }
 
@@ -2935,9 +2953,9 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt)
 {
 	struct qed_igu_info *p_igu_info;
+	u32 val, min_vf = 0, max_vf = 0;
+	u16 sb_id, last_iov_sb_id = 0;
 	struct qed_igu_block *blk;
-	u32 val;
-	u16 sb_id;
 	u16 prev_sb_id = 0xFF;
 
 	p_hwfn->hw_info.p_igu_info = kzalloc(sizeof(*p_igu_info), GFP_KERNEL);
@@ -2947,12 +2965,19 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 
 	p_igu_info = p_hwfn->hw_info.p_igu_info;
 
-	/* Initialize base sb / sb cnt for PFs */
+	/* Initialize base sb / sb cnt for PFs and VFs */
 	p_igu_info->igu_base_sb		= 0xffff;
 	p_igu_info->igu_sb_cnt		= 0;
 	p_igu_info->igu_dsb_id		= 0xffff;
 	p_igu_info->igu_base_sb_iov	= 0xffff;
 
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		min_vf	= p_iov->first_vf_in_pf;
+		max_vf	= p_iov->first_vf_in_pf + p_iov->total_vfs;
+	}
+
 	for (sb_id = 0; sb_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev);
 	     sb_id++) {
 		blk = &p_igu_info->igu_map.igu_blocks[sb_id];
@@ -2986,14 +3011,43 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn,
 					(p_igu_info->igu_sb_cnt)++;
 				}
 			}
+		} else {
+			if ((blk->function_id >= min_vf) &&
+			    (blk->function_id < max_vf)) {
+				/* Available for VFs of this PF */
+				if (p_igu_info->igu_base_sb_iov == 0xffff) {
+					p_igu_info->igu_base_sb_iov = sb_id;
+				} else if (last_iov_sb_id != sb_id - 1) {
+					if (!val) {
+						DP_VERBOSE(p_hwfn->cdev,
+							   NETIF_MSG_INTR,
+							   "First uninitialized IGU CAM entry at index 0x%04x\n",
+							   sb_id);
+					} else {
+						DP_NOTICE(p_hwfn->cdev,
+							  "Consecutive igu vectors for HWFN %x vfs is broken [jumps from %04x to %04x]\n",
+							  p_hwfn->rel_pf_id,
+							  last_iov_sb_id,
+							  sb_id); }
+					break;
+				}
+				blk->status |= QED_IGU_STATUS_FREE;
+				p_hwfn->hw_info.p_igu_info->free_blks++;
+				last_iov_sb_id = sb_id;
+			}
 		}
 	}
-
-	DP_VERBOSE(p_hwfn, NETIF_MSG_INTR,
-		   "IGU igu_base_sb=0x%x igu_sb_cnt=%d igu_dsb_id=0x%x\n",
-		   p_igu_info->igu_base_sb,
-		   p_igu_info->igu_sb_cnt,
-		   p_igu_info->igu_dsb_id);
+	p_igu_info->igu_sb_cnt_iov = p_igu_info->free_blks;
+
+	DP_VERBOSE(
+		p_hwfn,
+		NETIF_MSG_INTR,
+		"IGU igu_base_sb=0x%x [IOV 0x%x] igu_sb_cnt=%d [IOV 0x%x] igu_dsb_id=0x%x\n",
+		p_igu_info->igu_base_sb,
+		p_igu_info->igu_base_sb_iov,
+		p_igu_info->igu_sb_cnt,
+		p_igu_info->igu_sb_cnt_iov,
+		p_igu_info->igu_dsb_id);
 
 	if (p_igu_info->igu_base_sb == 0xffff ||
 	    p_igu_info->igu_dsb_id == 0xffff ||
@@ -3116,6 +3170,23 @@ void qed_int_get_num_sbs(struct qed_hwfn	*p_hwfn,
 	p_sb_cnt_info->sb_free_blk	= info->free_blks;
 }
 
+u16 qed_int_queue_id_from_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info;
+
+	/* Determine origin of SB id */
+	if ((sb_id >= p_info->igu_base_sb) &&
+	    (sb_id < p_info->igu_base_sb + p_info->igu_sb_cnt)) {
+		return sb_id - p_info->igu_base_sb;
+	} else if ((sb_id >= p_info->igu_base_sb_iov) &&
+		   (sb_id < p_info->igu_base_sb_iov + p_info->igu_sb_cnt_iov)) {
+		return sb_id - p_info->igu_base_sb_iov + p_info->igu_sb_cnt;
+	} else {
+		DP_NOTICE(p_hwfn, "SB %d not in range for function\n", sb_id);
+		return 0;
+	}
+}
+
 void qed_int_disable_post_isr_release(struct qed_dev *cdev)
 {
 	int i;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h
index c57f2e680770..295df4451e31 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.h
@@ -20,6 +20,12 @@
 #define IGU_PF_CONF_ATTN_BIT_EN   (0x1 << 3)    /* attention enable       */
 #define IGU_PF_CONF_SINGLE_ISR_EN (0x1 << 4)    /* single ISR mode enable */
 #define IGU_PF_CONF_SIMD_MODE     (0x1 << 5)    /* simd all ones mode     */
+/* Fields of IGU VF CONFIGRATION REGISTER */
+#define IGU_VF_CONF_FUNC_EN        (0x1 << 0)	/* function enable        */
+#define IGU_VF_CONF_MSI_MSIX_EN    (0x1 << 1)	/* MSI/MSIX enable        */
+#define IGU_VF_CONF_SINGLE_ISR_EN  (0x1 << 4)	/* single ISR mode enable */
+#define IGU_VF_CONF_PARENT_MASK    (0xF)	/* Parent PF              */
+#define IGU_VF_CONF_PARENT_SHIFT   5		/* Parent PF              */
 
 /* Igu control commands
  */
@@ -364,6 +370,16 @@ void qed_int_free(struct qed_hwfn *p_hwfn);
 void qed_int_setup(struct qed_hwfn *p_hwfn,
 		   struct qed_ptt *p_ptt);
 
+/**
+ * @brief - Returns an Rx queue index appropriate for usage with given SB.
+ *
+ * @param p_hwfn
+ * @param sb_id - absolute index of SB
+ *
+ * @return index of Rx queue
+ */
+u16 qed_int_queue_id_from_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+
 /**
  * @brief - Enable Interrupt & Attention for hw function
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 31e1d510a991..8bcbf92b776f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -34,6 +34,7 @@
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 struct qed_rss_params {
 	u8	update_rss_config;
@@ -1580,32 +1581,53 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 
 	info->num_tc = 1;
 
-	if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
-		for_each_hwfn(cdev, i)
-			info->num_queues += FEAT_NUM(&cdev->hwfns[i],
-						     QED_PF_L2_QUE);
-		if (cdev->int_params.fp_msix_cnt)
-			info->num_queues = min_t(u8, info->num_queues,
-						 cdev->int_params.fp_msix_cnt);
+	if (IS_PF(cdev)) {
+		if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
+			for_each_hwfn(cdev, i)
+			    info->num_queues +=
+			    FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE);
+			if (cdev->int_params.fp_msix_cnt)
+				info->num_queues =
+				    min_t(u8, info->num_queues,
+					  cdev->int_params.fp_msix_cnt);
+		} else {
+			info->num_queues = cdev->num_hwfns;
+		}
+
+		info->num_vlan_filters = RESC_NUM(&cdev->hwfns[0], QED_VLAN);
+		ether_addr_copy(info->port_mac,
+				cdev->hwfns[0].hw_info.hw_mac_addr);
 	} else {
-		info->num_queues = cdev->num_hwfns;
-	}
+		qed_vf_get_num_rxqs(QED_LEADING_HWFN(cdev), &info->num_queues);
+		if (cdev->num_hwfns > 1) {
+			u8 queues = 0;
 
-	info->num_vlan_filters = RESC_NUM(&cdev->hwfns[0], QED_VLAN);
-	ether_addr_copy(info->port_mac,
-			cdev->hwfns[0].hw_info.hw_mac_addr);
+			qed_vf_get_num_rxqs(&cdev->hwfns[1], &queues);
+			info->num_queues += queues;
+		}
+
+		qed_vf_get_num_vlan_filters(&cdev->hwfns[0],
+					    &info->num_vlan_filters);
+		qed_vf_get_port_mac(&cdev->hwfns[0], info->port_mac);
+	}
 
 	qed_fill_dev_info(cdev, &info->common);
 
+	if (IS_VF(cdev))
+		memset(info->common.hw_mac, 0, ETH_ALEN);
+
 	return 0;
 }
 
 static void qed_register_eth_ops(struct qed_dev *cdev,
-				 struct qed_eth_cb_ops *ops,
-				 void *cookie)
+				 struct qed_eth_cb_ops *ops, void *cookie)
 {
-	cdev->protocol_ops.eth	= ops;
-	cdev->ops_cookie	= cookie;
+	cdev->protocol_ops.eth = ops;
+	cdev->ops_cookie = cookie;
+
+	/* For VF, we start bulletin reading */
+	if (IS_VF(cdev))
+		qed_vf_start_iov_wq(cdev);
 }
 
 static int qed_start_vport(struct qed_dev *cdev,
@@ -1890,6 +1912,9 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 	struct qed_tunn_update_params tunn_info;
 	int i, rc;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	memset(&tunn_info, 0, sizeof(tunn_info));
 	if (tunn_params->update_vxlan_port == 1) {
 		tunn_info.update_vxlan_udp_port = 1;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c209ed49deae..898347bd2db7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -126,7 +126,7 @@ static int qed_init_pci(struct qed_dev *cdev,
 		goto err1;
 	}
 
-	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+	if (IS_PF(cdev) && !(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
 		DP_NOTICE(cdev, "No memory region found in bar #2\n");
 		rc = -EIO;
 		goto err1;
@@ -176,12 +176,14 @@ static int qed_init_pci(struct qed_dev *cdev,
 		goto err2;
 	}
 
-	cdev->db_phys_addr = pci_resource_start(cdev->pdev, 2);
-	cdev->db_size = pci_resource_len(cdev->pdev, 2);
-	cdev->doorbells = ioremap_wc(cdev->db_phys_addr, cdev->db_size);
-	if (!cdev->doorbells) {
-		DP_NOTICE(cdev, "Cannot map doorbell space\n");
-		return -ENOMEM;
+	if (IS_PF(cdev)) {
+			cdev->db_phys_addr = pci_resource_start(cdev->pdev, 2);
+		cdev->db_size = pci_resource_len(cdev->pdev, 2);
+		cdev->doorbells = ioremap_wc(cdev->db_phys_addr, cdev->db_size);
+		if (!cdev->doorbells) {
+			DP_NOTICE(cdev, "Cannot map doorbell space\n");
+			return -ENOMEM;
+		}
 	}
 
 	return 0;
@@ -208,20 +210,32 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]);
 	ether_addr_copy(dev_info->hw_mac, cdev->hwfns[0].hw_info.hw_mac_addr);
 
-	dev_info->fw_major = FW_MAJOR_VERSION;
-	dev_info->fw_minor = FW_MINOR_VERSION;
-	dev_info->fw_rev = FW_REVISION_VERSION;
-	dev_info->fw_eng = FW_ENGINEERING_VERSION;
-	dev_info->mf_mode = cdev->mf_mode;
+	if (IS_PF(cdev)) {
+		dev_info->fw_major = FW_MAJOR_VERSION;
+		dev_info->fw_minor = FW_MINOR_VERSION;
+		dev_info->fw_rev = FW_REVISION_VERSION;
+		dev_info->fw_eng = FW_ENGINEERING_VERSION;
+		dev_info->mf_mode = cdev->mf_mode;
+	} else {
+		qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major,
+				      &dev_info->fw_minor, &dev_info->fw_rev,
+				      &dev_info->fw_eng);
+	}
 
-	qed_mcp_get_mfw_ver(cdev, &dev_info->mfw_rev);
+	if (IS_PF(cdev)) {
+		ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev));
+		if (ptt) {
+			qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), ptt,
+					    &dev_info->mfw_rev, NULL);
 
-	ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev));
-	if (ptt) {
-		qed_mcp_get_flash_size(QED_LEADING_HWFN(cdev), ptt,
-				       &dev_info->flash_size);
+			qed_mcp_get_flash_size(QED_LEADING_HWFN(cdev), ptt,
+					       &dev_info->flash_size);
 
-		qed_ptt_release(QED_LEADING_HWFN(cdev), ptt);
+			qed_ptt_release(QED_LEADING_HWFN(cdev), ptt);
+		}
+	} else {
+		qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), NULL,
+				    &dev_info->mfw_rev, NULL);
 	}
 
 	return 0;
@@ -258,9 +272,7 @@ static int qed_set_power_state(struct qed_dev *cdev,
 
 /* probing */
 static struct qed_dev *qed_probe(struct pci_dev *pdev,
-				 enum qed_protocol protocol,
-				 u32 dp_module,
-				 u8 dp_level)
+				 struct qed_probe_params *params)
 {
 	struct qed_dev *cdev;
 	int rc;
@@ -269,9 +281,12 @@ static struct qed_dev *qed_probe(struct pci_dev *pdev,
 	if (!cdev)
 		goto err0;
 
-	cdev->protocol = protocol;
+	cdev->protocol = params->protocol;
 
-	qed_init_dp(cdev, dp_module, dp_level);
+	if (params->is_vf)
+		cdev->b_is_vf = true;
+
+	qed_init_dp(cdev, params->dp_module, params->dp_level);
 
 	rc = qed_init_pci(cdev, pdev);
 	if (rc) {
@@ -665,6 +680,35 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_slowpath_vf_setup_int(struct qed_dev *cdev)
+{
+	int rc;
+
+	memset(&cdev->int_params, 0, sizeof(struct qed_int_params));
+	cdev->int_params.in.int_mode = QED_INT_MODE_MSIX;
+
+	qed_vf_get_num_rxqs(QED_LEADING_HWFN(cdev),
+			    &cdev->int_params.in.num_vectors);
+	if (cdev->num_hwfns > 1) {
+		u8 vectors = 0;
+
+		qed_vf_get_num_rxqs(&cdev->hwfns[1], &vectors);
+		cdev->int_params.in.num_vectors += vectors;
+	}
+
+	/* We want a minimum of one fastpath vector per vf hwfn */
+	cdev->int_params.in.min_msix_cnt = cdev->num_hwfns;
+
+	rc = qed_set_int_mode(cdev, true);
+	if (rc)
+		return rc;
+
+	cdev->int_params.fp_msix_base = 0;
+	cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors;
+
+	return 0;
+}
+
 u32 qed_unzip_data(struct qed_hwfn *p_hwfn, u32 input_len,
 		   u8 *input_buf, u32 max_size, u8 *unzip_buf)
 {
@@ -755,32 +799,38 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	if (qed_iov_wq_start(cdev))
 		goto err;
 
-	rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
-			      &cdev->pdev->dev);
-	if (rc) {
-		DP_NOTICE(cdev,
-			  "Failed to find fw file - /lib/firmware/%s\n",
-			  QED_FW_FILE_NAME);
-		goto err;
+	if (IS_PF(cdev)) {
+		rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
+				      &cdev->pdev->dev);
+		if (rc) {
+			DP_NOTICE(cdev,
+				  "Failed to find fw file - /lib/firmware/%s\n",
+				  QED_FW_FILE_NAME);
+			goto err;
+		}
 	}
 
 	rc = qed_nic_setup(cdev);
 	if (rc)
 		goto err;
 
-	rc = qed_slowpath_setup_int(cdev, params->int_mode);
+	if (IS_PF(cdev))
+		rc = qed_slowpath_setup_int(cdev, params->int_mode);
+	else
+		rc = qed_slowpath_vf_setup_int(cdev);
 	if (rc)
 		goto err1;
 
-	/* Allocate stream for unzipping */
-	rc = qed_alloc_stream_mem(cdev);
-	if (rc) {
-		DP_NOTICE(cdev, "Failed to allocate stream memory\n");
-		goto err2;
-	}
+	if (IS_PF(cdev)) {
+		/* Allocate stream for unzipping */
+		rc = qed_alloc_stream_mem(cdev);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed to allocate stream memory\n");
+			goto err2;
+		}
 
-	/* Start the slowpath */
-	data = cdev->firmware->data;
+		data = cdev->firmware->data;
+	}
 
 	memset(&tunn_info, 0, sizeof(tunn_info));
 	tunn_info.tunn_mode |=  1 << QED_MODE_VXLAN_TUNN |
@@ -793,6 +843,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	tunn_info.tunn_clss_l2gre = QED_TUNN_CLSS_MAC_VLAN;
 	tunn_info.tunn_clss_ipgre = QED_TUNN_CLSS_MAC_VLAN;
 
+	/* Start the slowpath */
 	rc = qed_hw_init(cdev, &tunn_info, true,
 			 cdev->int_params.out.int_mode,
 			 true, data);
@@ -802,18 +853,20 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	DP_INFO(cdev,
 		"HW initialization and function start completed successfully\n");
 
-	hwfn = QED_LEADING_HWFN(cdev);
-	drv_version.version = (params->drv_major << 24) |
-			      (params->drv_minor << 16) |
-			      (params->drv_rev << 8) |
-			      (params->drv_eng);
-	strlcpy(drv_version.name, params->name,
-		MCP_DRV_VER_STR_SIZE - 4);
-	rc = qed_mcp_send_drv_version(hwfn, hwfn->p_main_ptt,
-				      &drv_version);
-	if (rc) {
-		DP_NOTICE(cdev, "Failed sending drv version command\n");
-		return rc;
+	if (IS_PF(cdev)) {
+		hwfn = QED_LEADING_HWFN(cdev);
+		drv_version.version = (params->drv_major << 24) |
+				      (params->drv_minor << 16) |
+				      (params->drv_rev << 8) |
+				      (params->drv_eng);
+		strlcpy(drv_version.name, params->name,
+			MCP_DRV_VER_STR_SIZE - 4);
+		rc = qed_mcp_send_drv_version(hwfn, hwfn->p_main_ptt,
+					      &drv_version);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed sending drv version command\n");
+			return rc;
+		}
 	}
 
 	qed_reset_vport_stats(cdev);
@@ -822,13 +875,15 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 
 err2:
 	qed_hw_timers_stop_all(cdev);
-	qed_slowpath_irq_free(cdev);
+	if (IS_PF(cdev))
+		qed_slowpath_irq_free(cdev);
 	qed_free_stream_mem(cdev);
 	qed_disable_msix(cdev);
 err1:
 	qed_resc_free(cdev);
 err:
-	release_firmware(cdev->firmware);
+	if (IS_PF(cdev))
+		release_firmware(cdev->firmware);
 
 	qed_iov_wq_stop(cdev, false);
 
@@ -840,17 +895,20 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	if (!cdev)
 		return -ENODEV;
 
-	qed_free_stream_mem(cdev);
+	if (IS_PF(cdev)) {
+		qed_free_stream_mem(cdev);
 
-	qed_nic_stop(cdev);
-	qed_slowpath_irq_free(cdev);
+		qed_nic_stop(cdev);
+		qed_slowpath_irq_free(cdev);
+	}
 
 	qed_disable_msix(cdev);
 	qed_nic_reset(cdev);
 
 	qed_iov_wq_stop(cdev, true);
 
-	release_firmware(cdev->firmware);
+	if (IS_PF(cdev))
+		release_firmware(cdev->firmware);
 
 	return 0;
 }
@@ -940,6 +998,9 @@ static int qed_set_link(struct qed_dev *cdev,
 	if (!cdev)
 		return -ENODEV;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	/* The link should be set only once per PF */
 	hwfn = &cdev->hwfns[0];
 
@@ -1051,10 +1112,16 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 	memset(if_link, 0, sizeof(*if_link));
 
 	/* Prepare source inputs */
-	memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
-	memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
-	memcpy(&link_caps, qed_mcp_get_link_capabilities(hwfn),
-	       sizeof(link_caps));
+	if (IS_PF(hwfn->cdev)) {
+		memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
+		memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
+		memcpy(&link_caps, qed_mcp_get_link_capabilities(hwfn),
+		       sizeof(link_caps));
+	} else {
+		memset(&params, 0, sizeof(params));
+		memset(&link, 0, sizeof(link));
+		memset(&link_caps, 0, sizeof(link_caps));
+	}
 
 	/* Set the link parameters to pass to protocol driver */
 	if (link.link_up)
@@ -1177,6 +1244,9 @@ static int qed_drain(struct qed_dev *cdev)
 	struct qed_ptt *ptt;
 	int i, rc;
 
+	if (IS_VF(cdev))
+		return 0;
+
 	for_each_hwfn(cdev, i) {
 		hwfn = &cdev->hwfns[i];
 		ptt = qed_ptt_acquire(hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 2f8309d772c8..83175007f616 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -19,6 +19,8 @@
 #include "qed_hw.h"
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
+#include "qed_sriov.h"
+
 #define CHIP_MCP_RESP_ITER_US 10
 
 #define QED_DRV_MB_MAX_RETRIES	(500 * 1000)	/* Account for 5 sec */
@@ -787,26 +789,42 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-int qed_mcp_get_mfw_ver(struct qed_dev *cdev,
-			u32 *p_mfw_ver)
+int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn,
+			struct qed_ptt *p_ptt,
+			u32 *p_mfw_ver, u32 *p_running_bundle_id)
 {
-	struct qed_hwfn *p_hwfn = &cdev->hwfns[0];
-	struct qed_ptt *p_ptt;
 	u32 global_offsize;
 
-	p_ptt = qed_ptt_acquire(p_hwfn);
-	if (!p_ptt)
-		return -EBUSY;
+	if (IS_VF(p_hwfn->cdev)) {
+		if (p_hwfn->vf_iov_info) {
+			struct pfvf_acquire_resp_tlv *p_resp;
+
+			p_resp = &p_hwfn->vf_iov_info->acquire_resp;
+			*p_mfw_ver = p_resp->pfdev_info.mfw_ver;
+			return 0;
+		} else {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF requested MFW version prior to ACQUIRE\n");
+			return -EINVAL;
+		}
+	}
 
 	global_offsize = qed_rd(p_hwfn, p_ptt,
-				SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->
-						     public_base,
+				SECTION_OFFSIZE_ADDR(p_hwfn->
+						     mcp_info->public_base,
 						     PUBLIC_GLOBAL));
-	*p_mfw_ver = qed_rd(p_hwfn, p_ptt,
-			    SECTION_ADDR(global_offsize, 0) +
-			    offsetof(struct public_global, mfw_ver));
-
-	qed_ptt_release(p_hwfn, p_ptt);
+	*p_mfw_ver =
+	    qed_rd(p_hwfn, p_ptt,
+		   SECTION_ADDR(global_offsize,
+				0) + offsetof(struct public_global, mfw_ver));
+
+	if (p_running_bundle_id != NULL) {
+		*p_running_bundle_id = qed_rd(p_hwfn, p_ptt,
+					      SECTION_ADDR(global_offsize, 0) +
+					      offsetof(struct public_global,
+						       running_bundle_id));
+	}
 
 	return 0;
 }
@@ -817,6 +835,9 @@ int qed_mcp_get_media_type(struct qed_dev *cdev,
 	struct qed_hwfn *p_hwfn = &cdev->hwfns[0];
 	struct qed_ptt  *p_ptt;
 
+	if (IS_VF(cdev))
+		return -EINVAL;
+
 	if (!qed_mcp_is_init(p_hwfn)) {
 		DP_NOTICE(p_hwfn, "MFW is not initialized !\n");
 		return -EBUSY;
@@ -951,6 +972,9 @@ int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn,
 {
 	u32 flash_size;
 
+	if (IS_VF(p_hwfn->cdev))
+		return -EINVAL;
+
 	flash_size = qed_rd(p_hwfn, p_ptt, MCP_REG_NVM_CFG4);
 	flash_size = (flash_size & MCP_REG_NVM_CFG4_FLASH_SIZE) >>
 		      MCP_REG_NVM_CFG4_FLASH_SIZE_SHIFT;
@@ -961,6 +985,37 @@ int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, u8 vf_id, u8 num)
+{
+	u32 resp = 0, param = 0, rc_param = 0;
+	int rc;
+
+	/* Only Leader can configure MSIX, and need to take CMT into account */
+	if (!IS_LEAD_HWFN(p_hwfn))
+		return 0;
+	num *= p_hwfn->cdev->num_hwfns;
+
+	param |= (vf_id << DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT) &
+		 DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK;
+	param |= (num << DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT) &
+		 DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_MASK;
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_CFG_VF_MSIX, param,
+			 &resp, &rc_param);
+
+	if (resp != FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE) {
+		DP_NOTICE(p_hwfn, "VF[%d]: MFW failed to set MSI-X\n", vf_id);
+		rc = -EINVAL;
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Requested 0x%02x MSI-x interrupts from VF 0x%02x\n",
+			   num, vf_id);
+	}
+
+	return rc;
+}
+
 int
 qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 5f218eed0541..e3d5cdfe8e8d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -149,13 +149,16 @@ int qed_mcp_set_link(struct qed_hwfn   *p_hwfn,
 /**
  * @brief Get the management firmware version value
  *
- * @param cdev       - qed dev pointer
- * @param mfw_ver    - mfw version value
+ * @param p_hwfn
+ * @param p_ptt
+ * @param p_mfw_ver    - mfw version value
+ * @param p_running_bundle_id	- image id in nvram; Optional.
  *
- * @return int - 0 - operation was successul.
+ * @return int - 0 - operation was successful.
  */
-int qed_mcp_get_mfw_ver(struct qed_dev *cdev,
-			u32 *mfw_ver);
+int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn,
+			struct qed_ptt *p_ptt,
+			u32 *p_mfw_ver, u32 *p_running_bundle_id);
 
 /**
  * @brief Get media type value of the port.
@@ -418,6 +421,20 @@ int qed_mcp_reset(struct qed_hwfn *p_hwfn,
  * @return true iff MFW is running and mcp_info is initialized
  */
 bool qed_mcp_is_init(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief request MFW to configure MSI-X for a VF
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vf_id - absolute inside engine
+ * @param num_sbs - number of entries to request
+ *
+ * @return int
+ */
+int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn,
+			   struct qed_ptt *p_ptt, u8 vf_id, u8 num);
+
 int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw);
 int qed_configure_pf_max_bandwidth(struct qed_dev *cdev, u8 max_bw);
 int __qed_configure_pf_max_bandwidth(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index bf4d7ccd56bb..a508b6b7f1d4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -39,6 +39,8 @@
 	0x2aae04UL
 #define  PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER	\
 	0x2aa16cUL
+#define PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR \
+	0x2aa118UL
 #define  BAR0_MAP_REG_MSDM_RAM \
 	0x1d00000UL
 #define  BAR0_MAP_REG_USDM_RAM \
@@ -111,6 +113,8 @@
 	0x009778UL
 #define  MISCS_REG_CHIP_METAL \
 	0x009774UL
+#define MISCS_REG_FUNCTION_HIDE \
+	0x0096f0UL
 #define  BRB_REG_HEADER_SIZE \
 	0x340804UL
 #define  BTB_REG_HEADER_SIZE \
@@ -119,6 +123,8 @@
 	0x1c0708UL
 #define  CCFC_REG_ACTIVITY_COUNTER \
 	0x2e8800UL
+#define CCFC_REG_STRONG_ENABLE_VF \
+	0x2e070cUL
 #define  CDU_REG_CID_ADDR_PARAMS	\
 	0x580900UL
 #define  DBG_REG_CLIENT_ENABLE \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index eec137f40895..dde69090379f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -62,6 +62,8 @@ union ramrod_data {
 	struct vport_stop_ramrod_data vport_stop;
 	struct vport_update_ramrod_data vport_update;
 	struct vport_filter_update_ramrod_data vport_filter_update;
+
+	struct vf_start_ramrod_data vf_start;
 };
 
 #define EQ_MAX_CREDIT   0xffffffff
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index e1e2344b1906..ed90947c451d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -20,6 +20,7 @@
 #include "qed_int.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_sriov.h"
 
 int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 			struct qed_spq_entry **pp_ent,
@@ -357,6 +358,13 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 				     &p_ramrod->tunnel_config);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
+	if (p_hwfn->cdev->p_iov_info) {
+		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
+
+		p_ramrod->base_vf_id = (u8) p_iov->first_vf_in_pf;
+		p_ramrod->num_vfs = (u8) p_iov->total_vfs;
+	}
+
 	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
 		   "Setting event_ring_sb [id %04x index %02x], outer_tag [%d]\n",
 		   sb, sb_index,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index 0e439e46fbe9..acac6626a1b2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -387,6 +387,9 @@ static int qed_cqe_completion(
 	struct eth_slow_path_rx_cqe *cqe,
 	enum protocol_type protocol)
 {
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	/* @@@tmp - it's possible we'll eventually want to handle some
 	 * actual commands that can arrive here, but for now this is only
 	 * used to complete the ramrod using the echo value on the cqe
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 4a6af4264141..699d96fb87f0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -6,12 +6,48 @@
  * this source tree.
  */
 
+#include "qed_cxt.h"
+#include "qed_hsi.h"
 #include "qed_hw.h"
+#include "qed_init_ops.h"
 #include "qed_int.h"
+#include "qed_mcp.h"
 #include "qed_reg_addr.h"
+#include "qed_sp.h"
 #include "qed_sriov.h"
 #include "qed_vf.h"
 
+/* IOV ramrods */
+static int qed_sp_vf_start(struct qed_hwfn *p_hwfn,
+			   u32 concrete_vfid, u16 opaque_vfid)
+{
+	struct vf_start_ramrod_data *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = opaque_vfid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_VF_START,
+				 PROTOCOLID_COMMON, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.vf_start;
+
+	p_ramrod->vf_id = GET_FIELD(concrete_vfid, PXP_CONCRETE_FID_VFID);
+	p_ramrod->opaque_fid = cpu_to_le16(opaque_vfid);
+
+	p_ramrod->personality = PERSONALITY_ETH;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 			   int rel_vf_id, bool b_enabled_only)
 {
@@ -321,6 +357,9 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 	int pos;
 	int rc;
 
+	if (IS_VF(p_hwfn->cdev))
+		return 0;
+
 	/* Learn the PCI configuration */
 	pos = pci_find_ext_capability(p_hwfn->cdev->pdev,
 				      PCI_EXT_CAP_ID_SRIOV);
@@ -376,12 +415,189 @@ static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
 		return false;
 
 	/* Check VF validity */
-	if (!qed_iov_is_valid_vfid(p_hwfn, vfid, true))
+	if (IS_VF(p_hwfn->cdev) || !IS_QED_SRIOV(p_hwfn->cdev) ||
+	    !IS_PF_SRIOV_ALLOC(p_hwfn))
 		return false;
 
 	return true;
 }
 
+static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
+				       struct qed_ptt *p_ptt, u8 abs_vfid)
+{
+	qed_wr(p_hwfn, p_ptt,
+	       PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR + (abs_vfid >> 5) * 4,
+	       1 << (abs_vfid & 0x1f));
+}
+
+static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt,
+				    struct qed_vf_info *vf)
+{
+	u32 igu_vf_conf = IGU_VF_CONF_FUNC_EN;
+	int rc;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "Enable internal access for vf %x [abs %x]\n",
+		   vf->abs_vf_id, QED_VF_ABS_ID(p_hwfn, vf));
+
+	qed_iov_vf_pglue_clear_err(p_hwfn, p_ptt, QED_VF_ABS_ID(p_hwfn, vf));
+
+	rc = qed_mcp_config_vf_msix(p_hwfn, p_ptt, vf->abs_vf_id, vf->num_sbs);
+	if (rc)
+		return rc;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid);
+
+	SET_FIELD(igu_vf_conf, IGU_VF_CONF_PARENT, p_hwfn->rel_pf_id);
+	STORE_RT_REG(p_hwfn, IGU_REG_VF_CONFIGURATION_RT_OFFSET, igu_vf_conf);
+
+	qed_init_run(p_hwfn, p_ptt, PHASE_VF, vf->abs_vf_id,
+		     p_hwfn->hw_info.hw_mode);
+
+	/* unpretend */
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+
+	if (vf->state != VF_STOPPED) {
+		DP_NOTICE(p_hwfn, "VF[%02x] is already started\n",
+			  vf->abs_vf_id);
+		return -EINVAL;
+	}
+
+	/* Start VF */
+	rc = qed_sp_vf_start(p_hwfn, vf->concrete_fid, vf->opaque_fid);
+	if (rc)
+		DP_NOTICE(p_hwfn, "Failed to start VF[%02x]\n", vf->abs_vf_id);
+
+	vf->state = VF_FREE;
+
+	return rc;
+}
+
+static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf, u16 num_rx_queues)
+{
+	struct qed_igu_block *igu_blocks;
+	int qid = 0, igu_id = 0;
+	u32 val = 0;
+
+	igu_blocks = p_hwfn->hw_info.p_igu_info->igu_map.igu_blocks;
+
+	if (num_rx_queues > p_hwfn->hw_info.p_igu_info->free_blks)
+		num_rx_queues = p_hwfn->hw_info.p_igu_info->free_blks;
+	p_hwfn->hw_info.p_igu_info->free_blks -= num_rx_queues;
+
+	SET_FIELD(val, IGU_MAPPING_LINE_FUNCTION_NUMBER, vf->abs_vf_id);
+	SET_FIELD(val, IGU_MAPPING_LINE_VALID, 1);
+	SET_FIELD(val, IGU_MAPPING_LINE_PF_VALID, 0);
+
+	while ((qid < num_rx_queues) &&
+	       (igu_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev))) {
+		if (igu_blocks[igu_id].status & QED_IGU_STATUS_FREE) {
+			struct cau_sb_entry sb_entry;
+
+			vf->igu_sbs[qid] = (u16)igu_id;
+			igu_blocks[igu_id].status &= ~QED_IGU_STATUS_FREE;
+
+			SET_FIELD(val, IGU_MAPPING_LINE_VECTOR_NUMBER, qid);
+
+			qed_wr(p_hwfn, p_ptt,
+			       IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_id,
+			       val);
+
+			/* Configure igu sb in CAU which were marked valid */
+			qed_init_cau_sb_entry(p_hwfn, &sb_entry,
+					      p_hwfn->rel_pf_id,
+					      vf->abs_vf_id, 1);
+			qed_dmae_host2grc(p_hwfn, p_ptt,
+					  (u64)(uintptr_t)&sb_entry,
+					  CAU_REG_SB_VAR_MEMORY +
+					  igu_id * sizeof(u64), 2, 0);
+			qid++;
+		}
+		igu_id++;
+	}
+
+	vf->num_sbs = (u8) num_rx_queues;
+
+	return vf->num_sbs;
+}
+
+static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  u16 rel_vf_id, u16 num_rx_queues)
+{
+	u8 num_of_vf_avaiable_chains = 0;
+	struct qed_vf_info *vf = NULL;
+	int rc = 0;
+	u32 cids;
+	u8 i;
+
+	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	if (!vf) {
+		DP_ERR(p_hwfn, "qed_iov_init_hw_for_vf : vf is NULL\n");
+		return -EINVAL;
+	}
+
+	if (vf->b_init) {
+		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n", rel_vf_id);
+		return -EINVAL;
+	}
+
+	/* Limit number of queues according to number of CIDs */
+	qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, &cids);
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF[%d] - requesting to initialize for 0x%04x queues [0x%04x CIDs available]\n",
+		   vf->relative_vf_id, num_rx_queues, (u16) cids);
+	num_rx_queues = min_t(u16, num_rx_queues, ((u16) cids));
+
+	num_of_vf_avaiable_chains = qed_iov_alloc_vf_igu_sbs(p_hwfn,
+							     p_ptt,
+							     vf,
+							     num_rx_queues);
+	if (!num_of_vf_avaiable_chains) {
+		DP_ERR(p_hwfn, "no available igu sbs\n");
+		return -ENOMEM;
+	}
+
+	/* Choose queue number and index ranges */
+	vf->num_rxqs = num_of_vf_avaiable_chains;
+	vf->num_txqs = num_of_vf_avaiable_chains;
+
+	for (i = 0; i < vf->num_rxqs; i++) {
+		u16 queue_id = qed_int_queue_id_from_sb_id(p_hwfn,
+							   vf->igu_sbs[i]);
+
+		if (queue_id > RESC_NUM(p_hwfn, QED_L2_QUEUE)) {
+			DP_NOTICE(p_hwfn,
+				  "VF[%d] will require utilizing of out-of-bounds queues - %04x\n",
+				  vf->relative_vf_id, queue_id);
+			return -EINVAL;
+		}
+
+		/* CIDs are per-VF, so no problem having them 0-based. */
+		vf->vf_queues[i].fw_rx_qid = queue_id;
+		vf->vf_queues[i].fw_tx_qid = queue_id;
+		vf->vf_queues[i].fw_cid = i;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF[%d] - [%d] SB %04x, Tx/Rx queue %04x CID %04x\n",
+			   vf->relative_vf_id, i, vf->igu_sbs[i], queue_id, i);
+	}
+	rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, vf);
+	if (!rc) {
+		vf->b_init = true;
+
+		if (IS_LEAD_HWFN(p_hwfn))
+			p_hwfn->cdev->p_iov_info->num_vfs++;
+	}
+
+	return rc;
+}
+
 static bool qed_iov_tlv_supported(u16 tlvtype)
 {
 	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
@@ -486,13 +702,147 @@ static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
 	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
 }
 
-static void qed_iov_process_mbx_dummy_resp(struct qed_hwfn *p_hwfn,
-					   struct qed_ptt *p_ptt,
-					   struct qed_vf_info *p_vf)
+static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf)
 {
-	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_NONE,
-			     sizeof(struct pfvf_def_resp_tlv),
-			     PFVF_STATUS_SUCCESS);
+	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
+	struct pfvf_acquire_resp_tlv *resp = &mbx->reply_virt->acquire_resp;
+	struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info;
+	struct vfpf_acquire_tlv *req = &mbx->req_virt->acquire;
+	u8 i, vfpf_status = PFVF_STATUS_SUCCESS;
+	struct pf_vf_resc *resc = &resp->resc;
+
+	/* Validate FW compatibility */
+	if (req->vfdev_info.fw_major != FW_MAJOR_VERSION ||
+	    req->vfdev_info.fw_minor != FW_MINOR_VERSION ||
+	    req->vfdev_info.fw_revision != FW_REVISION_VERSION ||
+	    req->vfdev_info.fw_engineering != FW_ENGINEERING_VERSION) {
+		DP_INFO(p_hwfn,
+			"VF[%d] is running an incompatible driver [VF needs FW %02x:%02x:%02x:%02x but Hypervisor is using %02x:%02x:%02x:%02x]\n",
+			vf->abs_vf_id,
+			req->vfdev_info.fw_major,
+			req->vfdev_info.fw_minor,
+			req->vfdev_info.fw_revision,
+			req->vfdev_info.fw_engineering,
+			FW_MAJOR_VERSION,
+			FW_MINOR_VERSION,
+			FW_REVISION_VERSION, FW_ENGINEERING_VERSION);
+		vfpf_status = PFVF_STATUS_NOT_SUPPORTED;
+		goto out;
+	}
+
+	/* On 100g PFs, prevent old VFs from loading */
+	if ((p_hwfn->cdev->num_hwfns > 1) &&
+	    !(req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_100G)) {
+		DP_INFO(p_hwfn,
+			"VF[%d] is running an old driver that doesn't support 100g\n",
+			vf->abs_vf_id);
+		vfpf_status = PFVF_STATUS_NOT_SUPPORTED;
+		goto out;
+	}
+
+	memset(resp, 0, sizeof(*resp));
+
+	/* Fill in vf info stuff */
+	vf->opaque_fid = req->vfdev_info.opaque_fid;
+	vf->num_mac_filters = 1;
+	vf->num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS;
+
+	vf->vf_bulletin = req->bulletin_addr;
+	vf->bulletin.size = (vf->bulletin.size < req->bulletin_size) ?
+			    vf->bulletin.size : req->bulletin_size;
+
+	/* fill in pfdev info */
+	pfdev_info->chip_num = p_hwfn->cdev->chip_num;
+	pfdev_info->db_size = 0;
+	pfdev_info->indices_per_sb = PIS_PER_SB;
+
+	pfdev_info->capabilities = PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED |
+				   PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE;
+	if (p_hwfn->cdev->num_hwfns > 1)
+		pfdev_info->capabilities |= PFVF_ACQUIRE_CAP_100G;
+
+	pfdev_info->stats_info.mstats.address =
+	    PXP_VF_BAR0_START_MSDM_ZONE_B +
+	    offsetof(struct mstorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.mstats.len =
+	    sizeof(struct eth_mstorm_per_queue_stat);
+
+	pfdev_info->stats_info.ustats.address =
+	    PXP_VF_BAR0_START_USDM_ZONE_B +
+	    offsetof(struct ustorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.ustats.len =
+	    sizeof(struct eth_ustorm_per_queue_stat);
+
+	pfdev_info->stats_info.pstats.address =
+	    PXP_VF_BAR0_START_PSDM_ZONE_B +
+	    offsetof(struct pstorm_vf_zone, non_trigger.eth_queue_stat);
+	pfdev_info->stats_info.pstats.len =
+	    sizeof(struct eth_pstorm_per_queue_stat);
+
+	pfdev_info->stats_info.tstats.address = 0;
+	pfdev_info->stats_info.tstats.len = 0;
+
+	memcpy(pfdev_info->port_mac, p_hwfn->hw_info.hw_mac_addr, ETH_ALEN);
+
+	pfdev_info->fw_major = FW_MAJOR_VERSION;
+	pfdev_info->fw_minor = FW_MINOR_VERSION;
+	pfdev_info->fw_rev = FW_REVISION_VERSION;
+	pfdev_info->fw_eng = FW_ENGINEERING_VERSION;
+	pfdev_info->os_type = VFPF_ACQUIRE_OS_LINUX;
+	qed_mcp_get_mfw_ver(p_hwfn, p_ptt, &pfdev_info->mfw_ver, NULL);
+
+	pfdev_info->dev_type = p_hwfn->cdev->type;
+	pfdev_info->chip_rev = p_hwfn->cdev->chip_rev;
+
+	resc->num_rxqs = vf->num_rxqs;
+	resc->num_txqs = vf->num_txqs;
+	resc->num_sbs = vf->num_sbs;
+	for (i = 0; i < resc->num_sbs; i++) {
+		resc->hw_sbs[i].hw_sb_id = vf->igu_sbs[i];
+		resc->hw_sbs[i].sb_qid = 0;
+	}
+
+	for (i = 0; i < resc->num_rxqs; i++) {
+		qed_fw_l2_queue(p_hwfn, vf->vf_queues[i].fw_rx_qid,
+				(u16 *)&resc->hw_qid[i]);
+		resc->cid[i] = vf->vf_queues[i].fw_cid;
+	}
+
+	resc->num_mac_filters = min_t(u8, vf->num_mac_filters,
+				      req->resc_request.num_mac_filters);
+	resc->num_vlan_filters = min_t(u8, vf->num_vlan_filters,
+				       req->resc_request.num_vlan_filters);
+
+	/* This isn't really required as VF isn't limited, but some VFs might
+	 * actually test this value, so need to provide it.
+	 */
+	resc->num_mc_filters = req->resc_request.num_mc_filters;
+
+	/* Fill agreed size of bulletin board in response */
+	resp->bulletin_size = vf->bulletin.size;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF[%d] ACQUIRE_RESPONSE: pfdev_info- chip_num=0x%x, db_size=%d, idx_per_sb=%d, pf_cap=0x%llx\n"
+		   "resources- n_rxq-%d, n_txq-%d, n_sbs-%d, n_macs-%d, n_vlans-%d\n",
+		   vf->abs_vf_id,
+		   resp->pfdev_info.chip_num,
+		   resp->pfdev_info.db_size,
+		   resp->pfdev_info.indices_per_sb,
+		   resp->pfdev_info.capabilities,
+		   resc->num_rxqs,
+		   resc->num_txqs,
+		   resc->num_sbs,
+		   resc->num_mac_filters,
+		   resc->num_vlan_filters);
+	vf->state = VF_ACQUIRED;
+
+	/* Prepare Response */
+out:
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_ACQUIRE,
+			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
@@ -517,7 +867,11 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 
 	/* check if tlv type is known */
 	if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
-		qed_iov_process_mbx_dummy_resp(p_hwfn, p_ptt, p_vf);
+		switch (mbx->first_tlv.tl.type) {
+		case CHANNEL_TLV_ACQUIRE:
+			qed_iov_vf_mbx_acquire(p_hwfn, p_ptt, p_vf);
+			break;
+		}
 	} else {
 		/* unknown TLV - this may belong to a VF driver from the future
 		 * - a version written after this PF driver was written, which
@@ -652,6 +1006,15 @@ void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag)
 	queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, 0);
 }
 
+void qed_vf_start_iov_wq(struct qed_dev *cdev)
+{
+	int i;
+
+	for_each_hwfn(cdev, i)
+	    queue_delayed_work(cdev->hwfns[i].iov_wq,
+			       &cdev->hwfns[i].iov_task, 0);
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 112216812a12..4f190d25ee14 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -21,6 +21,9 @@
 #endif
 #define IS_PF_SRIOV_ALLOC(p_hwfn)       (!!((p_hwfn)->pf_iov_info))
 
+#define QED_MAX_VF_CHAINS_PER_PF 16
+#define QED_ETH_VF_NUM_VLAN_FILTERS 2
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -60,7 +63,17 @@ struct qed_iov_vf_mbx {
 	struct vfpf_first_tlv first_tlv;
 };
 
+struct qed_vf_q_info {
+	u16 fw_rx_qid;
+	u16 fw_tx_qid;
+	u8 fw_cid;
+	u8 rxq_active;
+	u8 txq_active;
+};
+
 enum vf_state {
+	VF_FREE = 0,		/* VF ready to be acquired holds no resc */
+	VF_ACQUIRED,		/* VF, acquired, but not initalized */
 	VF_STOPPED		/* VF, Stopped */
 };
 
@@ -82,6 +95,17 @@ struct qed_vf_info {
 #define QED_VF_ABS_ID(p_hwfn, p_vf)	(QED_PATH_ID(p_hwfn) ?		      \
 					 (p_vf)->abs_vf_id + MAX_NUM_VFS_BB : \
 					 (p_vf)->abs_vf_id)
+
+	u8 num_rxqs;
+	u8 num_txqs;
+
+	u8 num_sbs;
+
+	u8 num_mac_filters;
+	u8 num_vlan_filters;
+	struct qed_vf_q_info vf_queues[QED_MAX_VF_CHAINS_PER_PF];
+	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
+
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
@@ -133,6 +157,26 @@ u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id);
  */
 int qed_iov_hw_info(struct qed_hwfn *p_hwfn);
 
+/**
+ * @brief qed_add_tlv - place a given tlv on the tlv buffer at next offset
+ *
+ * @param p_hwfn
+ * @param p_iov
+ * @param type
+ * @param length
+ *
+ * @return pointer to the newly placed tlv
+ */
+void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length);
+
+/**
+ * @brief list the types and lengths of the tlvs on the buffer
+ *
+ * @param p_hwfn
+ * @param tlvs_list
+ */
+void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list);
+
 /**
  * @brief qed_iov_alloc - allocate sriov related resources
  *
@@ -179,6 +223,7 @@ void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
 int qed_iov_wq_start(struct qed_dev *cdev);
 
 void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
+void qed_vf_start_iov_wq(struct qed_dev *cdev);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -228,6 +273,10 @@ static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
 				    enum qed_iov_wq_flag flag)
 {
 }
+
+static inline void qed_vf_start_iov_wq(struct qed_dev *cdev)
+{
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
new file mode 100644
index 000000000000..a3c8f4e1b9c1
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -0,0 +1,357 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include "qed.h"
+#include "qed_sriov.h"
+#include "qed_vf.h"
+
+static void *qed_vf_pf_prep(struct qed_hwfn *p_hwfn, u16 type, u16 length)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	void *p_tlv;
+
+	/* This lock is released when we receive PF's response
+	 * in qed_send_msg2pf().
+	 * So, qed_vf_pf_prep() and qed_send_msg2pf()
+	 * must come in sequence.
+	 */
+	mutex_lock(&(p_iov->mutex));
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "preparing to send 0x%04x tlv over vf pf channel\n",
+		   type);
+
+	/* Reset Requst offset */
+	p_iov->offset = (u8 *)p_iov->vf2pf_request;
+
+	/* Clear mailbox - both request and reply */
+	memset(p_iov->vf2pf_request, 0, sizeof(union vfpf_tlvs));
+	memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs));
+
+	/* Init type and length */
+	p_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, type, length);
+
+	/* Init first tlv header */
+	((struct vfpf_first_tlv *)p_tlv)->reply_address =
+	    (u64)p_iov->pf2vf_reply_phys;
+
+	return p_tlv;
+}
+
+static int qed_send_msg2pf(struct qed_hwfn *p_hwfn, u8 *done, u32 resp_size)
+{
+	union vfpf_tlvs *p_req = p_hwfn->vf_iov_info->vf2pf_request;
+	struct ustorm_trigger_vf_zone trigger;
+	struct ustorm_vf_zone *zone_data;
+	int rc = 0, time = 100;
+
+	zone_data = (struct ustorm_vf_zone *)PXP_VF_BAR0_START_USDM_ZONE_B;
+
+	/* output tlvs list */
+	qed_dp_tlv_list(p_hwfn, p_req);
+
+	/* need to add the END TLV to the message size */
+	resp_size += sizeof(struct channel_list_end_tlv);
+
+	/* Send TLVs over HW channel */
+	memset(&trigger, 0, sizeof(struct ustorm_trigger_vf_zone));
+	trigger.vf_pf_msg_valid = 1;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF -> PF [%02x] message: [%08x, %08x] --> %p, %08x --> %p\n",
+		   GET_FIELD(p_hwfn->hw_info.concrete_fid,
+			     PXP_CONCRETE_FID_PFID),
+		   upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys),
+		   lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys),
+		   &zone_data->non_trigger.vf_pf_msg_addr,
+		   *((u32 *)&trigger), &zone_data->trigger);
+
+	REG_WR(p_hwfn,
+	       (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.lo,
+	       lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys));
+
+	REG_WR(p_hwfn,
+	       (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.hi,
+	       upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys));
+
+	/* The message data must be written first, to prevent trigger before
+	 * data is written.
+	 */
+	wmb();
+
+	REG_WR(p_hwfn, (uintptr_t)&zone_data->trigger, *((u32 *)&trigger));
+
+	/* When PF would be done with the response, it would write back to the
+	 * `done' address. Poll until then.
+	 */
+	while ((!*done) && time) {
+		msleep(25);
+		time--;
+	}
+
+	if (!*done) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF <-- PF Timeout [Type %d]\n",
+			   p_req->first_tlv.tl.type);
+		rc = -EBUSY;
+		goto exit;
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "PF response: %d [Type %d]\n",
+			   *done, p_req->first_tlv.tl.type);
+	}
+
+exit:
+	mutex_unlock(&(p_hwfn->vf_iov_info->mutex));
+
+	return rc;
+}
+
+#define VF_ACQUIRE_THRESH 3
+#define VF_ACQUIRE_MAC_FILTERS 1
+
+static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_acquire_resp_tlv *resp = &p_iov->pf2vf_reply->acquire_resp;
+	struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info;
+	u8 rx_count = 1, tx_count = 1, num_sbs = 1;
+	u8 num_mac = VF_ACQUIRE_MAC_FILTERS;
+	bool resources_acquired = false;
+	struct vfpf_acquire_tlv *req;
+	int rc = 0, attempts = 0;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_ACQUIRE, sizeof(*req));
+
+	/* starting filling the request */
+	req->vfdev_info.opaque_fid = p_hwfn->hw_info.opaque_fid;
+
+	req->resc_request.num_rxqs = rx_count;
+	req->resc_request.num_txqs = tx_count;
+	req->resc_request.num_sbs = num_sbs;
+	req->resc_request.num_mac_filters = num_mac;
+	req->resc_request.num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS;
+
+	req->vfdev_info.os_type = VFPF_ACQUIRE_OS_LINUX;
+	req->vfdev_info.fw_major = FW_MAJOR_VERSION;
+	req->vfdev_info.fw_minor = FW_MINOR_VERSION;
+	req->vfdev_info.fw_revision = FW_REVISION_VERSION;
+	req->vfdev_info.fw_engineering = FW_ENGINEERING_VERSION;
+
+	/* Fill capability field with any non-deprecated config we support */
+	req->vfdev_info.capabilities |= VFPF_ACQUIRE_CAP_100G;
+
+	/* pf 2 vf bulletin board address */
+	req->bulletin_addr = p_iov->bulletin.phys;
+	req->bulletin_size = p_iov->bulletin.size;
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	while (!resources_acquired) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV, "attempting to acquire resources\n");
+
+		/* send acquire request */
+		rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+		if (rc)
+			return rc;
+
+		/* copy acquire response from buffer to p_hwfn */
+		memcpy(&p_iov->acquire_resp, resp, sizeof(p_iov->acquire_resp));
+
+		attempts++;
+
+		if (resp->hdr.status == PFVF_STATUS_SUCCESS) {
+			/* PF agrees to allocate our resources */
+			if (!(resp->pfdev_info.capabilities &
+			      PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE)) {
+				DP_INFO(p_hwfn,
+					"PF is using old incompatible driver; Either downgrade driver or request provider to update hypervisor version\n");
+				return -EINVAL;
+			}
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV, "resources acquired\n");
+			resources_acquired = true;
+		} else if (resp->hdr.status == PFVF_STATUS_NO_RESOURCE &&
+			   attempts < VF_ACQUIRE_THRESH) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "PF unwilling to fullfill resource request. Try PF recommended amount\n");
+
+			/* humble our request */
+			req->resc_request.num_txqs = resp->resc.num_txqs;
+			req->resc_request.num_rxqs = resp->resc.num_rxqs;
+			req->resc_request.num_sbs = resp->resc.num_sbs;
+			req->resc_request.num_mac_filters =
+			    resp->resc.num_mac_filters;
+			req->resc_request.num_vlan_filters =
+			    resp->resc.num_vlan_filters;
+
+			/* Clear response buffer */
+			memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs));
+		} else {
+			DP_ERR(p_hwfn,
+			       "PF returned error %d to VF acquisition request\n",
+			       resp->hdr.status);
+			return -EAGAIN;
+		}
+	}
+
+	/* Update bulletin board size with response from PF */
+	p_iov->bulletin.size = resp->bulletin_size;
+
+	/* get HW info */
+	p_hwfn->cdev->type = resp->pfdev_info.dev_type;
+	p_hwfn->cdev->chip_rev = resp->pfdev_info.chip_rev;
+
+	p_hwfn->cdev->chip_num = pfdev_info->chip_num & 0xffff;
+
+	/* Learn of the possibility of CMT */
+	if (IS_LEAD_HWFN(p_hwfn)) {
+		if (resp->pfdev_info.capabilities & PFVF_ACQUIRE_CAP_100G) {
+			DP_NOTICE(p_hwfn, "100g VF\n");
+			p_hwfn->cdev->num_hwfns = 2;
+		}
+	}
+
+	return 0;
+}
+
+int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov;
+	u32 reg;
+
+	/* Set number of hwfns - might be overriden once leading hwfn learns
+	 * actual configuration from PF.
+	 */
+	if (IS_LEAD_HWFN(p_hwfn))
+		p_hwfn->cdev->num_hwfns = 1;
+
+	/* Set the doorbell bar. Assumption: regview is set */
+	p_hwfn->doorbells = (u8 __iomem *)p_hwfn->regview +
+					  PXP_VF_BAR0_START_DQ;
+
+	reg = PXP_VF_BAR0_ME_OPAQUE_ADDRESS;
+	p_hwfn->hw_info.opaque_fid = (u16)REG_RD(p_hwfn, reg);
+
+	reg = PXP_VF_BAR0_ME_CONCRETE_ADDRESS;
+	p_hwfn->hw_info.concrete_fid = REG_RD(p_hwfn, reg);
+
+	/* Allocate vf sriov info */
+	p_iov = kzalloc(sizeof(*p_iov), GFP_KERNEL);
+	if (!p_iov) {
+		DP_NOTICE(p_hwfn, "Failed to allocate `struct qed_sriov'\n");
+		return -ENOMEM;
+	}
+
+	/* Allocate vf2pf msg */
+	p_iov->vf2pf_request = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						  sizeof(union vfpf_tlvs),
+						  &p_iov->vf2pf_request_phys,
+						  GFP_KERNEL);
+	if (!p_iov->vf2pf_request) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate `vf2pf_request' DMA memory\n");
+		goto free_p_iov;
+	}
+
+	p_iov->pf2vf_reply = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						sizeof(union pfvf_tlvs),
+						&p_iov->pf2vf_reply_phys,
+						GFP_KERNEL);
+	if (!p_iov->pf2vf_reply) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate `pf2vf_reply' DMA memory\n");
+		goto free_vf2pf_request;
+	}
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_IOV,
+		   "VF's Request mailbox [%p virt 0x%llx phys], Response mailbox [%p virt 0x%llx phys]\n",
+		   p_iov->vf2pf_request,
+		   (u64) p_iov->vf2pf_request_phys,
+		   p_iov->pf2vf_reply, (u64)p_iov->pf2vf_reply_phys);
+
+	/* Allocate Bulletin board */
+	p_iov->bulletin.size = sizeof(struct qed_bulletin_content);
+	p_iov->bulletin.p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+						    p_iov->bulletin.size,
+						    &p_iov->bulletin.phys,
+						    GFP_KERNEL);
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "VF's bulletin Board [%p virt 0x%llx phys 0x%08x bytes]\n",
+		   p_iov->bulletin.p_virt,
+		   (u64)p_iov->bulletin.phys, p_iov->bulletin.size);
+
+	mutex_init(&p_iov->mutex);
+
+	p_hwfn->vf_iov_info = p_iov;
+
+	p_hwfn->hw_info.personality = QED_PCI_ETH;
+
+	return qed_vf_pf_acquire(p_hwfn);
+
+free_vf2pf_request:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  sizeof(union vfpf_tlvs),
+			  p_iov->vf2pf_request, p_iov->vf2pf_request_phys);
+free_p_iov:
+	kfree(p_iov);
+
+	return -ENOMEM;
+}
+
+u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+
+	if (!p_iov) {
+		DP_NOTICE(p_hwfn, "vf_sriov_info isn't initialized\n");
+		return 0;
+	}
+
+	return p_iov->acquire_resp.resc.hw_sbs[sb_id].hw_sb_id;
+}
+
+void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
+{
+	*num_rxqs = p_hwfn->vf_iov_info->acquire_resp.resc.num_rxqs;
+}
+
+void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac)
+{
+	memcpy(port_mac,
+	       p_hwfn->vf_iov_info->acquire_resp.pfdev_info.port_mac, ETH_ALEN);
+}
+
+void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, u8 *num_vlan_filters)
+{
+	struct qed_vf_iov *p_vf;
+
+	p_vf = p_hwfn->vf_iov_info;
+	*num_vlan_filters = p_vf->acquire_resp.resc.num_vlan_filters;
+}
+
+void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+			   u16 *fw_major, u16 *fw_minor,
+			   u16 *fw_rev, u16 *fw_eng)
+{
+	struct pf_vf_pfdev_info *info;
+
+	info = &p_hwfn->vf_iov_info->acquire_resp.pfdev_info;
+
+	*fw_major = info->fw_major;
+	*fw_minor = info->fw_minor;
+	*fw_rev = info->fw_rev;
+	*fw_eng = info->fw_eng;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index f0d8de2be581..de9fe8501d21 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -9,6 +9,22 @@
 #ifndef _QED_VF_H
 #define _QED_VF_H
 
+struct vf_pf_resc_request {
+	u8 num_rxqs;
+	u8 num_txqs;
+	u8 num_sbs;
+	u8 num_mac_filters;
+	u8 num_vlan_filters;
+	u8 num_mc_filters;
+	u16 padding;
+};
+
+struct hw_sb_info {
+	u16 hw_sb_id;
+	u8 sb_qid;
+	u8 padding[5];
+};
+
 enum {
 	PFVF_STATUS_WAITING,
 	PFVF_STATUS_SUCCESS,
@@ -52,6 +68,107 @@ struct channel_list_end_tlv {
 	u8 padding[4];
 };
 
+#define VFPF_ACQUIRE_OS_LINUX (0)
+#define VFPF_ACQUIRE_OS_WINDOWS (1)
+#define VFPF_ACQUIRE_OS_ESX (2)
+#define VFPF_ACQUIRE_OS_SOLARIS (3)
+#define VFPF_ACQUIRE_OS_LINUX_USERSPACE (4)
+
+struct vfpf_acquire_tlv {
+	struct vfpf_first_tlv first_tlv;
+
+	struct vf_pf_vfdev_info {
+#define VFPF_ACQUIRE_CAP_OBSOLETE	(1 << 0)
+#define VFPF_ACQUIRE_CAP_100G		(1 << 1) /* VF can support 100g */
+		u64 capabilities;
+		u8 fw_major;
+		u8 fw_minor;
+		u8 fw_revision;
+		u8 fw_engineering;
+		u32 driver_version;
+		u16 opaque_fid;	/* ME register value */
+		u8 os_type;	/* VFPF_ACQUIRE_OS_* value */
+		u8 padding[5];
+	} vfdev_info;
+
+	struct vf_pf_resc_request resc_request;
+
+	u64 bulletin_addr;
+	u32 bulletin_size;
+	u32 padding;
+};
+
+struct pfvf_storm_stats {
+	u32 address;
+	u32 len;
+};
+
+struct pfvf_stats_info {
+	struct pfvf_storm_stats mstats;
+	struct pfvf_storm_stats pstats;
+	struct pfvf_storm_stats tstats;
+	struct pfvf_storm_stats ustats;
+};
+
+struct pfvf_acquire_resp_tlv {
+	struct pfvf_tlv hdr;
+
+	struct pf_vf_pfdev_info {
+		u32 chip_num;
+		u32 mfw_ver;
+
+		u16 fw_major;
+		u16 fw_minor;
+		u16 fw_rev;
+		u16 fw_eng;
+
+		u64 capabilities;
+#define PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED	BIT(0)
+#define PFVF_ACQUIRE_CAP_100G			BIT(1)	/* If set, 100g PF */
+/* There are old PF versions where the PF might mistakenly override the sanity
+ * mechanism [version-based] and allow a VF that can't be supported to pass
+ * the acquisition phase.
+ * To overcome this, PFs now indicate that they're past that point and the new
+ * VFs would fail probe on the older PFs that fail to do so.
+ */
+#define PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE	BIT(2)
+
+		u16 db_size;
+		u8 indices_per_sb;
+		u8 os_type;
+
+		/* These should match the PF's qed_dev values */
+		u16 chip_rev;
+		u8 dev_type;
+
+		u8 padding;
+
+		struct pfvf_stats_info stats_info;
+
+		u8 port_mac[ETH_ALEN];
+		u8 padding2[2];
+	} pfdev_info;
+
+	struct pf_vf_resc {
+#define PFVF_MAX_QUEUES_PER_VF		16
+#define PFVF_MAX_SBS_PER_VF		16
+		struct hw_sb_info hw_sbs[PFVF_MAX_SBS_PER_VF];
+		u8 hw_qid[PFVF_MAX_QUEUES_PER_VF];
+		u8 cid[PFVF_MAX_QUEUES_PER_VF];
+
+		u8 num_rxqs;
+		u8 num_txqs;
+		u8 num_sbs;
+		u8 num_mac_filters;
+		u8 num_vlan_filters;
+		u8 num_mc_filters;
+		u8 padding[2];
+	} resc;
+
+	u32 bulletin_size;
+	u32 padding;
+};
+
 #define TLV_BUFFER_SIZE                 1024
 struct tlv_buffer_size {
 	u8 tlv_buffer[TLV_BUFFER_SIZE];
@@ -59,12 +176,14 @@ struct tlv_buffer_size {
 
 union vfpf_tlvs {
 	struct vfpf_first_tlv first_tlv;
+	struct vfpf_acquire_tlv acquire;
 	struct channel_list_end_tlv list_end;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
 union pfvf_tlvs {
 	struct pfvf_def_resp_tlv default_resp;
+	struct pfvf_acquire_resp_tlv acquire_resp;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
@@ -86,8 +205,118 @@ struct qed_bulletin {
 
 enum {
 	CHANNEL_TLV_NONE,	/* ends tlv sequence */
+	CHANNEL_TLV_ACQUIRE,
 	CHANNEL_TLV_LIST_END,
 	CHANNEL_TLV_MAX
 };
 
+/* This data is held in the qed_hwfn structure for VFs only. */
+struct qed_vf_iov {
+	union vfpf_tlvs *vf2pf_request;
+	dma_addr_t vf2pf_request_phys;
+	union pfvf_tlvs *pf2vf_reply;
+	dma_addr_t pf2vf_reply_phys;
+
+	/* Should be taken whenever the mailbox buffers are accessed */
+	struct mutex mutex;
+	u8 *offset;
+
+	/* Bulletin Board */
+	struct qed_bulletin bulletin;
+	struct qed_bulletin_content bulletin_shadow;
+
+	/* we set aside a copy of the acquire response */
+	struct pfvf_acquire_resp_tlv acquire_resp;
+};
+
+#ifdef CONFIG_QED_SRIOV
+/**
+ * @brief Get number of Rx queues allocated for VF by qed
+ *
+ *  @param p_hwfn
+ *  @param num_rxqs - allocated RX queues
+ */
+void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs);
+
+/**
+ * @brief Get port mac address for VF
+ *
+ * @param p_hwfn
+ * @param port_mac - destination location for port mac
+ */
+void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac);
+
+/**
+ * @brief Get number of VLAN filters allocated for VF by qed
+ *
+ *  @param p_hwfn
+ *  @param num_rxqs - allocated VLAN filters
+ */
+void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
+				 u8 *num_vlan_filters);
+
+/**
+ * @brief Set firmware version information in dev_info from VFs acquire response tlv
+ *
+ * @param p_hwfn
+ * @param fw_major
+ * @param fw_minor
+ * @param fw_rev
+ * @param fw_eng
+ */
+void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+			   u16 *fw_major, u16 *fw_minor,
+			   u16 *fw_rev, u16 *fw_eng);
+
+/**
+ * @brief hw preparation for VF
+ *      sends ACQUIRE message
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief qed_vf_get_igu_sb_id - Get the IGU SB ID for a given
+ *        sb_id. For VFs igu sbs don't have to be contiguous
+ *
+ * @param p_hwfn
+ * @param sb_id
+ *
+ * @return INLINE u16
+ */
+u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+#else
+static inline void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
+{
+}
+
+static inline void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac)
+{
+}
+
+static inline void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
+					       u8 *num_vlan_filters)
+{
+}
+
+static inline void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
+					 u16 *fw_major, u16 *fw_minor,
+					 u16 *fw_rev, u16 *fw_eng)
+{
+}
+
+static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
+static inline u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 075faa52eb48..2d5f2735dc0a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2283,8 +2283,9 @@ enum qede_probe_mode {
 };
 
 static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level,
-			enum qede_probe_mode mode)
+			bool is_vf, enum qede_probe_mode mode)
 {
+	struct qed_probe_params probe_params;
 	struct qed_slowpath_params params;
 	struct qed_dev_eth_info dev_info;
 	struct qede_dev *edev;
@@ -2294,8 +2295,12 @@ static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level,
 	if (unlikely(dp_level & QED_LEVEL_INFO))
 		pr_notice("Starting qede probe\n");
 
-	cdev = qed_ops->common->probe(pdev, QED_PROTOCOL_ETH,
-				      dp_module, dp_level);
+	memset(&probe_params, 0, sizeof(probe_params));
+	probe_params.protocol = QED_PROTOCOL_ETH;
+	probe_params.dp_module = dp_module;
+	probe_params.dp_level = dp_level;
+	probe_params.is_vf = is_vf;
+	cdev = qed_ops->common->probe(pdev, &probe_params);
 	if (!cdev) {
 		rc = -ENODEV;
 		goto err0;
@@ -2365,7 +2370,7 @@ static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	qede_config_debug(debug, &dp_module, &dp_level);
 
-	return __qede_probe(pdev, dp_module, dp_level,
+	return __qede_probe(pdev, dp_module, dp_level, false,
 			    QEDE_PROBE_NORMAL);
 }
 
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 8914d271ba73..3f14c7efe68f 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -285,6 +285,63 @@
 #define PXP_ILT_PAGE_SIZE_NUM_BITS_MIN	12
 #define PXP_ILT_BLOCK_FACTOR_MULTIPLIER	1024
 
+#define PXP_VF_BAR0_START_IGU                   0
+#define PXP_VF_BAR0_IGU_LENGTH                  0x3000
+#define PXP_VF_BAR0_END_IGU                     (PXP_VF_BAR0_START_IGU + \
+						 PXP_VF_BAR0_IGU_LENGTH - 1)
+
+#define PXP_VF_BAR0_START_DQ                    0x3000
+#define PXP_VF_BAR0_DQ_LENGTH                   0x200
+#define PXP_VF_BAR0_DQ_OPAQUE_OFFSET            0
+#define PXP_VF_BAR0_ME_OPAQUE_ADDRESS           (PXP_VF_BAR0_START_DQ +	\
+						 PXP_VF_BAR0_DQ_OPAQUE_OFFSET)
+#define PXP_VF_BAR0_ME_CONCRETE_ADDRESS         (PXP_VF_BAR0_ME_OPAQUE_ADDRESS \
+						 + 4)
+#define PXP_VF_BAR0_END_DQ                      (PXP_VF_BAR0_START_DQ +	\
+						 PXP_VF_BAR0_DQ_LENGTH - 1)
+
+#define PXP_VF_BAR0_START_TSDM_ZONE_B           0x3200
+#define PXP_VF_BAR0_SDM_LENGTH_ZONE_B           0x200
+#define PXP_VF_BAR0_END_TSDM_ZONE_B             (PXP_VF_BAR0_START_TSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_MSDM_ZONE_B           0x3400
+#define PXP_VF_BAR0_END_MSDM_ZONE_B             (PXP_VF_BAR0_START_MSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_USDM_ZONE_B           0x3600
+#define PXP_VF_BAR0_END_USDM_ZONE_B             (PXP_VF_BAR0_START_USDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_XSDM_ZONE_B           0x3800
+#define PXP_VF_BAR0_END_XSDM_ZONE_B             (PXP_VF_BAR0_START_XSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_YSDM_ZONE_B           0x3a00
+#define PXP_VF_BAR0_END_YSDM_ZONE_B             (PXP_VF_BAR0_START_YSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_PSDM_ZONE_B           0x3c00
+#define PXP_VF_BAR0_END_PSDM_ZONE_B             (PXP_VF_BAR0_START_PSDM_ZONE_B \
+						 +			       \
+						 PXP_VF_BAR0_SDM_LENGTH_ZONE_B \
+						 - 1)
+
+#define PXP_VF_BAR0_START_SDM_ZONE_A            0x4000
+#define PXP_VF_BAR0_END_SDM_ZONE_A              0x10000
+
+#define PXP_VF_BAR0_GRC_WINDOW_LENGTH           32
+
 /* ILT Records */
 #define PXP_NUM_ILT_RECORDS_BB 7600
 #define PXP_NUM_ILT_RECORDS_K2 11000
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index d72c832a9397..76a6f168a190 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -140,6 +140,13 @@ struct qed_link_output {
 	u32	pause_config;
 };
 
+struct qed_probe_params {
+	enum qed_protocol protocol;
+	u32 dp_module;
+	u8 dp_level;
+	bool is_vf;
+};
+
 #define QED_DRV_VER_STR_SIZE 12
 struct qed_slowpath_params {
 	u32	int_mode;
@@ -207,8 +214,7 @@ struct qed_common_ops {
 	struct qed_selftest_ops *selftest;
 
 	struct qed_dev*	(*probe)(struct pci_dev *dev,
-				 enum qed_protocol protocol,
-				 u32 dp_module, u8 dp_level);
+				 struct qed_probe_params *params);
 
 	void		(*remove)(struct qed_dev *cdev);
 
-- 
cgit v1.2.3


From 0b55e27d563f493665693b494735574e68c3c5b9 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:15 +0300
Subject: qed: IOV configure and FLR

While previous patches have already added the necessary logic to probe
VFs as well as enabling them in the HW, this patch adds the ability to
support VF FLR & SRIOV disable.

It then wraps both flows together into the first IOV callback to be
provided to the protocol driver - `configure'. This would later to be used
to enable and disable SRIOV in the adapter.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c      |  17 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h  |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h      |  11 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c       |   7 +
 drivers/net/ethernet/qlogic/qed/qed_main.c     |   1 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.c      |  72 +++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h      |  12 +
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |  10 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h       |   1 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 660 +++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h    |  33 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c       |  97 ++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h       |  45 ++
 include/linux/qed/qed_eth_if.h                 |   4 +
 include/linux/qed/qed_iov_if.h                 |  20 +
 15 files changed, 983 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/qed/qed_iov_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 362e8db2b374..78e25cf6836f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -31,6 +31,7 @@
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 #include "qed_sriov.h"
+#include "qed_vf.h"
 
 /* API common to all protocols */
 enum BAR_ID {
@@ -420,8 +421,7 @@ void qed_resc_setup(struct qed_dev *cdev)
 #define FINAL_CLEANUP_POLL_CNT          (100)
 #define FINAL_CLEANUP_POLL_TIME         (10)
 int qed_final_cleanup(struct qed_hwfn *p_hwfn,
-		      struct qed_ptt *p_ptt,
-		      u16 id)
+		      struct qed_ptt *p_ptt, u16 id, bool is_vf)
 {
 	u32 command = 0, addr, count = FINAL_CLEANUP_POLL_CNT;
 	int rc = -EBUSY;
@@ -429,6 +429,9 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn,
 	addr = GTT_BAR0_MAP_REG_USDM_RAM +
 		USTORM_FLR_FINAL_ACK_OFFSET(p_hwfn->rel_pf_id);
 
+	if (is_vf)
+		id += 0x10;
+
 	command |= X_FINAL_CLEANUP_AGG_INT <<
 		SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT;
 	command |= 1 << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT;
@@ -663,7 +666,7 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);
 
 	/* Cleanup chip from previous driver if such remains exist */
-	rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id);
+	rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id, false);
 	if (rc != 0)
 		return rc;
 
@@ -880,7 +883,7 @@ int qed_hw_stop(struct qed_dev *cdev)
 		DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Stopping hw/fw\n");
 
 		if (IS_VF(cdev)) {
-			/* To be implemented in a later patch */
+			qed_vf_pf_int_cleanup(p_hwfn);
 			continue;
 		}
 
@@ -989,7 +992,9 @@ int qed_hw_reset(struct qed_dev *cdev)
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
 		if (IS_VF(cdev)) {
-			/* Will be implemented in a later patch */
+			rc = qed_vf_pf_reset(p_hwfn);
+			if (rc)
+				return rc;
 			continue;
 		}
 
@@ -1590,7 +1595,7 @@ void qed_hw_remove(struct qed_dev *cdev)
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
 		if (IS_VF(cdev)) {
-			/* Will be implemented in a later patch */
+			qed_vf_pf_release(p_hwfn);
 			continue;
 		}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index f567371fe304..dde364d6f502 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -303,11 +303,11 @@ int qed_fw_rss_eng(struct qed_hwfn *p_hwfn,
  * @param p_hwfn
  * @param p_ptt
  * @param id - For PF, engine-relative. For VF, PF-relative.
+ * @param is_vf - true iff cleanup is made for a VF.
  *
  * @return int
  */
 int qed_final_cleanup(struct qed_hwfn *p_hwfn,
-		      struct qed_ptt *p_ptt,
-		      u16 id);
+		      struct qed_ptt *p_ptt, u16 id, bool is_vf);
 
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index c511106870d0..82b7727d090b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -30,7 +30,7 @@ enum common_event_opcode {
 	COMMON_EVENT_PF_START,
 	COMMON_EVENT_PF_STOP,
 	COMMON_EVENT_VF_START,
-	COMMON_EVENT_RESERVED2,
+	COMMON_EVENT_VF_STOP,
 	COMMON_EVENT_VF_PF_CHANNEL,
 	COMMON_EVENT_RESERVED4,
 	COMMON_EVENT_RESERVED5,
@@ -45,7 +45,7 @@ enum common_ramrod_cmd_id {
 	COMMON_RAMROD_PF_START /* PF Function Start Ramrod */,
 	COMMON_RAMROD_PF_STOP /* PF Function Stop Ramrod */,
 	COMMON_RAMROD_VF_START,
-	COMMON_RAMROD_RESERVED2,
+	COMMON_RAMROD_VF_STOP,
 	COMMON_RAMROD_PF_UPDATE,
 	COMMON_RAMROD_EMPTY,
 	MAX_COMMON_RAMROD_CMD_ID
@@ -741,6 +741,13 @@ struct vf_start_ramrod_data {
 	u8 reserved[3];
 };
 
+struct vf_stop_ramrod_data {
+	u8 vf_id;
+	u8 reserved0;
+	__le16 reserved1;
+	__le32 reserved2;
+};
+
 struct atten_status_block {
 	__le32	atten_bits;
 	__le32	atten_ack;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 8bcbf92b776f..5978bb57f883 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2066,8 +2066,15 @@ static int qed_fp_cqe_completion(struct qed_dev *dev,
 				      cqe);
 }
 
+#ifdef CONFIG_QED_SRIOV
+extern const struct qed_iov_hv_ops qed_iov_ops_pass;
+#endif
+
 static const struct qed_eth_ops qed_eth_ops_pass = {
 	.common = &qed_common_ops_pass,
+#ifdef CONFIG_QED_SRIOV
+	.iov = &qed_iov_ops_pass,
+#endif
 	.fill_dev_info = &qed_fill_eth_dev_info,
 	.register_ops = &qed_register_eth_ops,
 	.vport_start = &qed_start_vport,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 898347bd2db7..e98610e5bf70 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -897,6 +897,7 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 
 	if (IS_PF(cdev)) {
 		qed_free_stream_mem(cdev);
+		qed_sriov_disable(cdev, true);
 
 		qed_nic_stop(cdev);
 		qed_slowpath_irq_free(cdev);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 83175007f616..2be943b91916 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -442,6 +442,75 @@ int qed_mcp_load_req(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static void qed_mcp_handle_vf_flr(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt)
+{
+	u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base,
+					PUBLIC_PATH);
+	u32 mfw_path_offsize = qed_rd(p_hwfn, p_ptt, addr);
+	u32 path_addr = SECTION_ADDR(mfw_path_offsize,
+				     QED_PATH_ID(p_hwfn));
+	u32 disabled_vfs[VF_MAX_STATIC / 32];
+	int i;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_SP,
+		   "Reading Disabled VF information from [offset %08x], path_addr %08x\n",
+		   mfw_path_offsize, path_addr);
+
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++) {
+		disabled_vfs[i] = qed_rd(p_hwfn, p_ptt,
+					 path_addr +
+					 offsetof(struct public_path,
+						  mcp_vf_disabled) +
+					 sizeof(u32) * i);
+		DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV),
+			   "FLR-ed VFs [%08x,...,%08x] - %08x\n",
+			   i * 32, (i + 1) * 32 - 1, disabled_vfs[i]);
+	}
+
+	if (qed_iov_mark_vf_flr(p_hwfn, disabled_vfs))
+		qed_schedule_iov(p_hwfn, QED_IOV_WQ_FLR_FLAG);
+}
+
+int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt, u32 *vfs_to_ack)
+{
+	u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base,
+					PUBLIC_FUNC);
+	u32 mfw_func_offsize = qed_rd(p_hwfn, p_ptt, addr);
+	u32 func_addr = SECTION_ADDR(mfw_func_offsize,
+				     MCP_PF_ID(p_hwfn));
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data union_data;
+	int rc;
+	int i;
+
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV),
+			   "Acking VFs [%08x,...,%08x] - %08x\n",
+			   i * 32, (i + 1) * 32 - 1, vfs_to_ack[i]);
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = DRV_MSG_CODE_VF_DISABLED_DONE;
+	memcpy(&union_data.ack_vf_disabled, vfs_to_ack, VF_MAX_STATIC / 8);
+	mb_params.p_data_src = &union_data;
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc) {
+		DP_NOTICE(p_hwfn, "Failed to pass ACK for VF flr to MFW\n");
+		return -EBUSY;
+	}
+
+	/* Clear the ACK bits */
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		qed_wr(p_hwfn, p_ptt,
+		       func_addr +
+		       offsetof(struct public_func, drv_ack_vf_disabled) +
+		       i * sizeof(u32), 0);
+
+	return rc;
+}
+
 static void qed_mcp_handle_transceiver_change(struct qed_hwfn *p_hwfn,
 					      struct qed_ptt *p_ptt)
 {
@@ -753,6 +822,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_LINK_CHANGE:
 			qed_mcp_handle_link_change(p_hwfn, p_ptt, false);
 			break;
+		case MFW_DRV_MSG_VF_DISABLED:
+			qed_mcp_handle_vf_flr(p_hwfn, p_ptt);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index e3d5cdfe8e8d..6dd59eb7f4c6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -392,6 +392,18 @@ int qed_mcp_load_req(struct qed_hwfn *p_hwfn,
 void qed_mcp_read_mb(struct qed_hwfn *p_hwfn,
 		     struct qed_ptt *p_ptt);
 
+/**
+ * @brief Ack to mfw that driver finished FLR process for VFs
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vfs_to_ack - bit mask of all engine VFs for which the PF acks.
+ *
+ * @param return int - 0 upon success.
+ */
+int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn,
+		       struct qed_ptt *p_ptt, u32 *vfs_to_ack);
+
 /**
  * @brief - calls during init to read shmem of all function-related info.
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index a508b6b7f1d4..80a621754f13 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -41,6 +41,8 @@
 	0x2aa16cUL
 #define PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR \
 	0x2aa118UL
+#define PSWHST_REG_ZONE_PERMISSION_TABLE \
+	0x2a0800UL
 #define  BAR0_MAP_REG_MSDM_RAM \
 	0x1d00000UL
 #define  BAR0_MAP_REG_USDM_RAM \
@@ -79,6 +81,8 @@
 	0x2f2eb0UL
 #define  DORQ_REG_PF_DB_ENABLE \
 	0x100508UL
+#define DORQ_REG_VF_USAGE_CNT \
+	0x1009c4UL
 #define  QM_REG_PF_EN \
 	0x2f2ea4UL
 #define  TCFC_REG_STRONG_ENABLE_PF \
@@ -167,6 +171,10 @@
 	0x040200UL
 #define  PBF_REG_INIT \
 	0xd80000UL
+#define PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 \
+	0xd806c8UL
+#define PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 \
+	0xd806ccUL
 #define  PTU_REG_ATC_INIT_ARRAY \
 	0x560000UL
 #define  PCM_REG_INIT \
@@ -391,6 +399,8 @@
 	0x1d0000UL
 #define  IGU_REG_PF_CONFIGURATION \
 	0x180800UL
+#define IGU_REG_VF_CONFIGURATION \
+	0x180804UL
 #define  MISC_REG_AEU_ENABLE1_IGU_OUT_0 \
 	0x00849cUL
 #define MISC_REG_AEU_AFTER_INVERT_1_IGU	\
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index dde69090379f..c2999cb5d1e2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -64,6 +64,7 @@ union ramrod_data {
 	struct vport_filter_update_ramrod_data vport_filter_update;
 
 	struct vf_start_ramrod_data vf_start;
+	struct vf_stop_ramrod_data vf_stop;
 };
 
 #define EQ_MAX_CREDIT   0xffffffff
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 699d96fb87f0..750166db57cd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -6,6 +6,7 @@
  * this source tree.
  */
 
+#include <linux/qed/qed_iov_if.h>
 #include "qed_cxt.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -48,6 +49,33 @@ static int qed_sp_vf_start(struct qed_hwfn *p_hwfn,
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+static int qed_sp_vf_stop(struct qed_hwfn *p_hwfn,
+			  u32 concrete_vfid, u16 opaque_vfid)
+{
+	struct vf_stop_ramrod_data *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = opaque_vfid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_VF_STOP,
+				 PROTOCOLID_COMMON, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.vf_stop;
+
+	p_ramrod->vf_id = GET_FIELD(concrete_vfid, PXP_CONCRETE_FID_VFID);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn,
 			   int rel_vf_id, bool b_enabled_only)
 {
@@ -422,6 +450,34 @@ static bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid)
 	return true;
 }
 
+static void qed_iov_set_vf_to_disable(struct qed_dev *cdev,
+				      u16 rel_vf_id, u8 to_disable)
+{
+	struct qed_vf_info *vf;
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+		if (!vf)
+			continue;
+
+		vf->to_disable = to_disable;
+	}
+}
+
+void qed_iov_set_vfs_to_disable(struct qed_dev *cdev, u8 to_disable)
+{
+	u16 i;
+
+	if (!IS_QED_SRIOV(cdev))
+		return;
+
+	for (i = 0; i < cdev->p_iov_info->total_vfs; i++)
+		qed_iov_set_vf_to_disable(cdev, i, to_disable);
+}
+
 static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt, u8 abs_vfid)
 {
@@ -430,6 +486,27 @@ static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn,
 	       1 << (abs_vfid & 0x1f));
 }
 
+static void qed_iov_vf_igu_set_int(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *vf, bool enable)
+{
+	u32 igu_vf_conf;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid);
+
+	igu_vf_conf = qed_rd(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION);
+
+	if (enable)
+		igu_vf_conf |= IGU_VF_CONF_MSI_MSIX_EN;
+	else
+		igu_vf_conf &= ~IGU_VF_CONF_MSI_MSIX_EN;
+
+	qed_wr(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION, igu_vf_conf);
+
+	/* unpretend */
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+}
+
 static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt,
 				    struct qed_vf_info *vf)
@@ -437,6 +514,9 @@ static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 	u32 igu_vf_conf = IGU_VF_CONF_FUNC_EN;
 	int rc;
 
+	if (vf->to_disable)
+		return 0;
+
 	DP_VERBOSE(p_hwfn,
 		   QED_MSG_IOV,
 		   "Enable internal access for vf %x [abs %x]\n",
@@ -475,6 +555,36 @@ static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+/**
+ * @brief qed_iov_config_perm_table - configure the permission
+ *      zone table.
+ *      In E4, queue zone permission table size is 320x9. There
+ *      are 320 VF queues for single engine device (256 for dual
+ *      engine device), and each entry has the following format:
+ *      {Valid, VF[7:0]}
+ * @param p_hwfn
+ * @param p_ptt
+ * @param vf
+ * @param enable
+ */
+static void qed_iov_config_perm_table(struct qed_hwfn *p_hwfn,
+				      struct qed_ptt *p_ptt,
+				      struct qed_vf_info *vf, u8 enable)
+{
+	u32 reg_addr, val;
+	u16 qzone_id = 0;
+	int qid;
+
+	for (qid = 0; qid < vf->num_rxqs; qid++) {
+		qed_fw_l2_queue(p_hwfn, vf->vf_queues[qid].fw_rx_qid,
+				&qzone_id);
+
+		reg_addr = PSWHST_REG_ZONE_PERMISSION_TABLE + qzone_id * 4;
+		val = enable ? (vf->abs_vf_id | (1 << 8)) : 0;
+		qed_wr(p_hwfn, p_ptt, reg_addr, val);
+	}
+}
+
 static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 				   struct qed_ptt *p_ptt,
 				   struct qed_vf_info *vf, u16 num_rx_queues)
@@ -525,6 +635,32 @@ static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 	return vf->num_sbs;
 }
 
+static void qed_iov_free_vf_igu_sbs(struct qed_hwfn *p_hwfn,
+				    struct qed_ptt *p_ptt,
+				    struct qed_vf_info *vf)
+{
+	struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info;
+	int idx, igu_id;
+	u32 addr, val;
+
+	/* Invalidate igu CAM lines and mark them as free */
+	for (idx = 0; idx < vf->num_sbs; idx++) {
+		igu_id = vf->igu_sbs[idx];
+		addr = IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_id;
+
+		val = qed_rd(p_hwfn, p_ptt, addr);
+		SET_FIELD(val, IGU_MAPPING_LINE_VALID, 0);
+		qed_wr(p_hwfn, p_ptt, addr, val);
+
+		p_info->igu_map.igu_blocks[igu_id].status |=
+		    QED_IGU_STATUS_FREE;
+
+		p_hwfn->hw_info.p_igu_info->free_blks++;
+	}
+
+	vf->num_sbs = 0;
+}
+
 static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 				  struct qed_ptt *p_ptt,
 				  u16 rel_vf_id, u16 num_rx_queues)
@@ -598,6 +734,54 @@ static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+static int qed_iov_release_hw_for_vf(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, u16 rel_vf_id)
+{
+	struct qed_vf_info *vf = NULL;
+	int rc = 0;
+
+	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!vf) {
+		DP_ERR(p_hwfn, "qed_iov_release_hw_for_vf : vf is NULL\n");
+		return -EINVAL;
+	}
+
+	if (vf->state != VF_STOPPED) {
+		/* Stopping the VF */
+		rc = qed_sp_vf_stop(p_hwfn, vf->concrete_fid, vf->opaque_fid);
+
+		if (rc != 0) {
+			DP_ERR(p_hwfn, "qed_sp_vf_stop returned error %d\n",
+			       rc);
+			return rc;
+		}
+
+		vf->state = VF_STOPPED;
+	}
+
+	/* disablng interrupts and resetting permission table was done during
+	 * vf-close, however, we could get here without going through vf_close
+	 */
+	/* Disable Interrupts for VF */
+	qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0);
+
+	/* Reset Permission table */
+	qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0);
+
+	vf->num_rxqs = 0;
+	vf->num_txqs = 0;
+	qed_iov_free_vf_igu_sbs(p_hwfn, p_ptt, vf);
+
+	if (vf->b_init) {
+		vf->b_init = false;
+
+		if (IS_LEAD_HWFN(p_hwfn))
+			p_hwfn->cdev->p_iov_info->num_vfs--;
+	}
+
+	return 0;
+}
+
 static bool qed_iov_tlv_supported(u16 tlvtype)
 {
 	return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX;
@@ -702,6 +886,51 @@ static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn,
 	qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status);
 }
 
+struct qed_public_vf_info *qed_iov_get_public_vf_info(struct qed_hwfn *p_hwfn,
+						      u16 relative_vf_id,
+						      bool b_enabled_only)
+{
+	struct qed_vf_info *vf = NULL;
+
+	vf = qed_iov_get_vf_info(p_hwfn, relative_vf_id, b_enabled_only);
+	if (!vf)
+		return NULL;
+
+	return &vf->p_vf_info;
+}
+
+void qed_iov_clean_vf(struct qed_hwfn *p_hwfn, u8 vfid)
+{
+	struct qed_public_vf_info *vf_info;
+
+	vf_info = qed_iov_get_public_vf_info(p_hwfn, vfid, false);
+
+	if (!vf_info)
+		return;
+
+	/* Clear the VF mac */
+	memset(vf_info->mac, 0, ETH_ALEN);
+}
+
+static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
+			       struct qed_vf_info *p_vf)
+{
+	u32 i;
+
+	p_vf->vf_bulletin = 0;
+	p_vf->num_mac_filters = 0;
+	p_vf->num_vlan_filters = 0;
+
+	/* If VF previously requested less resources, go back to default */
+	p_vf->num_rxqs = p_vf->num_sbs;
+	p_vf->num_txqs = p_vf->num_sbs;
+
+	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
+		p_vf->vf_queues[i].rxq_active = 0;
+
+	qed_iov_clean_vf(p_hwfn, p_vf->relative_vf_id);
+}
+
 static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn,
 				   struct qed_ptt *p_ptt,
 				   struct qed_vf_info *vf)
@@ -845,6 +1074,271 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static void qed_iov_vf_mbx_int_cleanup(struct qed_hwfn *p_hwfn,
+				       struct qed_ptt *p_ptt,
+				       struct qed_vf_info *vf)
+{
+	int i;
+
+	/* Reset the SBs */
+	for (i = 0; i < vf->num_sbs; i++)
+		qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt,
+						vf->igu_sbs[i],
+						vf->opaque_fid, false);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_INT_CLEANUP,
+			     sizeof(struct pfvf_def_resp_tlv),
+			     PFVF_STATUS_SUCCESS);
+}
+
+static void qed_iov_vf_mbx_close(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt, struct qed_vf_info *vf)
+{
+	u16 length = sizeof(struct pfvf_def_resp_tlv);
+	u8 status = PFVF_STATUS_SUCCESS;
+
+	/* Disable Interrupts for VF */
+	qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0);
+
+	/* Reset Permission table */
+	qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_CLOSE,
+			     length, status);
+}
+
+static void qed_iov_vf_mbx_release(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   struct qed_vf_info *p_vf)
+{
+	u16 length = sizeof(struct pfvf_def_resp_tlv);
+
+	qed_iov_vf_cleanup(p_hwfn, p_vf);
+
+	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_RELEASE,
+			     length, PFVF_STATUS_SUCCESS);
+}
+
+static int
+qed_iov_vf_flr_poll_dorq(struct qed_hwfn *p_hwfn,
+			 struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	int cnt;
+	u32 val;
+
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_vf->concrete_fid);
+
+	for (cnt = 0; cnt < 50; cnt++) {
+		val = qed_rd(p_hwfn, p_ptt, DORQ_REG_VF_USAGE_CNT);
+		if (!val)
+			break;
+		msleep(20);
+	}
+	qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid);
+
+	if (cnt == 50) {
+		DP_ERR(p_hwfn,
+		       "VF[%d] - dorq failed to cleanup [usage 0x%08x]\n",
+		       p_vf->abs_vf_id, val);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int
+qed_iov_vf_flr_poll_pbf(struct qed_hwfn *p_hwfn,
+			struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	u32 cons[MAX_NUM_VOQS], distance[MAX_NUM_VOQS];
+	int i, cnt;
+
+	/* Read initial consumers & producers */
+	for (i = 0; i < MAX_NUM_VOQS; i++) {
+		u32 prod;
+
+		cons[i] = qed_rd(p_hwfn, p_ptt,
+				 PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 +
+				 i * 0x40);
+		prod = qed_rd(p_hwfn, p_ptt,
+			      PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 +
+			      i * 0x40);
+		distance[i] = prod - cons[i];
+	}
+
+	/* Wait for consumers to pass the producers */
+	i = 0;
+	for (cnt = 0; cnt < 50; cnt++) {
+		for (; i < MAX_NUM_VOQS; i++) {
+			u32 tmp;
+
+			tmp = qed_rd(p_hwfn, p_ptt,
+				     PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 +
+				     i * 0x40);
+			if (distance[i] > tmp - cons[i])
+				break;
+		}
+
+		if (i == MAX_NUM_VOQS)
+			break;
+
+		msleep(20);
+	}
+
+	if (cnt == 50) {
+		DP_ERR(p_hwfn, "VF[%d] - pbf polling failed on VOQ %d\n",
+		       p_vf->abs_vf_id, i);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int qed_iov_vf_flr_poll(struct qed_hwfn *p_hwfn,
+			       struct qed_vf_info *p_vf, struct qed_ptt *p_ptt)
+{
+	int rc;
+
+	rc = qed_iov_vf_flr_poll_dorq(p_hwfn, p_vf, p_ptt);
+	if (rc)
+		return rc;
+
+	rc = qed_iov_vf_flr_poll_pbf(p_hwfn, p_vf, p_ptt);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int
+qed_iov_execute_vf_flr_cleanup(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt,
+			       u16 rel_vf_id, u32 *ack_vfs)
+{
+	struct qed_vf_info *p_vf;
+	int rc = 0;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	if (!p_vf)
+		return 0;
+
+	if (p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &
+	    (1ULL << (rel_vf_id % 64))) {
+		u16 vfid = p_vf->abs_vf_id;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "VF[%d] - Handling FLR\n", vfid);
+
+		qed_iov_vf_cleanup(p_hwfn, p_vf);
+
+		/* If VF isn't active, no need for anything but SW */
+		if (!p_vf->b_init)
+			goto cleanup;
+
+		rc = qed_iov_vf_flr_poll(p_hwfn, p_vf, p_ptt);
+		if (rc)
+			goto cleanup;
+
+		rc = qed_final_cleanup(p_hwfn, p_ptt, vfid, true);
+		if (rc) {
+			DP_ERR(p_hwfn, "Failed handle FLR of VF[%d]\n", vfid);
+			return rc;
+		}
+
+		/* VF_STOPPED has to be set only after final cleanup
+		 * but prior to re-enabling the VF.
+		 */
+		p_vf->state = VF_STOPPED;
+
+		rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, p_vf);
+		if (rc) {
+			DP_ERR(p_hwfn, "Failed to re-enable VF[%d] acces\n",
+			       vfid);
+			return rc;
+		}
+cleanup:
+		/* Mark VF for ack and clean pending state */
+		if (p_vf->state == VF_RESET)
+			p_vf->state = VF_STOPPED;
+		ack_vfs[vfid / 32] |= (1 << (vfid % 32));
+		p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &=
+		    ~(1ULL << (rel_vf_id % 64));
+		p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &=
+		    ~(1ULL << (rel_vf_id % 64));
+	}
+
+	return rc;
+}
+
+int qed_iov_vf_flr_cleanup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 ack_vfs[VF_MAX_STATIC / 32];
+	int rc = 0;
+	u16 i;
+
+	memset(ack_vfs, 0, sizeof(u32) * (VF_MAX_STATIC / 32));
+
+	/* Since BRB <-> PRS interface can't be tested as part of the flr
+	 * polling due to HW limitations, simply sleep a bit. And since
+	 * there's no need to wait per-vf, do it before looping.
+	 */
+	msleep(100);
+
+	for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++)
+		qed_iov_execute_vf_flr_cleanup(p_hwfn, p_ptt, i, ack_vfs);
+
+	rc = qed_mcp_ack_vf_flr(p_hwfn, p_ptt, ack_vfs);
+	return rc;
+}
+
+int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *p_disabled_vfs)
+{
+	u16 i, found = 0;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV, "Marking FLR-ed VFs\n");
+	for (i = 0; i < (VF_MAX_STATIC / 32); i++)
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "[%08x,...,%08x]: %08x\n",
+			   i * 32, (i + 1) * 32 - 1, p_disabled_vfs[i]);
+
+	if (!p_hwfn->cdev->p_iov_info) {
+		DP_NOTICE(p_hwfn, "VF flr but no IOV\n");
+		return 0;
+	}
+
+	/* Mark VFs */
+	for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++) {
+		struct qed_vf_info *p_vf;
+		u8 vfid;
+
+		p_vf = qed_iov_get_vf_info(p_hwfn, i, false);
+		if (!p_vf)
+			continue;
+
+		vfid = p_vf->abs_vf_id;
+		if ((1 << (vfid % 32)) & p_disabled_vfs[vfid / 32]) {
+			u64 *p_flr = p_hwfn->pf_iov_info->pending_flr;
+			u16 rel_vf_id = p_vf->relative_vf_id;
+
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+				   "VF[%d] [rel %d] got FLR-ed\n",
+				   vfid, rel_vf_id);
+
+			p_vf->state = VF_RESET;
+
+			/* No need to lock here, since pending_flr should
+			 * only change here and before ACKing MFw. Since
+			 * MFW will not trigger an additional attention for
+			 * VF flr until ACKs, we're safe.
+			 */
+			p_flr[rel_vf_id / 64] |= 1ULL << (rel_vf_id % 64);
+			found = 1;
+		}
+	}
+
+	return found;
+}
+
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt, int vfid)
 {
@@ -871,6 +1365,15 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 		case CHANNEL_TLV_ACQUIRE:
 			qed_iov_vf_mbx_acquire(p_hwfn, p_ptt, p_vf);
 			break;
+		case CHANNEL_TLV_CLOSE:
+			qed_iov_vf_mbx_close(p_hwfn, p_ptt, p_vf);
+			break;
+		case CHANNEL_TLV_INT_CLEANUP:
+			qed_iov_vf_mbx_int_cleanup(p_hwfn, p_ptt, p_vf);
+			break;
+		case CHANNEL_TLV_RELEASE:
+			qed_iov_vf_mbx_release(p_hwfn, p_ptt, p_vf);
+			break;
 		}
 	} else {
 		/* unknown TLV - this may belong to a VF driver from the future
@@ -992,6 +1495,17 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *p_vf_info;
+
+	p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf_info)
+		return true;
+
+	return p_vf_info->state == VF_STOPPED;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -1015,6 +1529,132 @@ void qed_vf_start_iov_wq(struct qed_dev *cdev)
 			       &cdev->hwfns[i].iov_task, 0);
 }
 
+int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
+{
+	int i, j;
+
+	for_each_hwfn(cdev, i)
+	    if (cdev->hwfns[i].iov_wq)
+		flush_workqueue(cdev->hwfns[i].iov_wq);
+
+	/* Mark VFs for disablement */
+	qed_iov_set_vfs_to_disable(cdev, true);
+
+	if (cdev->p_iov_info && cdev->p_iov_info->num_vfs && pci_enabled)
+		pci_disable_sriov(cdev->pdev);
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+
+		/* Failure to acquire the ptt in 100g creates an odd error
+		 * where the first engine has already relased IOV.
+		 */
+		if (!ptt) {
+			DP_ERR(hwfn, "Failed to acquire ptt\n");
+			return -EBUSY;
+		}
+
+		qed_for_each_vf(hwfn, j) {
+			int k;
+
+			if (!qed_iov_is_valid_vfid(hwfn, j, true))
+				continue;
+
+			/* Wait until VF is disabled before releasing */
+			for (k = 0; k < 100; k++) {
+				if (!qed_iov_is_vf_stopped(hwfn, j))
+					msleep(20);
+				else
+					break;
+			}
+
+			if (k < 100)
+				qed_iov_release_hw_for_vf(&cdev->hwfns[i],
+							  ptt, j);
+			else
+				DP_ERR(hwfn,
+				       "Timeout waiting for VF's FLR to end\n");
+		}
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
+	qed_iov_set_vfs_to_disable(cdev, false);
+
+	return 0;
+}
+
+static int qed_sriov_enable(struct qed_dev *cdev, int num)
+{
+	struct qed_sb_cnt_info sb_cnt_info;
+	int i, j, rc;
+
+	if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) {
+		DP_NOTICE(cdev, "Can start at most %d VFs\n",
+			  RESC_NUM(&cdev->hwfns[0], QED_VPORT) - 1);
+		return -EINVAL;
+	}
+
+	/* Initialize HW for VF access */
+	for_each_hwfn(cdev, j) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[j];
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+		int num_sbs = 0, limit = 16;
+
+		if (!ptt) {
+			DP_ERR(hwfn, "Failed to acquire ptt\n");
+			rc = -EBUSY;
+			goto err;
+		}
+
+		memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+		qed_int_get_num_sbs(hwfn, &sb_cnt_info);
+		num_sbs = min_t(int, sb_cnt_info.sb_free_blk, limit);
+
+		for (i = 0; i < num; i++) {
+			if (!qed_iov_is_valid_vfid(hwfn, i, false))
+				continue;
+
+			rc = qed_iov_init_hw_for_vf(hwfn,
+						    ptt, i, num_sbs / num);
+			if (rc) {
+				DP_ERR(cdev, "Failed to enable VF[%d]\n", i);
+				qed_ptt_release(hwfn, ptt);
+				goto err;
+			}
+		}
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
+	/* Enable SRIOV PCIe functions */
+	rc = pci_enable_sriov(cdev->pdev, num);
+	if (rc) {
+		DP_ERR(cdev, "Failed to enable sriov [%d]\n", rc);
+		goto err;
+	}
+
+	return num;
+
+err:
+	qed_sriov_disable(cdev, false);
+	return rc;
+}
+
+static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
+{
+	if (!IS_QED_SRIOV(cdev)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV, "SR-IOV is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (num_vfs_param)
+		return qed_sriov_enable(cdev, num_vfs_param);
+	else
+		return qed_sriov_disable(cdev, true);
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
@@ -1058,10 +1698,26 @@ void qed_iov_pf_task(struct work_struct *work)
 {
 	struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn,
 					     iov_task.work);
+	int rc;
 
 	if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags))
 		return;
 
+	if (test_and_clear_bit(QED_IOV_WQ_FLR_FLAG, &hwfn->iov_task_flags)) {
+		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+
+		if (!ptt) {
+			qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG);
+			return;
+		}
+
+		rc = qed_iov_vf_flr_cleanup(hwfn, ptt);
+		if (rc)
+			qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG);
+
+		qed_ptt_release(hwfn, ptt);
+	}
+
 	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
 		qed_handle_vf_msg(hwfn);
 }
@@ -1112,3 +1768,7 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 	return 0;
 }
+
+const struct qed_iov_hv_ops qed_iov_ops_pass = {
+	.configure = &qed_sriov_configure,
+};
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 4f190d25ee14..10794b08fd21 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -24,6 +24,13 @@
 #define QED_MAX_VF_CHAINS_PER_PF 16
 #define QED_ETH_VF_NUM_VLAN_FILTERS 2
 
+struct qed_public_vf_info {
+	/* These copies will later be reflected in the bulletin board,
+	 * but this copy should be newer.
+	 */
+	u8 mac[ETH_ALEN];
+};
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -74,6 +81,7 @@ struct qed_vf_q_info {
 enum vf_state {
 	VF_FREE = 0,		/* VF ready to be acquired holds no resc */
 	VF_ACQUIRED,		/* VF, acquired, but not initalized */
+	VF_RESET,		/* VF, FLR'd, pending cleanup */
 	VF_STOPPED		/* VF, Stopped */
 };
 
@@ -82,6 +90,7 @@ struct qed_vf_info {
 	struct qed_iov_vf_mbx vf_mbx;
 	enum vf_state state;
 	bool b_init;
+	u8 to_disable;
 
 	struct qed_bulletin bulletin;
 	dma_addr_t vf_bulletin;
@@ -105,7 +114,7 @@ struct qed_vf_info {
 	u8 num_vlan_filters;
 	struct qed_vf_q_info vf_queues[QED_MAX_VF_CHAINS_PER_PF];
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
-
+	struct qed_public_vf_info p_vf_info;
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
@@ -219,11 +228,22 @@ void qed_iov_free_hw_info(struct qed_dev *cdev);
 int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
 			u8 opcode, __le16 echo, union event_ring_data *data);
 
+/**
+ * @brief Mark structs of vfs that have been FLR-ed.
+ *
+ * @param p_hwfn
+ * @param disabled_vfs - bitmask of all VFs on path that were FLRed
+ *
+ * @return 1 iff one of the PF's vfs got FLRed. 0 otherwise.
+ */
+int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *disabled_vfs);
+
 void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first);
 int qed_iov_wq_start(struct qed_dev *cdev);
 
 void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag);
 void qed_vf_start_iov_wq(struct qed_dev *cdev);
+int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled);
 #else
 static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 					     u16 rel_vf_id)
@@ -260,6 +280,12 @@ static inline int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
 	return -EINVAL;
 }
 
+static inline int qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn,
+				      u32 *disabled_vfs)
+{
+	return 0;
+}
+
 static inline void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
 {
 }
@@ -277,6 +303,11 @@ static inline void qed_schedule_iov(struct qed_hwfn *hwfn,
 static inline void qed_vf_start_iov_wq(struct qed_dev *cdev)
 {
 }
+
+static inline int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
+{
+	return 0;
+}
 #endif
 
 #define qed_for_each_vf(_p_hwfn, _i)			  \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index a3c8f4e1b9c1..2460e39724f1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -311,6 +311,103 @@ free_p_iov:
 	return -ENOMEM;
 }
 
+int qed_vf_pf_reset(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp;
+	struct vfpf_first_tlv *req;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_CLOSE, sizeof(*req));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	resp = &p_iov->pf2vf_reply->default_resp;
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+	if (rc)
+		return rc;
+
+	if (resp->hdr.status != PFVF_STATUS_SUCCESS)
+		return -EAGAIN;
+
+	p_hwfn->b_int_enabled = 0;
+
+	return 0;
+}
+
+int qed_vf_pf_release(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp;
+	struct vfpf_first_tlv *req;
+	u32 size;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_RELEASE, sizeof(*req));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	resp = &p_iov->pf2vf_reply->default_resp;
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+
+	if (!rc && resp->hdr.status != PFVF_STATUS_SUCCESS)
+		rc = -EAGAIN;
+
+	p_hwfn->b_int_enabled = 0;
+
+	if (p_iov->vf2pf_request)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(union vfpf_tlvs),
+				  p_iov->vf2pf_request,
+				  p_iov->vf2pf_request_phys);
+	if (p_iov->pf2vf_reply)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(union pfvf_tlvs),
+				  p_iov->pf2vf_reply, p_iov->pf2vf_reply_phys);
+
+	if (p_iov->bulletin.p_virt) {
+		size = sizeof(struct qed_bulletin_content);
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  size,
+				  p_iov->bulletin.p_virt, p_iov->bulletin.phys);
+	}
+
+	kfree(p_hwfn->vf_iov_info);
+	p_hwfn->vf_iov_info = NULL;
+
+	return rc;
+}
+
+int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_def_resp_tlv *resp = &p_iov->pf2vf_reply->default_resp;
+	int rc;
+
+	/* clear mailbox and prep first tlv */
+	qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_INT_CLEANUP,
+		       sizeof(struct vfpf_first_tlv));
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
+
+	rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+	if (rc)
+		return rc;
+
+	if (resp->hdr.status != PFVF_STATUS_SUCCESS)
+		return -EINVAL;
+
+	return 0;
+}
+
 u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index de9fe8501d21..c872e5e2985e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -206,6 +206,9 @@ struct qed_bulletin {
 enum {
 	CHANNEL_TLV_NONE,	/* ends tlv sequence */
 	CHANNEL_TLV_ACQUIRE,
+	CHANNEL_TLV_INT_CLEANUP,
+	CHANNEL_TLV_CLOSE,
+	CHANNEL_TLV_RELEASE,
 	CHANNEL_TLV_LIST_END,
 	CHANNEL_TLV_MAX
 };
@@ -278,6 +281,24 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
  */
 int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
 
+/**
+ *
+ * @brief VF - send a close message to PF
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_reset(struct qed_hwfn *p_hwfn);
+
+/**
+ * @brief VF - free vf`s memories
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_release(struct qed_hwfn *p_hwfn);
 /**
  * @brief qed_vf_get_igu_sb_id - Get the IGU SB ID for a given
  *        sb_id. For VFs igu sbs don't have to be contiguous
@@ -288,6 +309,15 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
  * @return INLINE u16
  */
 u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id);
+
+/**
+ * @brief qed_vf_pf_int_cleanup - clean the SB of the VF
+ *
+ * @param p_hwfn
+ *
+ * @return enum _qed_status
+ */
+int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn);
 #else
 static inline void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs)
 {
@@ -313,10 +343,25 @@ static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 	return -EINVAL;
 }
 
+static inline int qed_vf_pf_reset(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
+static inline int qed_vf_pf_release(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
 static inline u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id)
 {
 	return 0;
 }
+
+static inline int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 3a4c806be156..acfafca43aa5 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -13,6 +13,7 @@
 #include <linux/if_link.h>
 #include <linux/qed/eth_common.h>
 #include <linux/qed/qed_if.h>
+#include <linux/qed/qed_iov_if.h>
 
 struct qed_dev_eth_info {
 	struct qed_dev_info common;
@@ -125,6 +126,9 @@ struct qed_eth_cb_ops {
 
 struct qed_eth_ops {
 	const struct qed_common_ops *common;
+#ifdef CONFIG_QED_SRIOV
+	const struct qed_iov_hv_ops *iov;
+#endif
 
 	int (*fill_dev_info)(struct qed_dev *cdev,
 			     struct qed_dev_eth_info *info);
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
new file mode 100644
index 000000000000..c53bfa6374c5
--- /dev/null
+++ b/include/linux/qed/qed_iov_if.h
@@ -0,0 +1,20 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_IOV_IF_H
+#define _QED_IOV_IF_H
+
+#include <linux/qed/qed_if.h>
+
+/* Structs used by PF to control and manipulate child VFs */
+struct qed_iov_hv_ops {
+	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
+
+};
+
+#endif
-- 
cgit v1.2.3


From 08feecd7fc709077ce92d21a979f522a5f57170a Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:20 +0300
Subject: qed*: Support PVID configuration

This adds support for PF control over the VF vlan configuration.
I.e., `ip link ... vf <x> vlan <vid>' should now be supported.

 1. <vid> != 0 => VF receives [unknowingly] only traffic tagged by
    <vid> and tags all outgoing traffic sent by VF with <vid>.
 2. <vid> == 0 ==> Remove the pvid configuration, reverting to previous.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |   9 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  14 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.h     |   6 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 364 ++++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  26 ++
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  18 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  19 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c |  18 ++
 include/linux/qed/qed_iov_if.h               |   1 +
 9 files changed, 468 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 9d01a16bfb1a..e75e73a77b27 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1104,9 +1104,16 @@ static void qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 	u8 num_funcs = p_hwfn->num_funcs_on_engine;
 	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	struct qed_sb_cnt_info sb_cnt_info;
-	int i;
+	int i, max_vf_vlan_filters;
 
 	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+
+#ifdef CONFIG_QED_SRIOV
+	max_vf_vlan_filters = QED_ETH_MAX_VF_NUM_VLAN_FILTERS;
+#else
+	max_vf_vlan_filters = 0;
+#endif
+
 	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
 
 	resc_num[QED_SB] = min_t(u32,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 80f0b853a142..7fb6b82f1a97 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -114,7 +114,8 @@ int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 					     p_params->mtu,
 					     p_params->remove_inner_vlan,
 					     p_params->tpa_mode,
-					     p_params->max_buffers_per_cqe);
+					     p_params->max_buffers_per_cqe,
+					     p_params->only_untagged);
 	}
 
 	return qed_sp_eth_vport_start(p_hwfn, p_params);
@@ -367,6 +368,16 @@ int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 	p_cmn->inner_vlan_removal_en = p_params->inner_vlan_removal_flg;
 	val = p_params->update_inner_vlan_removal_flg;
 	p_cmn->update_inner_vlan_removal_en_flg = val;
+
+	p_cmn->default_vlan_en = p_params->default_vlan_enable_flg;
+	val = p_params->update_default_vlan_enable_flg;
+	p_cmn->update_default_vlan_en_flg = val;
+
+	p_cmn->default_vlan = cpu_to_le16(p_params->default_vlan);
+	p_cmn->update_default_vlan_flg = p_params->update_default_vlan_flg;
+
+	p_cmn->silent_vlan_removal_en = p_params->silent_vlan_removal_flg;
+
 	p_ramrod->common.tx_switching_en = p_params->tx_switching_flg;
 	p_cmn->update_tx_switching_en_flg = p_params->update_tx_switching_flg;
 
@@ -1702,6 +1713,7 @@ static int qed_start_vport(struct qed_dev *cdev,
 		start.tpa_mode = params->gro_enable ? QED_TPA_MODE_GRO :
 							QED_TPA_MODE_NONE;
 		start.remove_inner_vlan = params->remove_inner_vlan;
+		start.only_untagged = true;	/* untagged only */
 		start.drop_ttl0 = params->drop_ttl0;
 		start.opaque_fid = p_hwfn->hw_info.opaque_fid;
 		start.concrete_fid = p_hwfn->hw_info.concrete_fid;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index f9e677a29751..fad30ae12f63 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -94,6 +94,7 @@ enum qed_tpa_mode {
 struct qed_sp_vport_start_params {
 	enum qed_tpa_mode tpa_mode;
 	bool remove_inner_vlan;
+	bool only_untagged;
 	bool drop_ttl0;
 	u8 max_buffers_per_cqe;
 	u32 concrete_fid;
@@ -140,6 +141,11 @@ struct qed_sp_vport_update_params {
 	u8				vport_active_tx_flg;
 	u8				update_inner_vlan_removal_flg;
 	u8				inner_vlan_removal_flg;
+	u8				silent_vlan_removal_flg;
+	u8				update_default_vlan_enable_flg;
+	u8				default_vlan_enable_flg;
+	u8				update_default_vlan_flg;
+	u16				default_vlan;
 	u8				update_tx_switching_flg;
 	u8				tx_switching_flg;
 	u8				update_approx_mcast_flg;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 29a53dd0d9fd..77d44baa5df3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1075,6 +1075,7 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 	p_vf->vport_instance = 0;
 	p_vf->num_mac_filters = 0;
 	p_vf->num_vlan_filters = 0;
+	p_vf->configured_features = 0;
 
 	/* If VF previously requested less resources, go back to default */
 	p_vf->num_rxqs = p_vf->num_sbs;
@@ -1085,6 +1086,7 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
 		p_vf->vf_queues[i].rxq_active = 0;
 
+	memset(&p_vf->shadow_config, 0, sizeof(p_vf->shadow_config));
 	qed_iov_clean_vf(p_hwfn, p_vf->relative_vf_id);
 }
 
@@ -1232,6 +1234,149 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static int qed_iov_reconfigure_unicast_vlan(struct qed_hwfn *p_hwfn,
+					    struct qed_vf_info *p_vf)
+{
+	struct qed_filter_ucast filter;
+	int rc = 0;
+	int i;
+
+	memset(&filter, 0, sizeof(filter));
+	filter.is_rx_filter = 1;
+	filter.is_tx_filter = 1;
+	filter.vport_to_add_to = p_vf->vport_id;
+	filter.opcode = QED_FILTER_ADD;
+
+	/* Reconfigure vlans */
+	for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) {
+		if (!p_vf->shadow_config.vlans[i].used)
+			continue;
+
+		filter.type = QED_FILTER_VLAN;
+		filter.vlan = p_vf->shadow_config.vlans[i].vid;
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV,
+			   "Reconfiguring VLAN [0x%04x] for VF [%04x]\n",
+			   filter.vlan, p_vf->relative_vf_id);
+		rc = qed_sp_eth_filter_ucast(p_hwfn,
+					     p_vf->opaque_fid,
+					     &filter,
+					     QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to configure VLAN [%04x] to VF [%04x]\n",
+				  filter.vlan, p_vf->relative_vf_id);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static int
+qed_iov_reconfigure_unicast_shadow(struct qed_hwfn *p_hwfn,
+				   struct qed_vf_info *p_vf, u64 events)
+{
+	int rc = 0;
+
+	if ((events & (1 << VLAN_ADDR_FORCED)) &&
+	    !(p_vf->configured_features & (1 << VLAN_ADDR_FORCED)))
+		rc = qed_iov_reconfigure_unicast_vlan(p_hwfn, p_vf);
+
+	return rc;
+}
+
+static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
+					  struct qed_vf_info *p_vf, u64 events)
+{
+	int rc = 0;
+	struct qed_filter_ucast filter;
+
+	if (!p_vf->vport_instance)
+		return -EINVAL;
+
+	if (events & (1 << VLAN_ADDR_FORCED)) {
+		struct qed_sp_vport_update_params vport_update;
+		u8 removal;
+		int i;
+
+		memset(&filter, 0, sizeof(filter));
+		filter.type = QED_FILTER_VLAN;
+		filter.is_rx_filter = 1;
+		filter.is_tx_filter = 1;
+		filter.vport_to_add_to = p_vf->vport_id;
+		filter.vlan = p_vf->bulletin.p_virt->pvid;
+		filter.opcode = filter.vlan ? QED_FILTER_REPLACE :
+					      QED_FILTER_FLUSH;
+
+		/* Send the ramrod */
+		rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid,
+					     &filter, QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure VLAN for VF\n");
+			return rc;
+		}
+
+		/* Update the default-vlan & silent vlan stripping */
+		memset(&vport_update, 0, sizeof(vport_update));
+		vport_update.opaque_fid = p_vf->opaque_fid;
+		vport_update.vport_id = p_vf->vport_id;
+		vport_update.update_default_vlan_enable_flg = 1;
+		vport_update.default_vlan_enable_flg = filter.vlan ? 1 : 0;
+		vport_update.update_default_vlan_flg = 1;
+		vport_update.default_vlan = filter.vlan;
+
+		vport_update.update_inner_vlan_removal_flg = 1;
+		removal = filter.vlan ? 1
+				      : p_vf->shadow_config.inner_vlan_removal;
+		vport_update.inner_vlan_removal_flg = removal;
+		vport_update.silent_vlan_removal_flg = filter.vlan ? 1 : 0;
+		rc = qed_sp_vport_update(p_hwfn,
+					 &vport_update,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure VF vport for vlan\n");
+			return rc;
+		}
+
+		/* Update all the Rx queues */
+		for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
+			u16 qid;
+
+			if (!p_vf->vf_queues[i].rxq_active)
+				continue;
+
+			qid = p_vf->vf_queues[i].fw_rx_qid;
+
+			rc = qed_sp_eth_rx_queues_update(p_hwfn, qid,
+							 1, 0, 1,
+							 QED_SPQ_MODE_EBLOCK,
+							 NULL);
+			if (rc) {
+				DP_NOTICE(p_hwfn,
+					  "Failed to send Rx update fo queue[0x%04x]\n",
+					  qid);
+				return rc;
+			}
+		}
+
+		if (filter.vlan)
+			p_vf->configured_features |= 1 << VLAN_ADDR_FORCED;
+		else
+			p_vf->configured_features &= ~(1 << VLAN_ADDR_FORCED);
+	}
+
+	/* If forced features are terminated, we need to configure the shadow
+	 * configuration back again.
+	 */
+	if (events)
+		qed_iov_reconfigure_unicast_shadow(p_hwfn, p_vf, events);
+
+	return rc;
+}
+
 static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt,
 				       struct qed_vf_info *vf)
@@ -1241,6 +1386,7 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 	struct vfpf_vport_start_tlv *start;
 	u8 status = PFVF_STATUS_SUCCESS;
 	struct qed_vf_info *vf_info;
+	u64 *p_bitmap;
 	int sb_id;
 	int rc;
 
@@ -1272,10 +1418,24 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 	qed_iov_enable_vf_traffic(p_hwfn, p_ptt, vf);
 
 	vf->mtu = start->mtu;
+	vf->shadow_config.inner_vlan_removal = start->inner_vlan_removal;
+
+	/* Take into consideration configuration forced by hypervisor;
+	 * If none is configured, use the supplied VF values [for old
+	 * vfs that would still be fine, since they passed '0' as padding].
+	 */
+	p_bitmap = &vf_info->bulletin.p_virt->valid_bitmap;
+	if (!(*p_bitmap & (1 << VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED))) {
+		u8 vf_req = start->only_untagged;
+
+		vf_info->bulletin.p_virt->default_only_untagged = vf_req;
+		*p_bitmap |= 1 << VFPF_BULLETIN_UNTAGGED_DEFAULT;
+	}
 
 	params.tpa_mode = start->tpa_mode;
 	params.remove_inner_vlan = start->inner_vlan_removal;
 
+	params.only_untagged = vf_info->bulletin.p_virt->default_only_untagged;
 	params.drop_ttl0 = false;
 	params.concrete_fid = vf->concrete_fid;
 	params.opaque_fid = vf->opaque_fid;
@@ -1290,6 +1450,9 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 	} else {
 		vf->vport_instance++;
+
+		/* Force configuration if needed on the newly opened vport */
+		qed_iov_configure_vport_forced(p_hwfn, vf, *p_bitmap);
 	}
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_START,
 			     sizeof(struct pfvf_def_resp_tlv), status);
@@ -1311,6 +1474,10 @@ static void qed_iov_vf_mbx_stop_vport(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 	}
 
+	/* Forget the configuration on the vport */
+	vf->configured_features = 0;
+	memset(&vf->shadow_config, 0, sizeof(vf->shadow_config));
+
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_TEARDOWN,
 			     sizeof(struct pfvf_def_resp_tlv), status);
 }
@@ -1634,8 +1801,13 @@ qed_iov_vp_update_vlan_param(struct qed_hwfn *p_hwfn,
 	if (!p_vlan_tlv)
 		return;
 
-	p_data->update_inner_vlan_removal_flg = 1;
-	p_data->inner_vlan_removal_flg = p_vlan_tlv->remove_vlan;
+	p_vf->shadow_config.inner_vlan_removal = p_vlan_tlv->remove_vlan;
+
+	/* Ignore the VF request if we're forcing a vlan */
+	if (!(p_vf->configured_features & (1 << VLAN_ADDR_FORCED))) {
+		p_data->update_inner_vlan_removal_flg = 1;
+		p_data->inner_vlan_removal_flg = p_vlan_tlv->remove_vlan;
+	}
 
 	*tlvs_mask |= 1 << QED_IOV_VP_UPDATE_VLAN_STRIP;
 }
@@ -1886,6 +2058,67 @@ out:
 	qed_iov_send_response(p_hwfn, p_ptt, vf, length, status);
 }
 
+static int qed_iov_vf_update_unicast_shadow(struct qed_hwfn *p_hwfn,
+					    struct qed_vf_info *p_vf,
+					    struct qed_filter_ucast *p_params)
+{
+	int i;
+
+	if (p_params->type == QED_FILTER_MAC)
+		return 0;
+
+	/* First remove entries and then add new ones */
+	if (p_params->opcode == QED_FILTER_REMOVE) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++)
+			if (p_vf->shadow_config.vlans[i].used &&
+			    p_vf->shadow_config.vlans[i].vid ==
+			    p_params->vlan) {
+				p_vf->shadow_config.vlans[i].used = false;
+				break;
+			}
+		if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF [%d] - Tries to remove a non-existing vlan\n",
+				   p_vf->relative_vf_id);
+			return -EINVAL;
+		}
+	} else if (p_params->opcode == QED_FILTER_REPLACE ||
+		   p_params->opcode == QED_FILTER_FLUSH) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++)
+			p_vf->shadow_config.vlans[i].used = false;
+	}
+
+	/* In forced mode, we're willing to remove entries - but we don't add
+	 * new ones.
+	 */
+	if (p_vf->bulletin.p_virt->valid_bitmap & (1 << VLAN_ADDR_FORCED))
+		return 0;
+
+	if (p_params->opcode == QED_FILTER_ADD ||
+	    p_params->opcode == QED_FILTER_REPLACE) {
+		for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) {
+			if (p_vf->shadow_config.vlans[i].used)
+				continue;
+
+			p_vf->shadow_config.vlans[i].used = true;
+			p_vf->shadow_config.vlans[i].vid = p_params->vlan;
+			break;
+		}
+
+		if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF [%d] - Tries to configure more than %d vlan filters\n",
+				   p_vf->relative_vf_id,
+				   QED_ETH_VF_NUM_VLAN_FILTERS + 1);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 int qed_iov_chk_ucast(struct qed_hwfn *hwfn,
 		      int vfid, struct qed_filter_ucast *params)
 {
@@ -1907,6 +2140,7 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 					struct qed_ptt *p_ptt,
 					struct qed_vf_info *vf)
 {
+	struct qed_bulletin_content *p_bulletin = vf->bulletin.p_virt;
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	struct vfpf_ucast_filter_tlv *req;
 	u8 status = PFVF_STATUS_SUCCESS;
@@ -1946,6 +2180,25 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 		goto out;
 	}
 
+	/* Update shadow copy of the VF configuration */
+	if (qed_iov_vf_update_unicast_shadow(p_hwfn, vf, &params)) {
+		status = PFVF_STATUS_FAILURE;
+		goto out;
+	}
+
+	/* Determine if the unicast filtering is acceptible by PF */
+	if ((p_bulletin->valid_bitmap & (1 << VLAN_ADDR_FORCED)) &&
+	    (params.type == QED_FILTER_VLAN ||
+	     params.type == QED_FILTER_MAC_VLAN)) {
+		/* Once VLAN is forced or PVID is set, do not allow
+		 * to add/replace any further VLANs.
+		 */
+		if (params.opcode == QED_FILTER_ADD ||
+		    params.opcode == QED_FILTER_REPLACE)
+			status = PFVF_STATUS_FORCED;
+		goto out;
+	}
+
 	rc = qed_iov_chk_ucast(p_hwfn, vf->relative_vf_id, &params);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
@@ -2449,6 +2702,29 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
+				      u16 pvid, int vfid)
+{
+	struct qed_vf_info *vf_info;
+	u64 feature;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set forced MAC, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	feature = 1 << VLAN_ADDR_FORCED;
+	vf_info->bulletin.p_virt->pvid = pvid;
+	if (pvid)
+		vf_info->bulletin.p_virt->valid_bitmap |= feature;
+	else
+		vf_info->bulletin.p_virt->valid_bitmap &= ~feature;
+
+	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
+}
+
 bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
@@ -2460,6 +2736,20 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
+{
+	struct qed_vf_info *p_vf;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!p_vf || !p_vf->bulletin.p_virt)
+		return 0;
+
+	if (!(p_vf->bulletin.p_virt->valid_bitmap & (1 << VLAN_ADDR_FORCED)))
+		return 0;
+
+	return p_vf->bulletin.p_virt->pvid;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -2609,6 +2899,38 @@ static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
 		return qed_sriov_disable(cdev, true);
 }
 
+static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
+{
+	int i;
+
+	if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set a VF MAC; Sriov is not enabled\n");
+		return -EINVAL;
+	}
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set VF[%d] MAC (VF is not active)\n", vfid);
+		return -EINVAL;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true);
+		if (!vf_info)
+			continue;
+
+		/* Set the forced vlan, and schedule the IOV task */
+		vf_info->forced_vlan = vid;
+		qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG);
+	}
+
+	return 0;
+}
+
 void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 {
 	struct qed_mcp_link_capabilities caps;
@@ -2671,6 +2993,38 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 	qed_ptt_release(hwfn, ptt);
 }
 
+static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
+{
+	int i;
+
+	qed_for_each_vf(hwfn, i) {
+		struct qed_public_vf_info *info;
+		bool update = false;
+
+		info = qed_iov_get_public_vf_info(hwfn, i, true);
+		if (!info)
+			continue;
+
+		/* Update data on bulletin board */
+
+		if (qed_iov_bulletin_get_forced_vlan(hwfn, i) ^
+		    info->forced_vlan) {
+			DP_VERBOSE(hwfn,
+				   QED_MSG_IOV,
+				   "Handling PF setting of pvid [0x%04x] to VF 0x%02x [Abs 0x%02x]\n",
+				   info->forced_vlan,
+				   i,
+				   hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+			qed_iov_bulletin_set_forced_vlan(hwfn,
+							 info->forced_vlan, i);
+			update = true;
+		}
+
+		if (update)
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+	}
+}
+
 static void qed_handle_bulletin_post(struct qed_hwfn *hwfn)
 {
 	struct qed_ptt *ptt;
@@ -2715,6 +3069,11 @@ void qed_iov_pf_task(struct work_struct *work)
 
 	if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags))
 		qed_handle_vf_msg(hwfn);
+
+	if (test_and_clear_bit(QED_IOV_WQ_SET_UNICAST_FILTER_FLAG,
+			       &hwfn->iov_task_flags))
+		qed_handle_pf_set_vf_unicast(hwfn);
+
 	if (test_and_clear_bit(QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
 			       &hwfn->iov_task_flags))
 		qed_handle_bulletin_post(hwfn);
@@ -2774,4 +3133,5 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
+	.set_vlan = &qed_sriov_pf_set_vlan,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 2c94b445d07f..e65f403349c2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -24,6 +24,9 @@
 #define QED_MAX_VF_CHAINS_PER_PF 16
 #define QED_ETH_VF_NUM_VLAN_FILTERS 2
 
+#define QED_ETH_MAX_VF_NUM_VLAN_FILTERS	\
+	(MAX_NUM_VFS * QED_ETH_VF_NUM_VLAN_FILTERS)
+
 enum qed_iov_vport_update_flag {
 	QED_IOV_VP_UPDATE_ACTIVATE,
 	QED_IOV_VP_UPDATE_VLAN_STRIP,
@@ -40,6 +43,7 @@ struct qed_public_vf_info {
 	/* These copies will later be reflected in the bulletin board,
 	 * but this copy should be newer.
 	 */
+	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
 };
 
@@ -98,6 +102,18 @@ enum vf_state {
 	VF_STOPPED		/* VF, Stopped */
 };
 
+struct qed_vf_vlan_shadow {
+	bool used;
+	u16 vid;
+};
+
+struct qed_vf_shadow_config {
+	/* Shadow copy of all guest vlans */
+	struct qed_vf_vlan_shadow vlans[QED_ETH_VF_NUM_VLAN_FILTERS + 1];
+
+	u8 inner_vlan_removal;
+};
+
 /* PFs maintain an array of this structure, per VF */
 struct qed_vf_info {
 	struct qed_iov_vf_mbx vf_mbx;
@@ -131,6 +147,16 @@ struct qed_vf_info {
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
 	u8 num_active_rxqs;
 	struct qed_public_vf_info p_vf_info;
+
+	/* Stores the configuration requested by VF */
+	struct qed_vf_shadow_config shadow_config;
+
+	/* A bitfield using bulletin's valid-map bits, used to indicate
+	 * which of the bulletin board features have been configured.
+	 */
+	u64 configured_features;
+#define QED_IOV_CONFIGURED_FEATURES_MASK        ((1 << MAC_ADDR_FORCED) | \
+						 (1 << VLAN_ADDR_FORCED))
 };
 
 /* This structure is part of qed_hwfn and used only for PFs that have sriov
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index e788954568d4..3c8911de3ed4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -474,7 +474,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 			  u16 mtu,
 			  u8 inner_vlan_removal,
 			  enum qed_tpa_mode tpa_mode,
-			  u8 max_buffers_per_cqe)
+			  u8 max_buffers_per_cqe, u8 only_untagged)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_vport_start_tlv *req;
@@ -489,6 +489,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 	req->inner_vlan_removal = inner_vlan_removal;
 	req->tpa_mode = tpa_mode;
 	req->max_buffers_per_cqe = max_buffers_per_cqe;
+	req->only_untagged = only_untagged;
 
 	/* status blocks */
 	for (i = 0; i < p_hwfn->vf_iov_info->acquire_resp.resc.num_sbs; i++)
@@ -547,6 +548,8 @@ qed_vf_handle_vp_update_is_needed(struct qed_hwfn *p_hwfn,
 		return !!p_data->update_tx_switching_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP:
 		return !!p_data->update_inner_vlan_removal_flg;
+	case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN:
+		return !!p_data->update_accept_any_vlan_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_MCAST:
 		return !!p_data->update_approx_mcast_flg;
 	case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM:
@@ -696,6 +699,19 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 		       sizeof(rss_params->rss_key));
 	}
 
+	if (p_params->update_accept_any_vlan_flg) {
+		struct vfpf_vport_update_accept_any_vlan_tlv *p_any_vlan_tlv;
+
+		size = sizeof(struct vfpf_vport_update_accept_any_vlan_tlv);
+		tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN;
+		p_any_vlan_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, tlv, size);
+
+		resp_size += sizeof(struct pfvf_def_resp_tlv);
+		p_any_vlan_tlv->accept_any_vlan = p_params->accept_any_vlan;
+		p_any_vlan_tlv->update_accept_any_vlan_flg =
+		    p_params->update_accept_any_vlan_flg;
+	}
+
 	/* add list termination tlv */
 	qed_add_tlv(p_hwfn, &p_iov->offset,
 		    CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv));
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index d9a8aa684ad7..35eced3691ba 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -417,6 +417,16 @@ union pfvf_tlvs {
 	struct pfvf_start_queue_resp_tlv queue_start;
 };
 
+enum qed_bulletin_bit {
+	/* Alert the VF that a forced VLAN was set by the PF */
+	VLAN_ADDR_FORCED = 2,
+
+	/* Indicate that `default_only_untagged' contains actual data */
+	VFPF_BULLETIN_UNTAGGED_DEFAULT = 3,
+	VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED = 4,
+
+};
+
 struct qed_bulletin_content {
 	/* crc of structure to ensure is not in mid-update */
 	u32 crc;
@@ -465,6 +475,10 @@ struct qed_bulletin_content {
 	u32 partner_adv_speed;
 
 	u32 capability_speed;
+
+	/* Forced vlan */
+	u16 pvid;
+	u16 padding5;
 };
 
 struct qed_bulletin {
@@ -737,7 +751,7 @@ int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 			  u16 mtu,
 			  u8 inner_vlan_removal,
 			  enum qed_tpa_mode tpa_mode,
-			  u8 max_buffers_per_cqe);
+			  u8 max_buffers_per_cqe, u8 only_untagged);
 
 /**
  * @brief qed_vf_pf_vport_stop - stop the VF's vport
@@ -898,7 +912,8 @@ static inline int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn,
 					u16 mtu,
 					u8 inner_vlan_removal,
 					enum qed_tpa_mode tpa_mode,
-					u8 max_buffers_per_cqe)
+					u8 max_buffers_per_cqe,
+					u8 only_untagged)
 {
 	return -EINVAL;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index bf54cfcd75c0..4d59d7e00e42 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -103,6 +103,21 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
 #ifdef CONFIG_QED_SRIOV
+static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	if (vlan > 4095) {
+		DP_NOTICE(edev, "Illegal vlan value %d\n", vlan);
+		return -EINVAL;
+	}
+
+	DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n",
+		   vlan, vf);
+
+	return edev->ops->iov->set_vlan(edev->cdev, vlan, vf);
+}
+
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
@@ -2071,6 +2086,9 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_set_mac_address = qede_set_mac_addr,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_change_mtu = qede_change_mtu,
+#ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_vlan = qede_set_vf_vlan,
+#endif
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
 	.ndo_get_stats64 = qede_get_stats64,
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index c53bfa6374c5..825c007d50f1 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -15,6 +15,7 @@
 struct qed_iov_hv_ops {
 	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
 
+	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 };
 
 #endif
-- 
cgit v1.2.3


From eff169608c250193e72089dc4ab15cb79e0bd68c Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:21 +0300
Subject: qed*: Support forced MAC

Allows the PF to enforce the VF's mac.
i.e., by using `ip link ... vf <x> mac <value>'.

While a MAC is forced, PF would prevent the VF from configuring any other
MAC.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |   9 ++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 120 +++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |   1 +
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  47 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  21 +++++
 drivers/net/ethernet/qlogic/qede/qede_main.c |  31 +++++++
 include/linux/qed/qed_eth_if.h               |   3 +
 include/linux/qed/qed_iov_if.h               |   2 +
 8 files changed, 234 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 7fb6b82f1a97..8d83250aa5ba 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1701,6 +1701,14 @@ static void qed_register_eth_ops(struct qed_dev *cdev,
 		qed_vf_start_iov_wq(cdev);
 }
 
+static bool qed_check_mac(struct qed_dev *cdev, u8 *mac)
+{
+	if (IS_PF(cdev))
+		return true;
+
+	return qed_vf_check_mac(&cdev->hwfns[0], mac);
+}
+
 static int qed_start_vport(struct qed_dev *cdev,
 			   struct qed_start_vport_params *params)
 {
@@ -2149,6 +2157,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 #endif
 	.fill_dev_info = &qed_fill_eth_dev_info,
 	.register_ops = &qed_register_eth_ops,
+	.check_mac = &qed_check_mac,
 	.vport_start = &qed_start_vport,
 	.vport_stop = &qed_stop_vport,
 	.vport_update = &qed_update_vport,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 77d44baa5df3..c1b79190ce4d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1295,6 +1295,29 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 	if (!p_vf->vport_instance)
 		return -EINVAL;
 
+	if (events & (1 << MAC_ADDR_FORCED)) {
+		/* Since there's no way [currently] of removing the MAC,
+		 * we can always assume this means we need to force it.
+		 */
+		memset(&filter, 0, sizeof(filter));
+		filter.type = QED_FILTER_MAC;
+		filter.opcode = QED_FILTER_REPLACE;
+		filter.is_rx_filter = 1;
+		filter.is_tx_filter = 1;
+		filter.vport_to_add_to = p_vf->vport_id;
+		ether_addr_copy(filter.mac, p_vf->bulletin.p_virt->mac);
+
+		rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid,
+					     &filter, QED_SPQ_MODE_CB, NULL);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "PF failed to configure MAC for VF\n");
+			return rc;
+		}
+
+		p_vf->configured_features |= 1 << MAC_ADDR_FORCED;
+	}
+
 	if (events & (1 << VLAN_ADDR_FORCED)) {
 		struct qed_sp_vport_update_params vport_update;
 		u8 removal;
@@ -2199,6 +2222,16 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn,
 		goto out;
 	}
 
+	if ((p_bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)) &&
+	    (params.type == QED_FILTER_MAC ||
+	     params.type == QED_FILTER_MAC_VLAN)) {
+		if (!ether_addr_equal(p_bulletin->mac, params.mac) ||
+		    (params.opcode != QED_FILTER_ADD &&
+		     params.opcode != QED_FILTER_REPLACE))
+			status = PFVF_STATUS_FORCED;
+		goto out;
+	}
+
 	rc = qed_iov_chk_ucast(p_hwfn, vf->relative_vf_id, &params);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
@@ -2702,6 +2735,30 @@ static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt,
 	return 0;
 }
 
+static void qed_iov_bulletin_set_forced_mac(struct qed_hwfn *p_hwfn,
+					    u8 *mac, int vfid)
+{
+	struct qed_vf_info *vf_info;
+	u64 feature;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set forced MAC, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	feature = 1 << MAC_ADDR_FORCED;
+	memcpy(vf_info->bulletin.p_virt->mac, mac, ETH_ALEN);
+
+	vf_info->bulletin.p_virt->valid_bitmap |= feature;
+	/* Forced MAC will disable MAC_ADDR */
+	vf_info->bulletin.p_virt->valid_bitmap &=
+				~(1 << VFPF_BULLETIN_MAC_ADDR);
+
+	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
+}
+
 void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 				      u16 pvid, int vfid)
 {
@@ -2736,6 +2793,21 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn,
+					   u16 rel_vf_id)
+{
+	struct qed_vf_info *p_vf;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!p_vf || !p_vf->bulletin.p_virt)
+		return NULL;
+
+	if (!(p_vf->bulletin.p_virt->valid_bitmap & (1 << MAC_ADDR_FORCED)))
+		return NULL;
+
+	return p_vf->bulletin.p_virt->mac;
+}
+
 u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 {
 	struct qed_vf_info *p_vf;
@@ -2899,6 +2971,38 @@ static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param)
 		return qed_sriov_disable(cdev, true);
 }
 
+static int qed_sriov_pf_set_mac(struct qed_dev *cdev, u8 *mac, int vfid)
+{
+	int i;
+
+	if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set a VF MAC; Sriov is not enabled\n");
+		return -EINVAL;
+	}
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "Cannot set VF[%d] MAC (VF is not active)\n", vfid);
+		return -EINVAL;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true);
+		if (!vf_info)
+			continue;
+
+		/* Set the forced MAC, and schedule the IOV task */
+		ether_addr_copy(vf_info->forced_mac, mac);
+		qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG);
+	}
+
+	return 0;
+}
+
 static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
 {
 	int i;
@@ -3000,12 +3104,27 @@ static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
 	qed_for_each_vf(hwfn, i) {
 		struct qed_public_vf_info *info;
 		bool update = false;
+		u8 *mac;
 
 		info = qed_iov_get_public_vf_info(hwfn, i, true);
 		if (!info)
 			continue;
 
 		/* Update data on bulletin board */
+		mac = qed_iov_bulletin_get_forced_mac(hwfn, i);
+		if (is_valid_ether_addr(info->forced_mac) &&
+		    (!mac || !ether_addr_equal(mac, info->forced_mac))) {
+			DP_VERBOSE(hwfn,
+				   QED_MSG_IOV,
+				   "Handling PF setting of VF MAC to VF 0x%02x [Abs 0x%02x]\n",
+				   i,
+				   hwfn->cdev->p_iov_info->first_vf_in_pf + i);
+
+			/* Update bulletin board with forced MAC */
+			qed_iov_bulletin_set_forced_mac(hwfn,
+							info->forced_mac, i);
+			update = true;
+		}
 
 		if (qed_iov_bulletin_get_forced_vlan(hwfn, i) ^
 		    info->forced_vlan) {
@@ -3133,5 +3252,6 @@ int qed_iov_wq_start(struct qed_dev *cdev)
 
 const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
+	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index e65f403349c2..e38ea985abe1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -43,6 +43,7 @@ struct qed_public_vf_info {
 	/* These copies will later be reflected in the bulletin board,
 	 * but this copy should be newer.
 	 */
+	u8 forced_mac[ETH_ALEN];
 	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 3c8911de3ed4..db14e230c9a4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/etherdevice.h>
 #include "qed.h"
 #include "qed_sriov.h"
 #include "qed_vf.h"
@@ -1004,6 +1005,43 @@ void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, u8 *num_vlan_filters)
 	*num_vlan_filters = p_vf->acquire_resp.resc.num_vlan_filters;
 }
 
+bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac)
+{
+	struct qed_bulletin_content *bulletin;
+
+	bulletin = &p_hwfn->vf_iov_info->bulletin_shadow;
+	if (!(bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)))
+		return true;
+
+	/* Forbid VF from changing a MAC enforced by PF */
+	if (ether_addr_equal(bulletin->mac, mac))
+		return false;
+
+	return false;
+}
+
+bool qed_vf_bulletin_get_forced_mac(struct qed_hwfn *hwfn,
+				    u8 *dst_mac, u8 *p_is_forced)
+{
+	struct qed_bulletin_content *bulletin;
+
+	bulletin = &hwfn->vf_iov_info->bulletin_shadow;
+
+	if (bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)) {
+		if (p_is_forced)
+			*p_is_forced = 1;
+	} else if (bulletin->valid_bitmap & (1 << VFPF_BULLETIN_MAC_ADDR)) {
+		if (p_is_forced)
+			*p_is_forced = 0;
+	} else {
+		return false;
+	}
+
+	ether_addr_copy(dst_mac, bulletin->mac);
+
+	return true;
+}
+
 void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 			   u16 *fw_major, u16 *fw_minor,
 			   u16 *fw_rev, u16 *fw_eng)
@@ -1020,6 +1058,15 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 
 static void qed_handle_bulletin_change(struct qed_hwfn *hwfn)
 {
+	struct qed_eth_cb_ops *ops = hwfn->cdev->protocol_ops.eth;
+	u8 mac[ETH_ALEN], is_mac_exist, is_mac_forced;
+	void *cookie = hwfn->cdev->ops_cookie;
+
+	is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac,
+						      &is_mac_forced);
+	if (is_mac_exist && is_mac_forced && cookie)
+		ops->force_mac(cookie, mac);
+
 	/* Always update link configuration according to bulletin */
 	qed_link_update(hwfn);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 35eced3691ba..b82fda964bbd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -418,6 +418,8 @@ union pfvf_tlvs {
 };
 
 enum qed_bulletin_bit {
+	/* Alert the VF that a forced MAC was set by the PF */
+	MAC_ADDR_FORCED = 0,
 	/* Alert the VF that a forced VLAN was set by the PF */
 	VLAN_ADDR_FORCED = 2,
 
@@ -425,6 +427,10 @@ enum qed_bulletin_bit {
 	VFPF_BULLETIN_UNTAGGED_DEFAULT = 3,
 	VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED = 4,
 
+	/* Alert the VF that suggested mac was sent by the PF.
+	 * MAC_ADDR will be disabled in case MAC_ADDR_FORCED is set.
+	 */
+	VFPF_BULLETIN_MAC_ADDR = 5
 };
 
 struct qed_bulletin_content {
@@ -601,6 +607,16 @@ void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac);
 void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
 				 u8 *num_vlan_filters);
 
+/**
+ * @brief Check if VF can set a MAC address
+ *
+ * @param p_hwfn
+ * @param mac
+ *
+ * @return bool
+ */
+bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac);
+
 /**
  * @brief Set firmware version information in dev_info from VFs acquire response tlv
  *
@@ -841,6 +857,11 @@ static inline void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn,
 {
 }
 
+static inline bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac)
+{
+	return false;
+}
+
 static inline void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 					 u16 *fw_major, u16 *fw_minor,
 					 u16 *fw_rev, u16 *fw_eng)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 4d59d7e00e42..b326b15d5196 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -118,6 +118,22 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
 	return edev->ops->iov->set_vlan(edev->cdev, vlan, vf);
 }
 
+static int qede_set_vf_mac(struct net_device *ndev, int vfidx, u8 *mac)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	DP_VERBOSE(edev, QED_MSG_IOV,
+		   "Setting MAC %02x:%02x:%02x:%02x:%02x:%02x to VF [%d]\n",
+		   mac[0], mac[1], mac[2], mac[3], mac[4], mac[5], vfidx);
+
+	if (!is_valid_ether_addr(mac)) {
+		DP_VERBOSE(edev, QED_MSG_IOV, "MAC address isn't valid\n");
+		return -EINVAL;
+	}
+
+	return edev->ops->iov->set_mac(edev->cdev, mac, vfidx);
+}
+
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
@@ -138,10 +154,19 @@ static struct pci_driver qede_pci_driver = {
 #endif
 };
 
+static void qede_force_mac(void *dev, u8 *mac)
+{
+	struct qede_dev *edev = dev;
+
+	ether_addr_copy(edev->ndev->dev_addr, mac);
+	ether_addr_copy(edev->primary_mac, mac);
+}
+
 static struct qed_eth_cb_ops qede_ll_ops = {
 	{
 		.link_update = qede_link_update,
 	},
+	.force_mac = qede_force_mac,
 };
 
 static int qede_netdev_event(struct notifier_block *this, unsigned long event,
@@ -2087,6 +2112,7 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_change_mtu = qede_change_mtu,
 #ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_mac = qede_set_vf_mac,
 	.ndo_set_vf_vlan = qede_set_vf_vlan,
 #endif
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
@@ -3512,6 +3538,11 @@ static int qede_set_mac_addr(struct net_device *ndev, void *p)
 		return -EFAULT;
 	}
 
+	if (!edev->ops->check_mac(edev->cdev, addr->sa_data)) {
+		DP_NOTICE(edev, "qed prevents setting MAC\n");
+		return -EINVAL;
+	}
+
 	ether_addr_copy(ndev->dev_addr, addr->sa_data);
 
 	if (!netif_running(ndev))  {
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index acfafca43aa5..e0f6e6482031 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -122,6 +122,7 @@ struct qed_tunn_params {
 
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
+	void (*force_mac) (void *dev, u8 *mac);
 };
 
 struct qed_eth_ops {
@@ -137,6 +138,8 @@ struct qed_eth_ops {
 			     struct qed_eth_cb_ops *ops,
 			     void *cookie);
 
+	 bool(*check_mac) (struct qed_dev *cdev, u8 *mac);
+
 	int (*vport_start)(struct qed_dev *cdev,
 			   struct qed_start_vport_params *params);
 
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 825c007d50f1..7a67fbf4336a 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -15,6 +15,8 @@
 struct qed_iov_hv_ops {
 	int (*configure)(struct qed_dev *cdev, int num_vfs_param);
 
+	int (*set_mac) (struct qed_dev *cdev, u8 *mac, int vfid);
+
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 };
 
-- 
cgit v1.2.3


From 733def6a04bf3d2810dd675e1240f8df94d633c3 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:22 +0300
Subject: qed*: IOV link control

This adds support in 2 ndo that allow PF to tweak the VF's view of the
link - `ndo_set_vf_link_state' to allow it a view independent of the PF's,
and `ndo_set_vf_rate' which would allow the PF to limit the VF speed.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |   2 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |  76 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 164 +++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |   6 +
 drivers/net/ethernet/qlogic/qede/qede_main.c |  26 +++++
 include/linux/qed/qed_iov_if.h               |   6 +
 6 files changed, 280 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index d7da64556e4b..77323fc70927 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -554,8 +554,10 @@ static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev,
 
 #define PURE_LB_TC 8
 
+int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate);
 void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate);
 
+void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 #define QED_LEADING_HWFN(dev)   (&dev->hwfns[0])
 
 /* Other Linux specific common definitions */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index e75e73a77b27..acaa2866dae3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1889,6 +1889,32 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+static int __qed_configure_vport_wfq(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, u16 vp_id, u32 rate)
+{
+	struct qed_mcp_link_state *p_link;
+	int rc = 0;
+
+	p_link = &p_hwfn->cdev->hwfns[0].mcp_info->link_output;
+
+	if (!p_link->min_pf_rate) {
+		p_hwfn->qm_info.wfq_data[vp_id].min_speed = rate;
+		p_hwfn->qm_info.wfq_data[vp_id].configured = true;
+		return rc;
+	}
+
+	rc = qed_init_wfq_param(p_hwfn, vp_id, rate, p_link->min_pf_rate);
+
+	if (rc == 0)
+		qed_configure_wfq_for_all_vports(p_hwfn, p_ptt,
+						 p_link->min_pf_rate);
+	else
+		DP_NOTICE(p_hwfn,
+			  "Validation failed while configuring min rate\n");
+
+	return rc;
+}
+
 static int __qed_configure_vp_wfq_on_link_change(struct qed_hwfn *p_hwfn,
 						 struct qed_ptt *p_ptt,
 						 u32 min_pf_rate)
@@ -1923,6 +1949,42 @@ static int __qed_configure_vp_wfq_on_link_change(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+/* Main API for qed clients to configure vport min rate.
+ * vp_id - vport id in PF Range[0 - (total_num_vports_per_pf - 1)]
+ * rate - Speed in Mbps needs to be assigned to a given vport.
+ */
+int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate)
+{
+	int i, rc = -EINVAL;
+
+	/* Currently not supported; Might change in future */
+	if (cdev->num_hwfns > 1) {
+		DP_NOTICE(cdev,
+			  "WFQ configuration is not supported for this device\n");
+		return rc;
+	}
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+		struct qed_ptt *p_ptt;
+
+		p_ptt = qed_ptt_acquire(p_hwfn);
+		if (!p_ptt)
+			return -EBUSY;
+
+		rc = __qed_configure_vport_wfq(p_hwfn, p_ptt, vp_id, rate);
+
+		if (!rc) {
+			qed_ptt_release(p_hwfn, p_ptt);
+			return rc;
+		}
+
+		qed_ptt_release(p_hwfn, p_ptt);
+	}
+
+	return rc;
+}
+
 /* API to configure WFQ from mcp link change */
 void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
 {
@@ -2069,3 +2131,17 @@ int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw)
 
 	return rc;
 }
+
+void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_mcp_link_state *p_link;
+
+	p_link = &p_hwfn->mcp_info->link_output;
+
+	if (p_link->min_pf_rate)
+		qed_disable_wfq_for_all_vports(p_hwfn, p_ptt,
+					       p_link->min_pf_rate);
+
+	memset(p_hwfn->qm_info.wfq_data, 0,
+	       sizeof(*p_hwfn->qm_info.wfq_data) * p_hwfn->qm_info.num_vports);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index c1b79190ce4d..c9a3bb63cc87 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -2822,6 +2822,46 @@ u16 qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
 	return p_vf->bulletin.p_virt->pvid;
 }
 
+static int qed_iov_configure_tx_rate(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt, int vfid, int val)
+{
+	struct qed_vf_info *vf;
+	u8 abs_vp_id = 0;
+	int rc;
+
+	vf = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf)
+		return -EINVAL;
+
+	rc = qed_fw_vport(p_hwfn, vf->vport_id, &abs_vp_id);
+	if (rc)
+		return rc;
+
+	return qed_init_vport_rl(p_hwfn, p_ptt, abs_vp_id, (u32)val);
+}
+
+int qed_iov_configure_min_tx_rate(struct qed_dev *cdev, int vfid, u32 rate)
+{
+	struct qed_vf_info *vf;
+	u8 vport_id;
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+			DP_NOTICE(p_hwfn,
+				  "SR-IOV sanity check failed, can't set min rate\n");
+			return -EINVAL;
+		}
+	}
+
+	vf = qed_iov_get_vf_info(QED_LEADING_HWFN(cdev), (u16)vfid, true);
+	vport_id = vf->vport_id;
+
+	return qed_configure_vport_wfq(cdev, vport_id, rate);
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -2871,6 +2911,9 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
 			return -EBUSY;
 		}
 
+		/* Clean WFQ db and configure equal weight for all vports */
+		qed_clean_wfq_db(hwfn, ptt);
+
 		qed_for_each_vf(hwfn, j) {
 			int k;
 
@@ -3047,17 +3090,136 @@ void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 
 	/* Update bulletin of all future possible VFs with link configuration */
 	for (i = 0; i < hwfn->cdev->p_iov_info->total_vfs; i++) {
+		struct qed_public_vf_info *vf_info;
+
+		vf_info = qed_iov_get_public_vf_info(hwfn, i, false);
+		if (!vf_info)
+			continue;
+
 		memcpy(&params, qed_mcp_get_link_params(hwfn), sizeof(params));
 		memcpy(&link, qed_mcp_get_link_state(hwfn), sizeof(link));
 		memcpy(&caps, qed_mcp_get_link_capabilities(hwfn),
 		       sizeof(caps));
 
+		/* Modify link according to the VF's configured link state */
+		switch (vf_info->link_state) {
+		case IFLA_VF_LINK_STATE_DISABLE:
+			link.link_up = false;
+			break;
+		case IFLA_VF_LINK_STATE_ENABLE:
+			link.link_up = true;
+			/* Set speed according to maximum supported by HW.
+			 * that is 40G for regular devices and 100G for CMT
+			 * mode devices.
+			 */
+			link.speed = (hwfn->cdev->num_hwfns > 1) ?
+				     100000 : 40000;
+		default:
+			/* In auto mode pass PF link image to VF */
+			break;
+		}
+
+		if (link.link_up && vf_info->tx_rate) {
+			struct qed_ptt *ptt;
+			int rate;
+
+			rate = min_t(int, vf_info->tx_rate, link.speed);
+
+			ptt = qed_ptt_acquire(hwfn);
+			if (!ptt) {
+				DP_NOTICE(hwfn, "Failed to acquire PTT\n");
+				return;
+			}
+
+			if (!qed_iov_configure_tx_rate(hwfn, ptt, i, rate)) {
+				vf_info->tx_rate = rate;
+				link.speed = rate;
+			}
+
+			qed_ptt_release(hwfn, ptt);
+		}
+
 		qed_iov_set_link(hwfn, i, &params, &link, &caps);
 	}
 
 	qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
 }
 
+static int qed_set_vf_link_state(struct qed_dev *cdev,
+				 int vf_id, int link_state)
+{
+	int i;
+
+	/* Sanitize request */
+	if (IS_VF(cdev))
+		return -EINVAL;
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "VF index [%d] isn't active\n", vf_id);
+		return -EINVAL;
+	}
+
+	/* Handle configuration of link state */
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf;
+
+		vf = qed_iov_get_public_vf_info(hwfn, vf_id, true);
+		if (!vf)
+			continue;
+
+		if (vf->link_state == link_state)
+			continue;
+
+		vf->link_state = link_state;
+		qed_inform_vf_link_state(&cdev->hwfns[i]);
+	}
+
+	return 0;
+}
+
+static int qed_configure_max_vf_rate(struct qed_dev *cdev, int vfid, int rate)
+{
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf;
+
+		if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+			DP_NOTICE(p_hwfn,
+				  "SR-IOV sanity check failed, can't set tx rate\n");
+			return -EINVAL;
+		}
+
+		vf = qed_iov_get_public_vf_info(p_hwfn, vfid, true);
+
+		vf->tx_rate = rate;
+
+		qed_inform_vf_link_state(p_hwfn);
+	}
+
+	return 0;
+}
+
+static int qed_set_vf_rate(struct qed_dev *cdev,
+			   int vfid, u32 min_rate, u32 max_rate)
+{
+	int rc_min = 0, rc_max = 0;
+
+	if (max_rate)
+		rc_max = qed_configure_max_vf_rate(cdev, vfid, max_rate);
+
+	if (min_rate)
+		rc_min = qed_iov_configure_min_tx_rate(cdev, vfid, min_rate);
+
+	if (rc_max | rc_min)
+		return -EINVAL;
+
+	return 0;
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
@@ -3254,4 +3416,6 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
+	.set_link_state = &qed_set_vf_link_state,
+	.set_rate = &qed_set_vf_rate,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index e38ea985abe1..ab3d291cf7f0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -46,6 +46,12 @@ struct qed_public_vf_info {
 	u8 forced_mac[ETH_ALEN];
 	u16 forced_vlan;
 	u8 mac[ETH_ALEN];
+
+	/* IFLA_VF_LINK_STATE_<X> */
+	int link_state;
+
+	/* Currently configured Tx rate in MB/sec. 0 if unconfigured */
+	int tx_rate;
 };
 
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b326b15d5196..3d0f98f81122 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1792,6 +1792,28 @@ static struct rtnl_link_stats64 *qede_get_stats64(
 	return stats;
 }
 
+#ifdef CONFIG_QED_SRIOV
+static int qede_set_vf_rate(struct net_device *dev, int vfidx,
+			    int min_tx_rate, int max_tx_rate)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	return edev->ops->iov->set_rate(edev->cdev, vfidx, max_tx_rate,
+					max_tx_rate);
+}
+
+static int qede_set_vf_link_state(struct net_device *dev, int vfidx,
+				  int link_state)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->set_link_state(edev->cdev, vfidx, link_state);
+}
+#endif
+
 static void qede_config_accept_any_vlan(struct qede_dev *edev, bool action)
 {
 	struct qed_update_vport_params params;
@@ -2118,6 +2140,10 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
 	.ndo_get_stats64 = qede_get_stats64,
+#ifdef CONFIG_QED_SRIOV
+	.ndo_set_vf_link_state = qede_set_vf_link_state,
+	.ndo_set_vf_rate = qede_set_vf_rate,
+#endif
 #ifdef CONFIG_QEDE_VXLAN
 	.ndo_add_vxlan_port = qede_add_vxlan_port,
 	.ndo_del_vxlan_port = qede_del_vxlan_port,
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 7a67fbf4336a..f364f2bd7a4d 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -18,6 +18,12 @@ struct qed_iov_hv_ops {
 	int (*set_mac) (struct qed_dev *cdev, u8 *mac, int vfid);
 
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
+
+	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
+			       int link_state);
+
+	int (*set_rate) (struct qed_dev *cdev, int vfid,
+			 u32 min_rate, u32 max_rate);
 };
 
 #endif
-- 
cgit v1.2.3


From 6ddc7608258d57d61e16d55461400bb6eff18d72 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:23 +0300
Subject: qed*: IOV support spoof-checking

Add support in `ndo_set_vf_spoofchk' for allowing PF control over
its VF spoof-checking configuration.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c     |  4 ++
 drivers/net/ethernet/qlogic/qed/qed_l2.h     |  2 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 91 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  2 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 11 ++++
 include/linux/qed/qed_iov_if.h               |  2 +
 6 files changed, 112 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 8d83250aa5ba..e0275a78b121 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -381,6 +381,10 @@ int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 	p_ramrod->common.tx_switching_en = p_params->tx_switching_flg;
 	p_cmn->update_tx_switching_en_flg = p_params->update_tx_switching_flg;
 
+	p_cmn->anti_spoofing_en = p_params->anti_spoofing_en;
+	val = p_params->update_anti_spoofing_en_flg;
+	p_ramrod->common.update_anti_spoofing_en_flg = val;
+
 	rc = qed_sp_vport_update_rss(p_hwfn, p_ramrod, p_rss_params);
 	if (rc) {
 		/* Return spq entry which is taken in qed_sp_init_request()*/
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index fad30ae12f63..a04fb7f061ea 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -149,6 +149,8 @@ struct qed_sp_vport_update_params {
 	u8				update_tx_switching_flg;
 	u8				tx_switching_flg;
 	u8				update_approx_mcast_flg;
+	u8				update_anti_spoofing_en_flg;
+	u8				anti_spoofing_en;
 	u8				update_accept_any_vlan_flg;
 	u8				accept_any_vlan;
 	unsigned long			bins[8];
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index c9a3bb63cc87..804102c257e6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1234,6 +1234,39 @@ out:
 			     sizeof(struct pfvf_acquire_resp_tlv), vfpf_status);
 }
 
+static int __qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn,
+				  struct qed_vf_info *p_vf, bool val)
+{
+	struct qed_sp_vport_update_params params;
+	int rc;
+
+	if (val == p_vf->spoof_chk) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk value[%d] is already configured\n", val);
+		return 0;
+	}
+
+	memset(&params, 0, sizeof(struct qed_sp_vport_update_params));
+	params.opaque_fid = p_vf->opaque_fid;
+	params.vport_id = p_vf->vport_id;
+	params.update_anti_spoofing_en_flg = 1;
+	params.anti_spoofing_en = val;
+
+	rc = qed_sp_vport_update(p_hwfn, &params, QED_SPQ_MODE_EBLOCK, NULL);
+	if (rc) {
+		p_vf->spoof_chk = val;
+		p_vf->req_spoofchk_val = p_vf->spoof_chk;
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk val[%d] configured\n", val);
+	} else {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Spoofchk configuration[val:%d] failed for VF[%d]\n",
+			   val, p_vf->relative_vf_id);
+	}
+
+	return rc;
+}
+
 static int qed_iov_reconfigure_unicast_vlan(struct qed_hwfn *p_hwfn,
 					    struct qed_vf_info *p_vf)
 {
@@ -1476,6 +1509,8 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 
 		/* Force configuration if needed on the newly opened vport */
 		qed_iov_configure_vport_forced(p_hwfn, vf, *p_bitmap);
+
+		__qed_iov_spoofchk_set(p_hwfn, vf, vf->req_spoofchk_val);
 	}
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_START,
 			     sizeof(struct pfvf_def_resp_tlv), status);
@@ -1489,6 +1524,7 @@ static void qed_iov_vf_mbx_stop_vport(struct qed_hwfn *p_hwfn,
 	int rc;
 
 	vf->vport_instance--;
+	vf->spoof_chk = false;
 
 	rc = qed_sp_vport_stop(p_hwfn, vf->opaque_fid, vf->vport_id);
 	if (rc != 0) {
@@ -2782,6 +2818,17 @@ void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
 }
 
+static bool qed_iov_vf_has_vport_instance(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *p_vf_info;
+
+	p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!p_vf_info)
+		return false;
+
+	return !!p_vf_info->vport_instance;
+}
+
 bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
@@ -2793,6 +2840,34 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+int qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, int vfid, bool val)
+{
+	struct qed_vf_info *vf;
+	int rc = -EINVAL;
+
+	if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) {
+		DP_NOTICE(p_hwfn,
+			  "SR-IOV sanity check failed, can't set spoofchk\n");
+		goto out;
+	}
+
+	vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf)
+		goto out;
+
+	if (!qed_iov_vf_has_vport_instance(p_hwfn, vfid)) {
+		/* After VF VPORT start PF will configure spoof check */
+		vf->req_spoofchk_val = val;
+		rc = 0;
+		goto out;
+	}
+
+	rc = __qed_iov_spoofchk_set(p_hwfn, vf, val);
+
+out:
+	return rc;
+}
+
 static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn,
 					   u16 rel_vf_id)
 {
@@ -3179,6 +3254,21 @@ static int qed_set_vf_link_state(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_spoof_configure(struct qed_dev *cdev, int vfid, bool val)
+{
+	int i, rc = -EINVAL;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		rc = qed_iov_spoofchk_set(p_hwfn, vfid, val);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
 static int qed_configure_max_vf_rate(struct qed_dev *cdev, int vfid, int rate)
 {
 	int i;
@@ -3417,5 +3507,6 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
 	.set_link_state = &qed_set_vf_link_state,
+	.set_spoof = &qed_spoof_configure,
 	.set_rate = &qed_set_vf_rate,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index ab3d291cf7f0..c8667c65e685 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -154,6 +154,8 @@ struct qed_vf_info {
 	u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF];
 	u8 num_active_rxqs;
 	struct qed_public_vf_info p_vf_info;
+	bool spoof_chk;
+	bool req_spoofchk_val;
 
 	/* Stores the configuration requested by VF */
 	struct qed_vf_shadow_config shadow_config;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3d0f98f81122..a908bd69d252 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1802,6 +1802,16 @@ static int qede_set_vf_rate(struct net_device *dev, int vfidx,
 					max_tx_rate);
 }
 
+static int qede_set_vf_spoofchk(struct net_device *dev, int vfidx, bool val)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->set_spoof(edev->cdev, vfidx, val);
+}
+
 static int qede_set_vf_link_state(struct net_device *dev, int vfidx,
 				  int link_state)
 {
@@ -2142,6 +2152,7 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_get_stats64 = qede_get_stats64,
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_link_state = qede_set_vf_link_state,
+	.ndo_set_vf_spoofchk = qede_set_vf_spoofchk,
 	.ndo_set_vf_rate = qede_set_vf_rate,
 #endif
 #ifdef CONFIG_QEDE_VXLAN
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index f364f2bd7a4d..2596d30d9e63 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -22,6 +22,8 @@ struct qed_iov_hv_ops {
 	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
 			       int link_state);
 
+	int (*set_spoof) (struct qed_dev *cdev, int vfid, bool val);
+
 	int (*set_rate) (struct qed_dev *cdev, int vfid,
 			 u32 min_rate, u32 max_rate);
 };
-- 
cgit v1.2.3


From 73390ac9d82bf9f0c849ff57b06a03145fbf05d6 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:24 +0300
Subject: qed*: support ndo_get_vf_config

Allows the user to view the VF configuration by observing the PF's
device.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 93 ++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c | 12 ++++
 include/linux/qed/qed_iov_if.h               |  3 +
 3 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 804102c257e6..6af8fd9fd560 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -2588,6 +2588,30 @@ void qed_iov_set_link(struct qed_hwfn *p_hwfn,
 	p_bulletin->capability_speed = p_caps->speed_capabilities;
 }
 
+static void qed_iov_get_link(struct qed_hwfn *p_hwfn,
+			     u16 vfid,
+			     struct qed_mcp_link_params *p_params,
+			     struct qed_mcp_link_state *p_link,
+			     struct qed_mcp_link_capabilities *p_caps)
+{
+	struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn,
+						       vfid,
+						       false);
+	struct qed_bulletin_content *p_bulletin;
+
+	if (!p_vf)
+		return;
+
+	p_bulletin = p_vf->bulletin.p_virt;
+
+	if (p_params)
+		__qed_vf_get_link_params(p_hwfn, p_params, p_bulletin);
+	if (p_link)
+		__qed_vf_get_link_state(p_hwfn, p_link, p_bulletin);
+	if (p_caps)
+		__qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin);
+}
+
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt, int vfid)
 {
@@ -2840,6 +2864,17 @@ bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid)
 	return p_vf_info->state == VF_STOPPED;
 }
 
+static bool qed_iov_spoofchk_get(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return false;
+
+	return vf_info->spoof_chk;
+}
+
 int qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, int vfid, bool val)
 {
 	struct qed_vf_info *vf;
@@ -2937,6 +2972,23 @@ int qed_iov_configure_min_tx_rate(struct qed_dev *cdev, int vfid, u32 rate)
 	return qed_configure_vport_wfq(cdev, vport_id, rate);
 }
 
+static int qed_iov_get_vf_min_rate(struct qed_hwfn *p_hwfn, int vfid)
+{
+	struct qed_wfq_data *vf_vp_wfq;
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true);
+	if (!vf_info)
+		return 0;
+
+	vf_vp_wfq = &p_hwfn->qm_info.wfq_data[vf_info->vport_id];
+
+	if (vf_vp_wfq->configured)
+		return vf_vp_wfq->min_speed;
+	else
+		return 0;
+}
+
 /**
  * qed_schedule_iov - schedules IOV task for VF and PF
  * @hwfn: hardware function pointer
@@ -3153,6 +3205,46 @@ static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid)
 	return 0;
 }
 
+static int qed_get_vf_config(struct qed_dev *cdev,
+			     int vf_id, struct ifla_vf_info *ivi)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_public_vf_info *vf_info;
+	struct qed_mcp_link_state link;
+	u32 tx_rate;
+
+	/* Sanitize request */
+	if (IS_VF(cdev))
+		return -EINVAL;
+
+	if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true)) {
+		DP_VERBOSE(cdev, QED_MSG_IOV,
+			   "VF index [%d] isn't active\n", vf_id);
+		return -EINVAL;
+	}
+
+	vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true);
+
+	qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL);
+
+	/* Fill information about VF */
+	ivi->vf = vf_id;
+
+	if (is_valid_ether_addr(vf_info->forced_mac))
+		ether_addr_copy(ivi->mac, vf_info->forced_mac);
+	else
+		ether_addr_copy(ivi->mac, vf_info->mac);
+
+	ivi->vlan = vf_info->forced_vlan;
+	ivi->spoofchk = qed_iov_spoofchk_get(hwfn, vf_id);
+	ivi->linkstate = vf_info->link_state;
+	tx_rate = vf_info->tx_rate;
+	ivi->max_tx_rate = tx_rate ? tx_rate : link.speed;
+	ivi->min_tx_rate = qed_iov_get_vf_min_rate(hwfn, vf_id);
+
+	return 0;
+}
+
 void qed_inform_vf_link_state(struct qed_hwfn *hwfn)
 {
 	struct qed_mcp_link_capabilities caps;
@@ -3506,6 +3598,7 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.configure = &qed_sriov_configure,
 	.set_mac = &qed_sriov_pf_set_mac,
 	.set_vlan = &qed_sriov_pf_set_vlan,
+	.get_config = &qed_get_vf_config,
 	.set_link_state = &qed_set_vf_link_state,
 	.set_spoof = &qed_spoof_configure,
 	.set_rate = &qed_set_vf_rate,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index a908bd69d252..7130ee7f87da 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1793,6 +1793,17 @@ static struct rtnl_link_stats64 *qede_get_stats64(
 }
 
 #ifdef CONFIG_QED_SRIOV
+static int qede_get_vf_config(struct net_device *dev, int vfidx,
+			      struct ifla_vf_info *ivi)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->get_config(edev->cdev, vfidx, ivi);
+}
+
 static int qede_set_vf_rate(struct net_device *dev, int vfidx,
 			    int min_tx_rate, int max_tx_rate)
 {
@@ -2153,6 +2164,7 @@ static const struct net_device_ops qede_netdev_ops = {
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_link_state = qede_set_vf_link_state,
 	.ndo_set_vf_spoofchk = qede_set_vf_spoofchk,
+	.ndo_get_vf_config = qede_get_vf_config,
 	.ndo_set_vf_rate = qede_set_vf_rate,
 #endif
 #ifdef CONFIG_QEDE_VXLAN
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 2596d30d9e63..5a4f8d0899e9 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -19,6 +19,9 @@ struct qed_iov_hv_ops {
 
 	int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid);
 
+	int (*get_config) (struct qed_dev *cdev, int vf_id,
+			   struct ifla_vf_info *ivi);
+
 	int (*set_link_state) (struct qed_dev *cdev, int vf_id,
 			       int link_state);
 
-- 
cgit v1.2.3


From 831bfb0e88b54726d6e027a1d547066ffeb8b27e Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Wed, 11 May 2016 16:36:25 +0300
Subject: qed*: Tx-switching configuration

Device should be configured by default to VEB once VFs are active.
This changes the configuration of both PFs' and VFs' vports into enabling
tx-switching once sriov is enabled.

Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  4 ++++
 drivers/net/ethernet/qlogic/qed/qed_l2.h          |  1 +
 drivers/net/ethernet/qlogic/qed/qed_main.c        |  1 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  5 ++++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       |  1 +
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 12 ++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 24 ++++++++++++++++++++++-
 include/linux/qed/qed_eth_if.h                    |  2 ++
 include/linux/qed/qed_if.h                        |  1 +
 11 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index acaa2866dae3..6fb6016409c6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -688,7 +688,8 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		qed_int_igu_enable(p_hwfn, p_ptt, int_mode);
 
 		/* send function start command */
-		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode);
+		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode,
+				     allow_npar_tx_switch);
 		if (rc)
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e0275a78b121..8fba87dd48af 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -99,6 +99,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 		break;
 	}
 
+	p_ramrod->tx_switching_en = p_params->tx_switching;
+
 	/* Software Function ID in hwfn (PFs are 0 - 15, VFs are 16 - 135) */
 	p_ramrod->sw_fid = qed_concrete_to_sw_fid(p_hwfn->cdev,
 						  p_params->concrete_fid);
@@ -1792,6 +1794,8 @@ static int qed_update_vport(struct qed_dev *cdev,
 		params->update_vport_active_flg;
 	sp_params.vport_active_rx_flg = params->vport_active_flg;
 	sp_params.vport_active_tx_flg = params->vport_active_flg;
+	sp_params.update_tx_switching_flg = params->update_tx_switching_flg;
+	sp_params.tx_switching_flg = params->tx_switching_flg;
 	sp_params.accept_any_vlan = params->accept_any_vlan;
 	sp_params.update_accept_any_vlan_flg =
 		params->update_accept_any_vlan_flg;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index a04fb7f061ea..002114543451 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -94,6 +94,7 @@ enum qed_tpa_mode {
 struct qed_sp_vport_start_params {
 	enum qed_tpa_mode tpa_mode;
 	bool remove_inner_vlan;
+	bool tx_switching;
 	bool only_untagged;
 	bool drop_ttl0;
 	u8 max_buffers_per_cqe;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index dcb782c14e5c..6ffc21da1415 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -216,6 +216,7 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 		dev_info->fw_rev = FW_REVISION_VERSION;
 		dev_info->fw_eng = FW_ENGINEERING_VERSION;
 		dev_info->mf_mode = cdev->mf_mode;
+		dev_info->tx_switching = true;
 	} else {
 		qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major,
 				      &dev_info->fw_minor, &dev_info->fw_rev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index c2999cb5d1e2..ab5549f4e5ea 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -344,13 +344,14 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  * @param p_hwfn
  * @param p_tunn
  * @param mode
+ * @param allow_npar_tx_switch
  *
  * @return int
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
-		    enum qed_mf_mode mode);
+		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
 
 /**
  * @brief qed_sp_pf_stop - PF Function Stop Ramrod
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index ed90947c451d..8c555ed1f949 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -299,7 +299,7 @@ qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
-		    enum qed_mf_mode mode)
+		    enum qed_mf_mode mode, bool allow_npar_tx_switch)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
 	u16 sb = qed_int_get_sp_sb_id(p_hwfn);
@@ -358,6 +358,9 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 				     &p_ramrod->tunnel_config);
 	p_hwfn->hw_info.personality = PERSONALITY_ETH;
 
+	if (IS_MF_SI(p_hwfn))
+		p_ramrod->allow_npar_tx_switching = allow_npar_tx_switch;
+
 	if (p_hwfn->cdev->p_iov_info) {
 		struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 6af8fd9fd560..d4df406ac0a4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1490,6 +1490,7 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 
 	params.tpa_mode = start->tpa_mode;
 	params.remove_inner_vlan = start->inner_vlan_removal;
+	params.tx_switching = true;
 
 	params.only_untagged = vf_info->bulletin.p_virt->default_only_untagged;
 	params.drop_ttl0 = false;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index db14e230c9a4..72e69c0ec10d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -633,6 +633,18 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 		}
 	}
 
+	if (p_params->update_tx_switching_flg) {
+		struct vfpf_vport_update_tx_switch_tlv *p_tx_switch_tlv;
+
+		size = sizeof(struct vfpf_vport_update_tx_switch_tlv);
+		tlv = CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH;
+		p_tx_switch_tlv = qed_add_tlv(p_hwfn, &p_iov->offset,
+					      tlv, size);
+		resp_size += sizeof(struct pfvf_def_resp_tlv);
+
+		p_tx_switch_tlv->tx_switching = p_params->tx_switching_flg;
+	}
+
 	if (p_params->update_approx_mcast_flg) {
 		struct vfpf_vport_update_mcast_bin_tlv *p_mcast_tlv;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 7130ee7f87da..8114541f327c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -137,10 +137,26 @@ static int qede_set_vf_mac(struct net_device *ndev, int vfidx, u8 *mac)
 static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
+	struct qed_dev_info *qed_info = &edev->dev_info.common;
+	int rc;
 
 	DP_VERBOSE(edev, QED_MSG_IOV, "Requested %d VFs\n", num_vfs_param);
 
-	return edev->ops->iov->configure(edev->cdev, num_vfs_param);
+	rc = edev->ops->iov->configure(edev->cdev, num_vfs_param);
+
+	/* Enable/Disable Tx switching for PF */
+	if ((rc == num_vfs_param) && netif_running(edev->ndev) &&
+	    qed_info->mf_mode != QED_MF_NPAR && qed_info->tx_switching) {
+		struct qed_update_vport_params params;
+
+		memset(&params, 0, sizeof(params));
+		params.vport_id = 0;
+		params.update_tx_switching_flg = 1;
+		params.tx_switching_flg = num_vfs_param ? 1 : 0;
+		edev->ops->vport_update(edev->cdev, &params);
+	}
+
+	return rc;
 }
 #endif
 
@@ -3291,6 +3307,12 @@ static int qede_start_queues(struct qede_dev *edev)
 	vport_update_params.update_vport_active_flg = 1;
 	vport_update_params.vport_active_flg = 1;
 
+	if ((qed_info->mf_mode == QED_MF_NPAR || pci_num_vf(edev->pdev)) &&
+	    qed_info->tx_switching) {
+		vport_update_params.update_tx_switching_flg = 1;
+		vport_update_params.tx_switching_flg = 1;
+	}
+
 	/* Fill struct with RSS params */
 	if (QEDE_RSS_CNT(edev) > 1) {
 		vport_update_params.update_rss_flg = 1;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e0f6e6482031..6ae8cb4a61d3 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -35,6 +35,8 @@ struct qed_update_vport_params {
 	u8 vport_id;
 	u8 update_vport_active_flg;
 	u8 vport_active_flg;
+	u8 update_tx_switching_flg;
+	u8 tx_switching_flg;
 	u8 update_accept_any_vlan_flg;
 	u8 accept_any_vlan;
 	u8 update_rss_flg;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 76a6f168a190..0fd8f247e65f 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -93,6 +93,7 @@ struct qed_dev_info {
 
 	u32		flash_size;
 	u8		mf_mode;
+	bool		tx_switching;
 };
 
 enum qed_sb_type {
-- 
cgit v1.2.3


From c0bba3a99f0709c24c2c7ada7cb098966b1d791f Mon Sep 17 00:00:00 2001
From: Kedareswara rao Appana <appana.durga.rao@xilinx.com>
Date: Thu, 7 Apr 2016 10:59:43 +0530
Subject: dmaengine: vdma: Add Support for Xilinx AXI Direct Memory Access
 Engine

This patch adds support for the AXI Direct Memory Access (AXI DMA)
core in the existing vdma driver, AXI DMA Core is a
soft Xilinx IP core that provides high-bandwidth
direct memory access between memory and AXI4-Stream
type target peripherals.

Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/xilinx/xilinx_vdma.c | 474 +++++++++++++++++++++++++++++++++++----
 include/linux/dma/xilinx_dma.h   |  12 +
 2 files changed, 444 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c
index 57b85af4ed08..983e4bc88cd8 100644
--- a/drivers/dma/xilinx/xilinx_vdma.c
+++ b/drivers/dma/xilinx/xilinx_vdma.c
@@ -16,6 +16,11 @@
  * video device (S2MM). Initialization, status, interrupt and management
  * registers are accessed through an AXI4-Lite slave interface.
  *
+ * The AXI Direct Memory Access (AXI DMA) core is a soft Xilinx IP core that
+ * provides high-bandwidth one dimensional direct memory access between memory
+ * and AXI4-Stream target peripherals. It supports one receive and one
+ * transmit channel, both of them optional at synthesis time.
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 2 of the License, or
@@ -140,6 +145,19 @@
 /* Delay loop counter to prevent hardware failure */
 #define XILINX_DMA_LOOP_COUNT		1000000
 
+/* AXI DMA Specific Registers/Offsets */
+#define XILINX_DMA_REG_SRCDSTADDR	0x18
+#define XILINX_DMA_REG_BTT		0x28
+
+/* AXI DMA Specific Masks/Bit fields */
+#define XILINX_DMA_MAX_TRANS_LEN	GENMASK(22, 0)
+#define XILINX_DMA_CR_COALESCE_MAX	GENMASK(23, 16)
+#define XILINX_DMA_CR_COALESCE_SHIFT	16
+#define XILINX_DMA_BD_SOP		BIT(27)
+#define XILINX_DMA_BD_EOP		BIT(26)
+#define XILINX_DMA_COALESCE_MAX		255
+#define XILINX_DMA_NUM_APP_WORDS	5
+
 /**
  * struct xilinx_vdma_desc_hw - Hardware Descriptor
  * @next_desc: Next Descriptor Pointer @0x00
@@ -161,6 +179,30 @@ struct xilinx_vdma_desc_hw {
 	u32 stride;
 } __aligned(64);
 
+/**
+ * struct xilinx_axidma_desc_hw - Hardware Descriptor for AXI DMA
+ * @next_desc: Next Descriptor Pointer @0x00
+ * @pad1: Reserved @0x04
+ * @buf_addr: Buffer address @0x08
+ * @pad2: Reserved @0x0C
+ * @pad3: Reserved @0x10
+ * @pad4: Reserved @0x14
+ * @control: Control field @0x18
+ * @status: Status field @0x1C
+ * @app: APP Fields @0x20 - 0x30
+ */
+struct xilinx_axidma_desc_hw {
+	u32 next_desc;
+	u32 pad1;
+	u32 buf_addr;
+	u32 pad2;
+	u32 pad3;
+	u32 pad4;
+	u32 control;
+	u32 status;
+	u32 app[XILINX_DMA_NUM_APP_WORDS];
+} __aligned(64);
+
 /**
  * struct xilinx_vdma_tx_segment - Descriptor segment
  * @hw: Hardware descriptor
@@ -173,6 +215,18 @@ struct xilinx_vdma_tx_segment {
 	dma_addr_t phys;
 } __aligned(64);
 
+/**
+ * struct xilinx_axidma_tx_segment - Descriptor segment
+ * @hw: Hardware descriptor
+ * @node: Node in the descriptor segments list
+ * @phys: Physical address of segment
+ */
+struct xilinx_axidma_tx_segment {
+	struct xilinx_axidma_desc_hw hw;
+	struct list_head node;
+	dma_addr_t phys;
+} __aligned(64);
+
 /**
  * struct xilinx_dma_tx_descriptor - Per Transaction structure
  * @async_tx: Async transaction descriptor
@@ -210,6 +264,9 @@ struct xilinx_dma_tx_descriptor {
  * @desc_pendingcount: Descriptor pending count
  * @ext_addr: Indicates 64 bit addressing is supported by dma channel
  * @desc_submitcount: Descriptor h/w submitted count
+ * @residue: Residue for AXI DMA
+ * @seg_v: Statically allocated segments base
+ * @start_transfer: Differentiate b/w DMA IP's transfer
  */
 struct xilinx_dma_chan {
 	struct xilinx_dma_device *xdev;
@@ -235,6 +292,9 @@ struct xilinx_dma_chan {
 	u32 desc_pendingcount;
 	bool ext_addr;
 	u32 desc_submitcount;
+	u32 residue;
+	struct xilinx_axidma_tx_segment *seg_v;
+	void (*start_transfer)(struct xilinx_dma_chan *chan);
 };
 
 /**
@@ -246,6 +306,7 @@ struct xilinx_dma_chan {
  * @has_sg: Specifies whether Scatter-Gather is present or not
  * @flush_on_fsync: Flush on frame sync
  * @ext_addr: Indicates 64 bit addressing is supported by dma device
+ * @dmatype: DMA ip type
  */
 struct xilinx_dma_device {
 	void __iomem *regs;
@@ -255,6 +316,7 @@ struct xilinx_dma_device {
 	bool has_sg;
 	u32 flush_on_fsync;
 	bool ext_addr;
+	enum xdma_ip_type dmatype;
 };
 
 /* Macros */
@@ -352,6 +414,39 @@ xilinx_vdma_alloc_tx_segment(struct xilinx_dma_chan *chan)
 	return segment;
 }
 
+/**
+ * xilinx_axidma_alloc_tx_segment - Allocate transaction segment
+ * @chan: Driver specific DMA channel
+ *
+ * Return: The allocated segment on success and NULL on failure.
+ */
+static struct xilinx_axidma_tx_segment *
+xilinx_axidma_alloc_tx_segment(struct xilinx_dma_chan *chan)
+{
+	struct xilinx_axidma_tx_segment *segment;
+	dma_addr_t phys;
+
+	segment = dma_pool_alloc(chan->desc_pool, GFP_ATOMIC, &phys);
+	if (!segment)
+		return NULL;
+
+	memset(segment, 0, sizeof(*segment));
+	segment->phys = phys;
+
+	return segment;
+}
+
+/**
+ * xilinx_dma_free_tx_segment - Free transaction segment
+ * @chan: Driver specific DMA channel
+ * @segment: DMA transaction segment
+ */
+static void xilinx_dma_free_tx_segment(struct xilinx_dma_chan *chan,
+				struct xilinx_axidma_tx_segment *segment)
+{
+	dma_pool_free(chan->desc_pool, segment, segment->phys);
+}
+
 /**
  * xilinx_vdma_free_tx_segment - Free transaction segment
  * @chan: Driver specific DMA channel
@@ -393,13 +488,22 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan,
 			       struct xilinx_dma_tx_descriptor *desc)
 {
 	struct xilinx_vdma_tx_segment *segment, *next;
+	struct xilinx_axidma_tx_segment *axidma_segment, *axidma_next;
 
 	if (!desc)
 		return;
 
-	list_for_each_entry_safe(segment, next, &desc->segments, node) {
-		list_del(&segment->node);
-		xilinx_vdma_free_tx_segment(chan, segment);
+	if (chan->xdev->dmatype == XDMA_TYPE_VDMA) {
+		list_for_each_entry_safe(segment, next, &desc->segments, node) {
+			list_del(&segment->node);
+			xilinx_vdma_free_tx_segment(chan, segment);
+		}
+	} else {
+		list_for_each_entry_safe(axidma_segment, axidma_next,
+					 &desc->segments, node) {
+			list_del(&axidma_segment->node);
+			xilinx_dma_free_tx_segment(chan, axidma_segment);
+		}
 	}
 
 	kfree(desc);
@@ -451,6 +555,8 @@ static void xilinx_dma_free_chan_resources(struct dma_chan *dchan)
 	dev_dbg(chan->dev, "Free all channel resources.\n");
 
 	xilinx_dma_free_descriptors(chan);
+	if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA)
+		xilinx_dma_free_tx_segment(chan, chan->seg_v);
 	dma_pool_destroy(chan->desc_pool);
 	chan->desc_pool = NULL;
 }
@@ -519,10 +625,20 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan)
 	 * We need the descriptor to be aligned to 64bytes
 	 * for meeting Xilinx VDMA specification requirement.
 	 */
-	chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool",
-				chan->dev,
-				sizeof(struct xilinx_vdma_tx_segment),
-				__alignof__(struct xilinx_vdma_tx_segment), 0);
+	if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) {
+		chan->desc_pool = dma_pool_create("xilinx_dma_desc_pool",
+				   chan->dev,
+				   sizeof(struct xilinx_axidma_tx_segment),
+				   __alignof__(struct xilinx_axidma_tx_segment),
+				   0);
+	} else {
+		chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool",
+				     chan->dev,
+				     sizeof(struct xilinx_vdma_tx_segment),
+				     __alignof__(struct xilinx_vdma_tx_segment),
+				     0);
+	}
+
 	if (!chan->desc_pool) {
 		dev_err(chan->dev,
 			"unable to allocate channel %d descriptor pool\n",
@@ -530,7 +646,27 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan)
 		return -ENOMEM;
 	}
 
+	if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA)
+		/*
+		 * For AXI DMA case after submitting a pending_list, keep
+		 * an extra segment allocated so that the "next descriptor"
+		 * pointer on the tail descriptor always points to a
+		 * valid descriptor, even when paused after reaching taildesc.
+		 * This way, it is possible to issue additional
+		 * transfers without halting and restarting the channel.
+		 */
+		chan->seg_v = xilinx_axidma_alloc_tx_segment(chan);
+
 	dma_cookie_init(dchan);
+
+	if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) {
+		/* For AXI DMA resetting once channel will reset the
+		 * other channel as well so enable the interrupts here.
+		 */
+		dma_ctrl_set(chan, XILINX_DMA_REG_DMACR,
+			      XILINX_DMA_DMAXR_ALL_IRQ_MASK);
+	}
+
 	return 0;
 }
 
@@ -546,7 +682,37 @@ static enum dma_status xilinx_dma_tx_status(struct dma_chan *dchan,
 					dma_cookie_t cookie,
 					struct dma_tx_state *txstate)
 {
-	return dma_cookie_status(dchan, cookie, txstate);
+	struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
+	struct xilinx_dma_tx_descriptor *desc;
+	struct xilinx_axidma_tx_segment *segment;
+	struct xilinx_axidma_desc_hw *hw;
+	enum dma_status ret;
+	unsigned long flags;
+	u32 residue = 0;
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE || !txstate)
+		return ret;
+
+	if (chan->xdev->dmatype == XDMA_TYPE_AXIDMA) {
+		spin_lock_irqsave(&chan->lock, flags);
+
+		desc = list_last_entry(&chan->active_list,
+				       struct xilinx_dma_tx_descriptor, node);
+		if (chan->has_sg) {
+			list_for_each_entry(segment, &desc->segments, node) {
+				hw = &segment->hw;
+				residue += (hw->control - hw->status) &
+					   XILINX_DMA_MAX_TRANS_LEN;
+			}
+		}
+		spin_unlock_irqrestore(&chan->lock, flags);
+
+		chan->residue = residue;
+		dma_set_residue(txstate, chan->residue);
+	}
+
+	return ret;
 }
 
 /**
@@ -753,6 +919,91 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan)
 	}
 }
 
+/**
+ * xilinx_dma_start_transfer - Starts DMA transfer
+ * @chan: Driver specific channel struct pointer
+ */
+static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan)
+{
+	struct xilinx_dma_tx_descriptor *head_desc, *tail_desc;
+	struct xilinx_axidma_tx_segment *tail_segment, *old_head, *new_head;
+	u32 reg;
+
+	if (chan->err)
+		return;
+
+	if (list_empty(&chan->pending_list))
+		return;
+
+	/* If it is SG mode and hardware is busy, cannot submit */
+	if (chan->has_sg && xilinx_dma_is_running(chan) &&
+	    !xilinx_dma_is_idle(chan)) {
+		dev_dbg(chan->dev, "DMA controller still busy\n");
+		return;
+	}
+
+	head_desc = list_first_entry(&chan->pending_list,
+				     struct xilinx_dma_tx_descriptor, node);
+	tail_desc = list_last_entry(&chan->pending_list,
+				    struct xilinx_dma_tx_descriptor, node);
+	tail_segment = list_last_entry(&tail_desc->segments,
+				       struct xilinx_axidma_tx_segment, node);
+
+	old_head = list_first_entry(&head_desc->segments,
+				struct xilinx_axidma_tx_segment, node);
+	new_head = chan->seg_v;
+	/* Copy Buffer Descriptor fields. */
+	new_head->hw = old_head->hw;
+
+	/* Swap and save new reserve */
+	list_replace_init(&old_head->node, &new_head->node);
+	chan->seg_v = old_head;
+
+	tail_segment->hw.next_desc = chan->seg_v->phys;
+	head_desc->async_tx.phys = new_head->phys;
+
+	reg = dma_ctrl_read(chan, XILINX_DMA_REG_DMACR);
+
+	if (chan->desc_pendingcount <= XILINX_DMA_COALESCE_MAX) {
+		reg &= ~XILINX_DMA_CR_COALESCE_MAX;
+		reg |= chan->desc_pendingcount <<
+				  XILINX_DMA_CR_COALESCE_SHIFT;
+		dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg);
+	}
+
+	if (chan->has_sg)
+		dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC,
+			       head_desc->async_tx.phys);
+
+	xilinx_dma_start(chan);
+
+	if (chan->err)
+		return;
+
+	/* Start the transfer */
+	if (chan->has_sg) {
+		dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC,
+			       tail_segment->phys);
+	} else {
+		struct xilinx_axidma_tx_segment *segment;
+		struct xilinx_axidma_desc_hw *hw;
+
+		segment = list_first_entry(&head_desc->segments,
+					   struct xilinx_axidma_tx_segment,
+					   node);
+		hw = &segment->hw;
+
+		dma_ctrl_write(chan, XILINX_DMA_REG_SRCDSTADDR, hw->buf_addr);
+
+		/* Start the transfer */
+		dma_ctrl_write(chan, XILINX_DMA_REG_BTT,
+			       hw->control & XILINX_DMA_MAX_TRANS_LEN);
+	}
+
+	list_splice_tail_init(&chan->pending_list, &chan->active_list);
+	chan->desc_pendingcount = 0;
+}
+
 /**
  * xilinx_dma_issue_pending - Issue pending transactions
  * @dchan: DMA channel
@@ -763,7 +1014,7 @@ static void xilinx_dma_issue_pending(struct dma_chan *dchan)
 	unsigned long flags;
 
 	spin_lock_irqsave(&chan->lock, flags);
-	xilinx_vdma_start_transfer(chan);
+	chan->start_transfer(chan);
 	spin_unlock_irqrestore(&chan->lock, flags);
 }
 
@@ -895,7 +1146,7 @@ static irqreturn_t xilinx_dma_irq_handler(int irq, void *data)
 	if (status & XILINX_DMA_DMASR_FRM_CNT_IRQ) {
 		spin_lock(&chan->lock);
 		xilinx_dma_complete_descriptor(chan);
-		xilinx_vdma_start_transfer(chan);
+		chan->start_transfer(chan);
 		spin_unlock(&chan->lock);
 	}
 
@@ -913,6 +1164,7 @@ static void append_desc_queue(struct xilinx_dma_chan *chan,
 {
 	struct xilinx_vdma_tx_segment *tail_segment;
 	struct xilinx_dma_tx_descriptor *tail_desc;
+	struct xilinx_axidma_tx_segment *axidma_tail_segment;
 
 	if (list_empty(&chan->pending_list))
 		goto append;
@@ -923,9 +1175,17 @@ static void append_desc_queue(struct xilinx_dma_chan *chan,
 	 */
 	tail_desc = list_last_entry(&chan->pending_list,
 				    struct xilinx_dma_tx_descriptor, node);
-	tail_segment = list_last_entry(&tail_desc->segments,
-				       struct xilinx_vdma_tx_segment, node);
-	tail_segment->hw.next_desc = (u32)desc->async_tx.phys;
+	if (chan->xdev->dmatype == XDMA_TYPE_VDMA) {
+		tail_segment = list_last_entry(&tail_desc->segments,
+					       struct xilinx_vdma_tx_segment,
+					       node);
+		tail_segment->hw.next_desc = (u32)desc->async_tx.phys;
+	} else {
+		axidma_tail_segment = list_last_entry(&tail_desc->segments,
+					       struct xilinx_axidma_tx_segment,
+					       node);
+		axidma_tail_segment->hw.next_desc = (u32)desc->async_tx.phys;
+	}
 
 	/*
 	 * Add the software descriptor and all children to the list
@@ -935,7 +1195,7 @@ append:
 	list_add_tail(&desc->node, &chan->pending_list);
 	chan->desc_pendingcount++;
 
-	if (chan->has_sg &&
+	if (chan->has_sg && (chan->xdev->dmatype == XDMA_TYPE_VDMA) &&
 	    unlikely(chan->desc_pendingcount > chan->num_frms)) {
 		dev_dbg(chan->dev, "desc pendingcount is too high\n");
 		chan->desc_pendingcount = chan->num_frms;
@@ -1062,6 +1322,109 @@ error:
 	return NULL;
 }
 
+/**
+ * xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
+ * @dchan: DMA channel
+ * @sgl: scatterlist to transfer to/from
+ * @sg_len: number of entries in @scatterlist
+ * @direction: DMA direction
+ * @flags: transfer ack flags
+ * @context: APP words of the descriptor
+ *
+ * Return: Async transaction descriptor on success and NULL on failure
+ */
+static struct dma_async_tx_descriptor *xilinx_dma_prep_slave_sg(
+	struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len,
+	enum dma_transfer_direction direction, unsigned long flags,
+	void *context)
+{
+	struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
+	struct xilinx_dma_tx_descriptor *desc;
+	struct xilinx_axidma_tx_segment *segment = NULL, *prev = NULL;
+	u32 *app_w = (u32 *)context;
+	struct scatterlist *sg;
+	size_t copy;
+	size_t sg_used;
+	unsigned int i;
+
+	if (!is_slave_direction(direction))
+		return NULL;
+
+	/* Allocate a transaction descriptor. */
+	desc = xilinx_dma_alloc_tx_descriptor(chan);
+	if (!desc)
+		return NULL;
+
+	dma_async_tx_descriptor_init(&desc->async_tx, &chan->common);
+	desc->async_tx.tx_submit = xilinx_dma_tx_submit;
+
+	/* Build transactions using information in the scatter gather list */
+	for_each_sg(sgl, sg, sg_len, i) {
+		sg_used = 0;
+
+		/* Loop until the entire scatterlist entry is used */
+		while (sg_used < sg_dma_len(sg)) {
+			struct xilinx_axidma_desc_hw *hw;
+
+			/* Get a free segment */
+			segment = xilinx_axidma_alloc_tx_segment(chan);
+			if (!segment)
+				goto error;
+
+			/*
+			 * Calculate the maximum number of bytes to transfer,
+			 * making sure it is less than the hw limit
+			 */
+			copy = min_t(size_t, sg_dma_len(sg) - sg_used,
+				     XILINX_DMA_MAX_TRANS_LEN);
+			hw = &segment->hw;
+
+			/* Fill in the descriptor */
+			hw->buf_addr = sg_dma_address(sg) + sg_used;
+
+			hw->control = copy;
+
+			if (chan->direction == DMA_MEM_TO_DEV) {
+				if (app_w)
+					memcpy(hw->app, app_w, sizeof(u32) *
+					       XILINX_DMA_NUM_APP_WORDS);
+			}
+
+			if (prev)
+				prev->hw.next_desc = segment->phys;
+
+			prev = segment;
+			sg_used += copy;
+
+			/*
+			 * Insert the segment into the descriptor segments
+			 * list.
+			 */
+			list_add_tail(&segment->node, &desc->segments);
+		}
+	}
+
+	segment = list_first_entry(&desc->segments,
+				   struct xilinx_axidma_tx_segment, node);
+	desc->async_tx.phys = segment->phys;
+	prev->hw.next_desc = segment->phys;
+
+	/* For the last DMA_MEM_TO_DEV transfer, set EOP */
+	if (chan->direction == DMA_MEM_TO_DEV) {
+		segment->hw.control |= XILINX_DMA_BD_SOP;
+		segment = list_last_entry(&desc->segments,
+					  struct xilinx_axidma_tx_segment,
+					  node);
+		segment->hw.control |= XILINX_DMA_BD_EOP;
+	}
+
+	return &desc->async_tx;
+
+error:
+	xilinx_dma_free_tx_descriptor(chan, desc);
+	return NULL;
+}
+
 /**
  * xilinx_dma_terminate_all - Halt the channel and free descriptors
  * @chan: Driver specific DMA Channel pointer
@@ -1224,22 +1587,26 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev,
 		chan->id = 0;
 
 		chan->ctrl_offset = XILINX_DMA_MM2S_CTRL_OFFSET;
-		chan->desc_offset = XILINX_VDMA_MM2S_DESC_OFFSET;
+		if (xdev->dmatype == XDMA_TYPE_VDMA) {
+			chan->desc_offset = XILINX_VDMA_MM2S_DESC_OFFSET;
 
-		if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH ||
-		    xdev->flush_on_fsync == XILINX_DMA_FLUSH_MM2S)
-			chan->flush_on_fsync = true;
+			if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH ||
+			    xdev->flush_on_fsync == XILINX_DMA_FLUSH_MM2S)
+				chan->flush_on_fsync = true;
+		}
 	} else if (of_device_is_compatible(node,
 					    "xlnx,axi-vdma-s2mm-channel")) {
 		chan->direction = DMA_DEV_TO_MEM;
 		chan->id = 1;
 
 		chan->ctrl_offset = XILINX_DMA_S2MM_CTRL_OFFSET;
-		chan->desc_offset = XILINX_VDMA_S2MM_DESC_OFFSET;
+		if (xdev->dmatype == XDMA_TYPE_VDMA) {
+			chan->desc_offset = XILINX_VDMA_S2MM_DESC_OFFSET;
 
-		if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH ||
-		    xdev->flush_on_fsync == XILINX_DMA_FLUSH_S2MM)
-			chan->flush_on_fsync = true;
+			if (xdev->flush_on_fsync == XILINX_DMA_FLUSH_BOTH ||
+			    xdev->flush_on_fsync == XILINX_DMA_FLUSH_S2MM)
+				chan->flush_on_fsync = true;
+		}
 	} else {
 		dev_err(xdev->dev, "Invalid channel compatible node\n");
 		return -EINVAL;
@@ -1254,6 +1621,11 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev,
 		return err;
 	}
 
+	if (xdev->dmatype == XDMA_TYPE_AXIDMA)
+		chan->start_transfer = xilinx_dma_start_transfer;
+	else
+		chan->start_transfer = xilinx_vdma_start_transfer;
+
 	/* Initialize the tasklet */
 	tasklet_init(&chan->tasklet, xilinx_dma_do_tasklet,
 			(unsigned long)chan);
@@ -1296,6 +1668,15 @@ static struct dma_chan *of_dma_xilinx_xlate(struct of_phandle_args *dma_spec,
 	return dma_get_slave_channel(&xdev->chan[chan_id]->common);
 }
 
+static const struct of_device_id xilinx_dma_of_ids[] = {
+	{ .compatible = "xlnx,axi-dma-1.00.a",
+	  .data = (void *)XDMA_TYPE_AXIDMA },
+	{ .compatible = "xlnx,axi-vdma-1.00.a",
+	  .data = (void *)XDMA_TYPE_VDMA },
+	{}
+};
+MODULE_DEVICE_TABLE(of, xilinx_dma_of_ids);
+
 /**
  * xilinx_dma_probe - Driver probe function
  * @pdev: Pointer to the platform_device structure
@@ -1317,6 +1698,7 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	xdev->dev = &pdev->dev;
+	xdev->dmatype = (enum xdma_ip_type)of_device_get_match_data(&pdev->dev);
 
 	/* Request and map I/O memory */
 	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -1327,16 +1709,21 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 	/* Retrieve the DMA engine properties from the device tree */
 	xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg");
 
-	err = of_property_read_u32(node, "xlnx,num-fstores", &num_frames);
-	if (err < 0) {
-		dev_err(xdev->dev, "missing xlnx,num-fstores property\n");
-		return err;
-	}
+	if (xdev->dmatype == XDMA_TYPE_VDMA) {
+		err = of_property_read_u32(node, "xlnx,num-fstores",
+					   &num_frames);
+		if (err < 0) {
+			dev_err(xdev->dev,
+				"missing xlnx,num-fstores property\n");
+			return err;
+		}
 
-	err = of_property_read_u32(node, "xlnx,flush-fsync",
-					&xdev->flush_on_fsync);
-	if (err < 0)
-		dev_warn(xdev->dev, "missing xlnx,flush-fsync property\n");
+		err = of_property_read_u32(node, "xlnx,flush-fsync",
+					   &xdev->flush_on_fsync);
+		if (err < 0)
+			dev_warn(xdev->dev,
+				 "missing xlnx,flush-fsync property\n");
+	}
 
 	err = of_property_read_u32(node, "xlnx,addrwidth", &addr_width);
 	if (err < 0)
@@ -1361,11 +1748,18 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 				xilinx_dma_alloc_chan_resources;
 	xdev->common.device_free_chan_resources =
 				xilinx_dma_free_chan_resources;
-	xdev->common.device_prep_interleaved_dma =
-				xilinx_vdma_dma_prep_interleaved;
 	xdev->common.device_terminate_all = xilinx_dma_terminate_all;
 	xdev->common.device_tx_status = xilinx_dma_tx_status;
 	xdev->common.device_issue_pending = xilinx_dma_issue_pending;
+	if (xdev->dmatype == XDMA_TYPE_AXIDMA) {
+		xdev->common.device_prep_slave_sg = xilinx_dma_prep_slave_sg;
+		/* Residue calculation is supported by only AXI DMA */
+		xdev->common.residue_granularity =
+					  DMA_RESIDUE_GRANULARITY_SEGMENT;
+	} else {
+		xdev->common.device_prep_interleaved_dma =
+				xilinx_vdma_dma_prep_interleaved;
+	}
 
 	platform_set_drvdata(pdev, xdev);
 
@@ -1376,9 +1770,11 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 			goto error;
 	}
 
-	for (i = 0; i < XILINX_DMA_MAX_CHANS_PER_DEVICE; i++)
-		if (xdev->chan[i])
-			xdev->chan[i]->num_frms = num_frames;
+	if (xdev->dmatype == XDMA_TYPE_VDMA) {
+		for (i = 0; i < XILINX_DMA_MAX_CHANS_PER_DEVICE; i++)
+			if (xdev->chan[i])
+				xdev->chan[i]->num_frms = num_frames;
+	}
 
 	/* Register the DMA engine with the core */
 	dma_async_device_register(&xdev->common);
@@ -1425,12 +1821,6 @@ static int xilinx_dma_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct of_device_id xilinx_dma_of_ids[] = {
-	{ .compatible = "xlnx,axi-vdma-1.00.a",},
-	{}
-};
-MODULE_DEVICE_TABLE(of, xilinx_dma_of_ids);
-
 static struct platform_driver xilinx_vdma_driver = {
 	.driver = {
 		.name = "xilinx-vdma",
diff --git a/include/linux/dma/xilinx_dma.h b/include/linux/dma/xilinx_dma.h
index 34b98f276ed0..5db17ff8e254 100644
--- a/include/linux/dma/xilinx_dma.h
+++ b/include/linux/dma/xilinx_dma.h
@@ -41,6 +41,18 @@ struct xilinx_vdma_config {
 	int ext_fsync;
 };
 
+/**
+ * enum xdma_ip_type: DMA IP type.
+ *
+ * XDMA_TYPE_AXIDMA: Axi dma ip.
+ * XDMA_TYPE_VDMA: Axi vdma ip.
+ *
+ */
+enum xdma_ip_type {
+	XDMA_TYPE_AXIDMA = 0,
+	XDMA_TYPE_VDMA,
+};
+
 int xilinx_vdma_channel_set_config(struct dma_chan *dchan,
 					struct xilinx_vdma_config *cfg);
 
-- 
cgit v1.2.3


From 07b0e7d49cbcadebad9d3b986f3298e33286dea2 Mon Sep 17 00:00:00 2001
From: Kedareswara rao Appana <appana.durga.rao@xilinx.com>
Date: Thu, 7 Apr 2016 10:59:45 +0530
Subject: dmaengine: vdma: Add Support for Xilinx AXI Central Direct Memory
 Access Engine

This patch adds support for the AXI Central Direct Memory Access
(AXI CDMA) core to the existing vdma driver, AXI CDMA is a
soft Xilinx IP core that provides high-bandwidth
Direct Memory Access(DMA) between a memory-mapped
source address and a memory-mapped destination address.

Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/xilinx/xilinx_vdma.c | 236 ++++++++++++++++++++++++++++++++++++++-
 include/linux/dma/xilinx_dma.h   |   2 +
 2 files changed, 236 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c
index 983e4bc88cd8..fb481135f27a 100644
--- a/drivers/dma/xilinx/xilinx_vdma.c
+++ b/drivers/dma/xilinx/xilinx_vdma.c
@@ -21,6 +21,10 @@
  * and AXI4-Stream target peripherals. It supports one receive and one
  * transmit channel, both of them optional at synthesis time.
  *
+ * The AXI CDMA, is a soft IP, which provides high-bandwidth Direct Memory
+ * Access (DMA) between a memory-mapped source address and a memory-mapped
+ * destination address.
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 2 of the License, or
@@ -158,6 +162,13 @@
 #define XILINX_DMA_COALESCE_MAX		255
 #define XILINX_DMA_NUM_APP_WORDS	5
 
+/* AXI CDMA Specific Registers/Offsets */
+#define XILINX_CDMA_REG_SRCADDR		0x18
+#define XILINX_CDMA_REG_DSTADDR		0x20
+
+/* AXI CDMA Specific Masks */
+#define XILINX_CDMA_CR_SGMODE          BIT(3)
+
 /**
  * struct xilinx_vdma_desc_hw - Hardware Descriptor
  * @next_desc: Next Descriptor Pointer @0x00
@@ -203,6 +214,28 @@ struct xilinx_axidma_desc_hw {
 	u32 app[XILINX_DMA_NUM_APP_WORDS];
 } __aligned(64);
 
+/**
+ * struct xilinx_cdma_desc_hw - Hardware Descriptor
+ * @next_desc: Next Descriptor Pointer @0x00
+ * @pad1: Reserved @0x04
+ * @src_addr: Source address @0x08
+ * @pad2: Reserved @0x0C
+ * @dest_addr: Destination address @0x10
+ * @pad3: Reserved @0x14
+ * @control: Control field @0x18
+ * @status: Status field @0x1C
+ */
+struct xilinx_cdma_desc_hw {
+	u32 next_desc;
+	u32 pad1;
+	u32 src_addr;
+	u32 pad2;
+	u32 dest_addr;
+	u32 pad3;
+	u32 control;
+	u32 status;
+} __aligned(64);
+
 /**
  * struct xilinx_vdma_tx_segment - Descriptor segment
  * @hw: Hardware descriptor
@@ -227,6 +260,18 @@ struct xilinx_axidma_tx_segment {
 	dma_addr_t phys;
 } __aligned(64);
 
+/**
+ * struct xilinx_cdma_tx_segment - Descriptor segment
+ * @hw: Hardware descriptor
+ * @node: Node in the descriptor segments list
+ * @phys: Physical address of segment
+ */
+struct xilinx_cdma_tx_segment {
+	struct xilinx_cdma_desc_hw hw;
+	struct list_head node;
+	dma_addr_t phys;
+} __aligned(64);
+
 /**
  * struct xilinx_dma_tx_descriptor - Per Transaction structure
  * @async_tx: Async transaction descriptor
@@ -414,6 +459,28 @@ xilinx_vdma_alloc_tx_segment(struct xilinx_dma_chan *chan)
 	return segment;
 }
 
+/**
+ * xilinx_cdma_alloc_tx_segment - Allocate transaction segment
+ * @chan: Driver specific DMA channel
+ *
+ * Return: The allocated segment on success and NULL on failure.
+ */
+static struct xilinx_cdma_tx_segment *
+xilinx_cdma_alloc_tx_segment(struct xilinx_dma_chan *chan)
+{
+	struct xilinx_cdma_tx_segment *segment;
+	dma_addr_t phys;
+
+	segment = dma_pool_alloc(chan->desc_pool, GFP_ATOMIC, &phys);
+	if (!segment)
+		return NULL;
+
+	memset(segment, 0, sizeof(*segment));
+	segment->phys = phys;
+
+	return segment;
+}
+
 /**
  * xilinx_axidma_alloc_tx_segment - Allocate transaction segment
  * @chan: Driver specific DMA channel
@@ -447,6 +514,17 @@ static void xilinx_dma_free_tx_segment(struct xilinx_dma_chan *chan,
 	dma_pool_free(chan->desc_pool, segment, segment->phys);
 }
 
+/**
+ * xilinx_cdma_free_tx_segment - Free transaction segment
+ * @chan: Driver specific DMA channel
+ * @segment: DMA transaction segment
+ */
+static void xilinx_cdma_free_tx_segment(struct xilinx_dma_chan *chan,
+				struct xilinx_cdma_tx_segment *segment)
+{
+	dma_pool_free(chan->desc_pool, segment, segment->phys);
+}
+
 /**
  * xilinx_vdma_free_tx_segment - Free transaction segment
  * @chan: Driver specific DMA channel
@@ -488,6 +566,7 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan,
 			       struct xilinx_dma_tx_descriptor *desc)
 {
 	struct xilinx_vdma_tx_segment *segment, *next;
+	struct xilinx_cdma_tx_segment *cdma_segment, *cdma_next;
 	struct xilinx_axidma_tx_segment *axidma_segment, *axidma_next;
 
 	if (!desc)
@@ -498,6 +577,12 @@ xilinx_dma_free_tx_descriptor(struct xilinx_dma_chan *chan,
 			list_del(&segment->node);
 			xilinx_vdma_free_tx_segment(chan, segment);
 		}
+	} else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) {
+		list_for_each_entry_safe(cdma_segment, cdma_next,
+					 &desc->segments, node) {
+			list_del(&cdma_segment->node);
+			xilinx_cdma_free_tx_segment(chan, cdma_segment);
+		}
 	} else {
 		list_for_each_entry_safe(axidma_segment, axidma_next,
 					 &desc->segments, node) {
@@ -631,6 +716,12 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan)
 				   sizeof(struct xilinx_axidma_tx_segment),
 				   __alignof__(struct xilinx_axidma_tx_segment),
 				   0);
+	} else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) {
+		chan->desc_pool = dma_pool_create("xilinx_cdma_desc_pool",
+				   chan->dev,
+				   sizeof(struct xilinx_cdma_tx_segment),
+				   __alignof__(struct xilinx_cdma_tx_segment),
+				   0);
 	} else {
 		chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool",
 				     chan->dev,
@@ -667,6 +758,10 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan)
 			      XILINX_DMA_DMAXR_ALL_IRQ_MASK);
 	}
 
+	if ((chan->xdev->dmatype == XDMA_TYPE_CDMA) && chan->has_sg)
+		dma_ctrl_set(chan, XILINX_DMA_REG_DMACR,
+			     XILINX_CDMA_CR_SGMODE);
+
 	return 0;
 }
 
@@ -919,6 +1014,66 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan)
 	}
 }
 
+/**
+ * xilinx_cdma_start_transfer - Starts cdma transfer
+ * @chan: Driver specific channel struct pointer
+ */
+static void xilinx_cdma_start_transfer(struct xilinx_dma_chan *chan)
+{
+	struct xilinx_dma_tx_descriptor *head_desc, *tail_desc;
+	struct xilinx_cdma_tx_segment *tail_segment;
+	u32 ctrl_reg = dma_read(chan, XILINX_DMA_REG_DMACR);
+
+	if (chan->err)
+		return;
+
+	if (list_empty(&chan->pending_list))
+		return;
+
+	head_desc = list_first_entry(&chan->pending_list,
+				     struct xilinx_dma_tx_descriptor, node);
+	tail_desc = list_last_entry(&chan->pending_list,
+				    struct xilinx_dma_tx_descriptor, node);
+	tail_segment = list_last_entry(&tail_desc->segments,
+				       struct xilinx_cdma_tx_segment, node);
+
+	if (chan->desc_pendingcount <= XILINX_DMA_COALESCE_MAX) {
+		ctrl_reg &= ~XILINX_DMA_CR_COALESCE_MAX;
+		ctrl_reg |= chan->desc_pendingcount <<
+				XILINX_DMA_CR_COALESCE_SHIFT;
+		dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, ctrl_reg);
+	}
+
+	if (chan->has_sg) {
+		dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC,
+			   head_desc->async_tx.phys);
+
+		/* Update tail ptr register which will start the transfer */
+		dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC,
+			       tail_segment->phys);
+	} else {
+		/* In simple mode */
+		struct xilinx_cdma_tx_segment *segment;
+		struct xilinx_cdma_desc_hw *hw;
+
+		segment = list_first_entry(&head_desc->segments,
+					   struct xilinx_cdma_tx_segment,
+					   node);
+
+		hw = &segment->hw;
+
+		dma_ctrl_write(chan, XILINX_CDMA_REG_SRCADDR, hw->src_addr);
+		dma_ctrl_write(chan, XILINX_CDMA_REG_DSTADDR, hw->dest_addr);
+
+		/* Start the transfer */
+		dma_ctrl_write(chan, XILINX_DMA_REG_BTT,
+				hw->control & XILINX_DMA_MAX_TRANS_LEN);
+	}
+
+	list_splice_tail_init(&chan->pending_list, &chan->active_list);
+	chan->desc_pendingcount = 0;
+}
+
 /**
  * xilinx_dma_start_transfer - Starts DMA transfer
  * @chan: Driver specific channel struct pointer
@@ -1165,6 +1320,7 @@ static void append_desc_queue(struct xilinx_dma_chan *chan,
 	struct xilinx_vdma_tx_segment *tail_segment;
 	struct xilinx_dma_tx_descriptor *tail_desc;
 	struct xilinx_axidma_tx_segment *axidma_tail_segment;
+	struct xilinx_cdma_tx_segment *cdma_tail_segment;
 
 	if (list_empty(&chan->pending_list))
 		goto append;
@@ -1180,6 +1336,11 @@ static void append_desc_queue(struct xilinx_dma_chan *chan,
 					       struct xilinx_vdma_tx_segment,
 					       node);
 		tail_segment->hw.next_desc = (u32)desc->async_tx.phys;
+	} else if (chan->xdev->dmatype == XDMA_TYPE_CDMA) {
+		cdma_tail_segment = list_last_entry(&tail_desc->segments,
+						struct xilinx_cdma_tx_segment,
+						node);
+		cdma_tail_segment->hw.next_desc = (u32)desc->async_tx.phys;
 	} else {
 		axidma_tail_segment = list_last_entry(&tail_desc->segments,
 					       struct xilinx_axidma_tx_segment,
@@ -1322,6 +1483,68 @@ error:
 	return NULL;
 }
 
+/**
+ * xilinx_cdma_prep_memcpy - prepare descriptors for a memcpy transaction
+ * @dchan: DMA channel
+ * @dma_dst: destination address
+ * @dma_src: source address
+ * @len: transfer length
+ * @flags: transfer ack flags
+ *
+ * Return: Async transaction descriptor on success and NULL on failure
+ */
+static struct dma_async_tx_descriptor *
+xilinx_cdma_prep_memcpy(struct dma_chan *dchan, dma_addr_t dma_dst,
+			dma_addr_t dma_src, size_t len, unsigned long flags)
+{
+	struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
+	struct xilinx_dma_tx_descriptor *desc;
+	struct xilinx_cdma_tx_segment *segment, *prev;
+	struct xilinx_cdma_desc_hw *hw;
+
+	if (!len || len > XILINX_DMA_MAX_TRANS_LEN)
+		return NULL;
+
+	desc = xilinx_dma_alloc_tx_descriptor(chan);
+	if (!desc)
+		return NULL;
+
+	dma_async_tx_descriptor_init(&desc->async_tx, &chan->common);
+	desc->async_tx.tx_submit = xilinx_dma_tx_submit;
+
+	/* Allocate the link descriptor from DMA pool */
+	segment = xilinx_cdma_alloc_tx_segment(chan);
+	if (!segment)
+		goto error;
+
+	hw = &segment->hw;
+	hw->control = len;
+	hw->src_addr = dma_src;
+	hw->dest_addr = dma_dst;
+
+	/* Fill the previous next descriptor with current */
+	prev = list_last_entry(&desc->segments,
+			       struct xilinx_cdma_tx_segment, node);
+	prev->hw.next_desc = segment->phys;
+
+	/* Insert the segment into the descriptor segments list. */
+	list_add_tail(&segment->node, &desc->segments);
+
+	prev = segment;
+
+	/* Link the last hardware descriptor with the first. */
+	segment = list_first_entry(&desc->segments,
+				struct xilinx_cdma_tx_segment, node);
+	desc->async_tx.phys = segment->phys;
+	prev->hw.next_desc = segment->phys;
+
+	return &desc->async_tx;
+
+error:
+	xilinx_dma_free_tx_descriptor(chan, desc);
+	return NULL;
+}
+
 /**
  * xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
  * @dchan: DMA channel
@@ -1623,6 +1846,8 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev,
 
 	if (xdev->dmatype == XDMA_TYPE_AXIDMA)
 		chan->start_transfer = xilinx_dma_start_transfer;
+	else if (xdev->dmatype == XDMA_TYPE_CDMA)
+		chan->start_transfer = xilinx_cdma_start_transfer;
 	else
 		chan->start_transfer = xilinx_vdma_start_transfer;
 
@@ -1671,6 +1896,8 @@ static struct dma_chan *of_dma_xilinx_xlate(struct of_phandle_args *dma_spec,
 static const struct of_device_id xilinx_dma_of_ids[] = {
 	{ .compatible = "xlnx,axi-dma-1.00.a",
 	  .data = (void *)XDMA_TYPE_AXIDMA },
+	{ .compatible = "xlnx,axi-cdma-1.00.a",
+	  .data = (void *)XDMA_TYPE_CDMA },
 	{ .compatible = "xlnx,axi-vdma-1.00.a",
 	  .data = (void *)XDMA_TYPE_VDMA },
 	{}
@@ -1741,8 +1968,10 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 	xdev->common.dev = &pdev->dev;
 
 	INIT_LIST_HEAD(&xdev->common.channels);
-	dma_cap_set(DMA_SLAVE, xdev->common.cap_mask);
-	dma_cap_set(DMA_PRIVATE, xdev->common.cap_mask);
+	if (!(xdev->dmatype == XDMA_TYPE_CDMA)) {
+		dma_cap_set(DMA_SLAVE, xdev->common.cap_mask);
+		dma_cap_set(DMA_PRIVATE, xdev->common.cap_mask);
+	}
 
 	xdev->common.device_alloc_chan_resources =
 				xilinx_dma_alloc_chan_resources;
@@ -1756,6 +1985,9 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 		/* Residue calculation is supported by only AXI DMA */
 		xdev->common.residue_granularity =
 					  DMA_RESIDUE_GRANULARITY_SEGMENT;
+	} else if (xdev->dmatype == XDMA_TYPE_CDMA) {
+		dma_cap_set(DMA_MEMCPY, xdev->common.cap_mask);
+		xdev->common.device_prep_dma_memcpy = xilinx_cdma_prep_memcpy;
 	} else {
 		xdev->common.device_prep_interleaved_dma =
 				xilinx_vdma_dma_prep_interleaved;
diff --git a/include/linux/dma/xilinx_dma.h b/include/linux/dma/xilinx_dma.h
index 5db17ff8e254..3ae300052553 100644
--- a/include/linux/dma/xilinx_dma.h
+++ b/include/linux/dma/xilinx_dma.h
@@ -45,11 +45,13 @@ struct xilinx_vdma_config {
  * enum xdma_ip_type: DMA IP type.
  *
  * XDMA_TYPE_AXIDMA: Axi dma ip.
+ * XDMA_TYPE_CDMA: Axi cdma ip.
  * XDMA_TYPE_VDMA: Axi vdma ip.
  *
  */
 enum xdma_ip_type {
 	XDMA_TYPE_AXIDMA = 0,
+	XDMA_TYPE_CDMA,
 	XDMA_TYPE_VDMA,
 };
 
-- 
cgit v1.2.3


From c5076cfe768998e9d395bc8486b29b18b0f99fd9 Mon Sep 17 00:00:00 2001
From: Tomasz Nowicki <tn@semihalf.com>
Date: Wed, 11 May 2016 17:34:51 -0500
Subject: PCI, of: Move PCI I/O space management to PCI core code

No functional changes in this patch.

PCI I/O space mapping code does not depend on OF; therefore it can be moved
to PCI core code.  This way we will be able to use it, e.g., in ACPI PCI
code.

Suggested-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Tomasz Nowicki <tn@semihalf.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: Arnd Bergmann <arnd@arndb.de>
CC: Liviu Dudau <Liviu.Dudau@arm.com>
---
 drivers/of/address.c       | 116 +--------------------------------------------
 drivers/pci/pci.c          | 115 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of_address.h |   9 ----
 include/linux/pci.h        |   5 ++
 4 files changed, 121 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 91a469d55b8f..0a553c084a81 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -4,6 +4,7 @@
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
+#include <linux/pci.h>
 #include <linux/pci_regs.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
@@ -673,121 +674,6 @@ const __be32 *of_get_address(struct device_node *dev, int index, u64 *size,
 }
 EXPORT_SYMBOL(of_get_address);
 
-#ifdef PCI_IOBASE
-struct io_range {
-	struct list_head list;
-	phys_addr_t start;
-	resource_size_t size;
-};
-
-static LIST_HEAD(io_range_list);
-static DEFINE_SPINLOCK(io_range_lock);
-#endif
-
-/*
- * Record the PCI IO range (expressed as CPU physical address + size).
- * Return a negative value if an error has occured, zero otherwise
- */
-int __weak pci_register_io_range(phys_addr_t addr, resource_size_t size)
-{
-	int err = 0;
-
-#ifdef PCI_IOBASE
-	struct io_range *range;
-	resource_size_t allocated_size = 0;
-
-	/* check if the range hasn't been previously recorded */
-	spin_lock(&io_range_lock);
-	list_for_each_entry(range, &io_range_list, list) {
-		if (addr >= range->start && addr + size <= range->start + size) {
-			/* range already registered, bail out */
-			goto end_register;
-		}
-		allocated_size += range->size;
-	}
-
-	/* range not registed yet, check for available space */
-	if (allocated_size + size - 1 > IO_SPACE_LIMIT) {
-		/* if it's too big check if 64K space can be reserved */
-		if (allocated_size + SZ_64K - 1 > IO_SPACE_LIMIT) {
-			err = -E2BIG;
-			goto end_register;
-		}
-
-		size = SZ_64K;
-		pr_warn("Requested IO range too big, new size set to 64K\n");
-	}
-
-	/* add the range to the list */
-	range = kzalloc(sizeof(*range), GFP_ATOMIC);
-	if (!range) {
-		err = -ENOMEM;
-		goto end_register;
-	}
-
-	range->start = addr;
-	range->size = size;
-
-	list_add_tail(&range->list, &io_range_list);
-
-end_register:
-	spin_unlock(&io_range_lock);
-#endif
-
-	return err;
-}
-
-phys_addr_t pci_pio_to_address(unsigned long pio)
-{
-	phys_addr_t address = (phys_addr_t)OF_BAD_ADDR;
-
-#ifdef PCI_IOBASE
-	struct io_range *range;
-	resource_size_t allocated_size = 0;
-
-	if (pio > IO_SPACE_LIMIT)
-		return address;
-
-	spin_lock(&io_range_lock);
-	list_for_each_entry(range, &io_range_list, list) {
-		if (pio >= allocated_size && pio < allocated_size + range->size) {
-			address = range->start + pio - allocated_size;
-			break;
-		}
-		allocated_size += range->size;
-	}
-	spin_unlock(&io_range_lock);
-#endif
-
-	return address;
-}
-
-unsigned long __weak pci_address_to_pio(phys_addr_t address)
-{
-#ifdef PCI_IOBASE
-	struct io_range *res;
-	resource_size_t offset = 0;
-	unsigned long addr = -1;
-
-	spin_lock(&io_range_lock);
-	list_for_each_entry(res, &io_range_list, list) {
-		if (address >= res->start && address < res->start + res->size) {
-			addr = address - res->start + offset;
-			break;
-		}
-		offset += res->size;
-	}
-	spin_unlock(&io_range_lock);
-
-	return addr;
-#else
-	if (address > IO_SPACE_LIMIT)
-		return (unsigned long)-1;
-
-	return (unsigned long) address;
-#endif
-}
-
 static int __of_address_to_resource(struct device_node *dev,
 		const __be32 *addrp, u64 size, unsigned int flags,
 		const char *name, struct resource *r)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 25e0327d4429..bc0c914b8afc 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3021,6 +3021,121 @@ int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name)
 }
 EXPORT_SYMBOL(pci_request_regions_exclusive);
 
+#ifdef PCI_IOBASE
+struct io_range {
+	struct list_head list;
+	phys_addr_t start;
+	resource_size_t size;
+};
+
+static LIST_HEAD(io_range_list);
+static DEFINE_SPINLOCK(io_range_lock);
+#endif
+
+/*
+ * Record the PCI IO range (expressed as CPU physical address + size).
+ * Return a negative value if an error has occured, zero otherwise
+ */
+int __weak pci_register_io_range(phys_addr_t addr, resource_size_t size)
+{
+	int err = 0;
+
+#ifdef PCI_IOBASE
+	struct io_range *range;
+	resource_size_t allocated_size = 0;
+
+	/* check if the range hasn't been previously recorded */
+	spin_lock(&io_range_lock);
+	list_for_each_entry(range, &io_range_list, list) {
+		if (addr >= range->start && addr + size <= range->start + size) {
+			/* range already registered, bail out */
+			goto end_register;
+		}
+		allocated_size += range->size;
+	}
+
+	/* range not registed yet, check for available space */
+	if (allocated_size + size - 1 > IO_SPACE_LIMIT) {
+		/* if it's too big check if 64K space can be reserved */
+		if (allocated_size + SZ_64K - 1 > IO_SPACE_LIMIT) {
+			err = -E2BIG;
+			goto end_register;
+		}
+
+		size = SZ_64K;
+		pr_warn("Requested IO range too big, new size set to 64K\n");
+	}
+
+	/* add the range to the list */
+	range = kzalloc(sizeof(*range), GFP_ATOMIC);
+	if (!range) {
+		err = -ENOMEM;
+		goto end_register;
+	}
+
+	range->start = addr;
+	range->size = size;
+
+	list_add_tail(&range->list, &io_range_list);
+
+end_register:
+	spin_unlock(&io_range_lock);
+#endif
+
+	return err;
+}
+
+phys_addr_t pci_pio_to_address(unsigned long pio)
+{
+	phys_addr_t address = (phys_addr_t)OF_BAD_ADDR;
+
+#ifdef PCI_IOBASE
+	struct io_range *range;
+	resource_size_t allocated_size = 0;
+
+	if (pio > IO_SPACE_LIMIT)
+		return address;
+
+	spin_lock(&io_range_lock);
+	list_for_each_entry(range, &io_range_list, list) {
+		if (pio >= allocated_size && pio < allocated_size + range->size) {
+			address = range->start + pio - allocated_size;
+			break;
+		}
+		allocated_size += range->size;
+	}
+	spin_unlock(&io_range_lock);
+#endif
+
+	return address;
+}
+
+unsigned long __weak pci_address_to_pio(phys_addr_t address)
+{
+#ifdef PCI_IOBASE
+	struct io_range *res;
+	resource_size_t offset = 0;
+	unsigned long addr = -1;
+
+	spin_lock(&io_range_lock);
+	list_for_each_entry(res, &io_range_list, list) {
+		if (address >= res->start && address < res->start + res->size) {
+			addr = address - res->start + offset;
+			break;
+		}
+		offset += res->size;
+	}
+	spin_unlock(&io_range_lock);
+
+	return addr;
+#else
+	if (address > IO_SPACE_LIMIT)
+		return (unsigned long)-1;
+
+	return (unsigned long) address;
+#endif
+}
+
 /**
  *	pci_remap_iospace - Remap the memory mapped I/O space
  *	@res: Resource describing the I/O space
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 01c0a556448b..37864734ca50 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -47,10 +47,6 @@ void __iomem *of_io_request_and_map(struct device_node *device,
 extern const __be32 *of_get_address(struct device_node *dev, int index,
 			   u64 *size, unsigned int *flags);
 
-extern int pci_register_io_range(phys_addr_t addr, resource_size_t size);
-extern unsigned long pci_address_to_pio(phys_addr_t addr);
-extern phys_addr_t pci_pio_to_address(unsigned long pio);
-
 extern int of_pci_range_parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node);
 extern struct of_pci_range *of_pci_range_parser_one(
@@ -86,11 +82,6 @@ static inline const __be32 *of_get_address(struct device_node *dev, int index,
 	return NULL;
 }
 
-static inline phys_addr_t pci_pio_to_address(unsigned long pio)
-{
-	return 0;
-}
-
 static inline int of_pci_range_parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 004b8133417d..1824ef80e10d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1164,6 +1164,9 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
 			void *alignf_data);
 
 
+int pci_register_io_range(phys_addr_t addr, resource_size_t size);
+unsigned long pci_address_to_pio(phys_addr_t addr);
+phys_addr_t pci_pio_to_address(unsigned long pio);
 int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr);
 
 static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar)
@@ -1480,6 +1483,8 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name)
 { return -EIO; }
 static inline void pci_release_regions(struct pci_dev *dev) { }
 
+static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; }
+
 static inline void pci_block_cfg_access(struct pci_dev *dev) { }
 static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev)
 { return 0; }
-- 
cgit v1.2.3


From c16aea129bf788127465771cab97134a94af33e4 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 13 Apr 2016 19:11:03 +0300
Subject: net/mlx5: Fix mlx5 ifc cmd_hca_cap bad offsets

All reserved fields after early_vf_enable are off by 1, since
early_vf_enable was not explicitly declared as array of size 1.

Reserved field before cqe_zip had a wrong size, it should
be 0x80 + 0x3f.

Fixes: b0844444590e ("net/mlx5_core: Introduce access function to read internal timer ")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/mlx5/mlx5_ifc.h | 107 ++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c15b8a864937..c300e7491d80 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -750,21 +750,21 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         ets[0x1];
 	u8         nic_flow_table[0x1];
 	u8         eswitch_flow_table[0x1];
-	u8	   early_vf_enable;
-	u8         reserved_at_1a8[0x2];
+	u8	   early_vf_enable[0x1];
+	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         reserved_at_1af[0x6];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
-	u8         reserved_at_1bf[0x3];
+	u8         reserved_at_1c0[0x3];
 	u8         log_max_msg[0x5];
-	u8         reserved_at_1c7[0x4];
+	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1cf[0x6];
+	u8         reserved_at_1d0[0x6];
 	u8         rol_s[0x1];
 	u8         rol_g[0x1];
-	u8         reserved_at_1d7[0x1];
+	u8         reserved_at_1d8[0x1];
 	u8         wol_s[0x1];
 	u8         wol_g[0x1];
 	u8         wol_a[0x1];
@@ -774,47 +774,47 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         wol_p[0x1];
 
 	u8         stat_rate_support[0x10];
-	u8         reserved_at_1ef[0xc];
+	u8         reserved_at_1f0[0xc];
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
 	u8         reserved_at_200[0x3];
 	u8         ipoib_basic_offloads[0x1];
-	u8         reserved_at_204[0xa];
+	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
 	u8         cmdif_checksum[0x2];
 	u8         sigerr_cqe[0x1];
-	u8         reserved_at_212[0x1];
+	u8         reserved_at_213[0x1];
 	u8         wq_signature[0x1];
 	u8         sctr_data_cqe[0x1];
-	u8         reserved_at_215[0x1];
+	u8         reserved_at_216[0x1];
 	u8         sho[0x1];
 	u8         tph[0x1];
 	u8         rf[0x1];
 	u8         dct[0x1];
-	u8         reserved_at_21a[0x1];
+	u8         reserved_at_21b[0x1];
 	u8         eth_net_offloads[0x1];
 	u8         roce[0x1];
 	u8         atomic[0x1];
-	u8         reserved_at_21e[0x1];
+	u8         reserved_at_21f[0x1];
 
 	u8         cq_oi[0x1];
 	u8         cq_resize[0x1];
 	u8         cq_moderation[0x1];
-	u8         reserved_at_222[0x3];
+	u8         reserved_at_223[0x3];
 	u8         cq_eq_remap[0x1];
 	u8         pg[0x1];
 	u8         block_lb_mc[0x1];
-	u8         reserved_at_228[0x1];
+	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
 	u8         reserved_at_22a[0x1];
 	u8         cd[0x1];
-	u8         reserved_at_22c[0x1];
+	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
 	u8         reserved_at_22f[0x1];
 	u8	   imaicl[0x1];
-	u8         reserved_at_231[0x4];
+	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
 	u8         pkv[0x1];
 	u8         set_deth_sqpn[0x1];
@@ -824,98 +824,101 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uc[0x1];
 	u8         rc[0x1];
 
-	u8         reserved_at_23f[0xa];
+	u8         reserved_at_240[0xa];
 	u8         uar_sz[0x6];
-	u8         reserved_at_24f[0x8];
+	u8         reserved_at_250[0x8];
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_at_260[0x1];
+	u8         reserved_at_261[0x1];
 	u8         pad_tx_eth_packet[0x1];
-	u8         reserved_at_262[0x8];
+	u8         reserved_at_263[0x8];
 	u8         log_bf_reg_size[0x5];
-	u8         reserved_at_26f[0x10];
+	u8         reserved_at_270[0x10];
 
-	u8         reserved_at_27f[0x10];
+	u8         reserved_at_280[0x10];
 	u8         max_wqe_sz_sq[0x10];
 
-	u8         reserved_at_29f[0x10];
+	u8         reserved_at_2a0[0x10];
 	u8         max_wqe_sz_rq[0x10];
 
-	u8         reserved_at_2bf[0x10];
+	u8         reserved_at_2c0[0x10];
 	u8         max_wqe_sz_sq_dc[0x10];
 
-	u8         reserved_at_2df[0x7];
+	u8         reserved_at_2e0[0x7];
 	u8         max_qp_mcg[0x19];
 
-	u8         reserved_at_2ff[0x18];
+	u8         reserved_at_300[0x18];
 	u8         log_max_mcg[0x8];
 
-	u8         reserved_at_31f[0x3];
+	u8         reserved_at_320[0x3];
 	u8         log_max_transport_domain[0x5];
-	u8         reserved_at_327[0x3];
+	u8         reserved_at_328[0x3];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_32f[0xb];
+	u8         reserved_at_330[0xb];
 	u8         log_max_xrcd[0x5];
 
-	u8         reserved_at_33f[0x20];
+	u8         reserved_at_340[0x20];
 
-	u8         reserved_at_35f[0x3];
+	u8         reserved_at_360[0x3];
 	u8         log_max_rq[0x5];
-	u8         reserved_at_367[0x3];
+	u8         reserved_at_368[0x3];
 	u8         log_max_sq[0x5];
-	u8         reserved_at_36f[0x3];
+	u8         reserved_at_370[0x3];
 	u8         log_max_tir[0x5];
-	u8         reserved_at_377[0x3];
+	u8         reserved_at_378[0x3];
 	u8         log_max_tis[0x5];
 
 	u8         basic_cyclic_rcv_wqe[0x1];
-	u8         reserved_at_380[0x2];
+	u8         reserved_at_381[0x2];
 	u8         log_max_rmp[0x5];
-	u8         reserved_at_387[0x3];
+	u8         reserved_at_388[0x3];
 	u8         log_max_rqt[0x5];
-	u8         reserved_at_38f[0x3];
+	u8         reserved_at_390[0x3];
 	u8         log_max_rqt_size[0x5];
-	u8         reserved_at_397[0x3];
+	u8         reserved_at_398[0x3];
 	u8         log_max_tis_per_sq[0x5];
 
-	u8         reserved_at_39f[0x3];
+	u8         reserved_at_3a0[0x3];
 	u8         log_max_stride_sz_rq[0x5];
-	u8         reserved_at_3a7[0x3];
+	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
-	u8         reserved_at_3af[0x3];
+	u8         reserved_at_3b0[0x3];
 	u8         log_max_stride_sz_sq[0x5];
-	u8         reserved_at_3b7[0x3];
+	u8         reserved_at_3b8[0x3];
 	u8         log_min_stride_sz_sq[0x5];
 
-	u8         reserved_at_3bf[0x1b];
+	u8         reserved_at_3c0[0x1b];
 	u8         log_max_wq_sz[0x5];
 
 	u8         nic_vport_change_event[0x1];
-	u8         reserved_at_3e0[0xa];
+	u8         reserved_at_3e1[0xa];
 	u8         log_max_vlan_list[0x5];
-	u8         reserved_at_3ef[0x3];
+	u8         reserved_at_3f0[0x3];
 	u8         log_max_current_mc_list[0x5];
-	u8         reserved_at_3f7[0x3];
+	u8         reserved_at_3f8[0x3];
 	u8         log_max_current_uc_list[0x5];
 
-	u8         reserved_at_3ff[0x80];
+	u8         reserved_at_400[0x80];
 
-	u8         reserved_at_47f[0x3];
+	u8         reserved_at_480[0x3];
 	u8         log_max_l2_table[0x5];
-	u8         reserved_at_487[0x8];
+	u8         reserved_at_488[0x8];
 	u8         log_uar_page_sz[0x10];
 
-	u8         reserved_at_49f[0x20];
+	u8         reserved_at_4a0[0x20];
 	u8         device_frequency_mhz[0x20];
 	u8         device_frequency_khz[0x20];
-	u8         reserved_at_4ff[0x5f];
+
+	u8         reserved_at_500[0x80];
+
+	u8         reserved_at_580[0x3f];
 	u8         cqe_zip[0x1];
 
 	u8         cqe_zip_timeout[0x10];
 	u8         cqe_zip_max_num[0x10];
 
-	u8         reserved_at_57f[0x220];
+	u8         reserved_at_5e0[0x220];
 };
 
 enum mlx5_flow_destination_type {
-- 
cgit v1.2.3


From 80835cba4b857485b068e1ce83512e896e6f001e Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 13 Apr 2016 19:11:04 +0300
Subject: net/mlx5: Update mlx5_ifc hardware features

Adding the needed mlx5_ifc hardware bits and structs
for the following features:

* Add vport to steering commands for SRIOV ACL support
* Add mlcr, pcmr and mcia registers for dump module EEPROM
* Add support for FCS, beacon led and disable_link bits to
  hca caps
* Add CQE period mode bit in CQ context for CQE based CQ
  moderation support
* Add umr SQ bit for fragmented memory registration
* Add needed bits and caps for Striding RQ support

In-order to avoid possible future conflicts between rdma and
net-next we added all expected updates to this file for this release.
If more changes will be submitted, we plan to do it only through
one of the subsystems, probably net-next.

All updated bits in this patch will be later used in
the up-coming submissions to net-next and rdma trees.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/mlx5/mlx5_ifc.h | 146 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c300e7491d80..4ce4ea422a10 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -513,7 +513,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         max_lso_cap[0x5];
 	u8         reserved_at_10[0x4];
 	u8         rss_ind_tbl_cap[0x4];
-	u8         reserved_at_18[0x3];
+	u8         reg_umr_sq[0x1];
+	u8         scatter_fcs[0x1];
+	u8         reserved_at_1a[0x1];
 	u8         tunnel_lso_const_out_ip_id[0x1];
 	u8         reserved_at_1c[0x2];
 	u8         tunnel_statless_gre[0x1];
@@ -648,7 +650,7 @@ struct mlx5_ifc_vector_calc_cap_bits {
 enum {
 	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
 	MLX5_WQ_TYPE_CYCLIC       = 0x1,
-	MLX5_WQ_TYPE_STRQ         = 0x2,
+	MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2,
 };
 
 enum {
@@ -753,7 +755,11 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   early_vf_enable[0x1];
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_at_1af[0x6];
+	u8         reserved_at_1af[0x2];
+	u8         ports_check[0x1];
+	u8         reserved_at_1b2[0x1];
+	u8         disable_link_up[0x1];
+	u8         beacon_led[0x1];
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
@@ -778,7 +784,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
-	u8         reserved_at_200[0x3];
+	u8         striding_rq[0x1];
+	u8         reserved_at_201[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -807,12 +814,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         block_lb_mc[0x1];
 	u8         reserved_at_229[0x1];
 	u8         scqe_break_moderation[0x1];
-	u8         reserved_at_22a[0x1];
+	u8         cq_period_start_from_cqe[0x1];
 	u8         cd[0x1];
 	u8         reserved_at_22d[0x1];
 	u8         apm[0x1];
 	u8         vector_calc[0x1];
-	u8         reserved_at_22f[0x1];
+	u8         umr_ptr_rlky[0x1];
 	u8	   imaicl[0x1];
 	u8         reserved_at_232[0x4];
 	u8         qkv[0x1];
@@ -913,10 +920,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_500[0x80];
 
 	u8         reserved_at_580[0x3f];
-	u8         cqe_zip[0x1];
+	u8         cqe_compression[0x1];
 
-	u8         cqe_zip_timeout[0x10];
-	u8         cqe_zip_max_num[0x10];
+	u8         cqe_compression_timeout[0x10];
+	u8         cqe_compression_max_num[0x10];
 
 	u8         reserved_at_5e0[0x220];
 };
@@ -1000,7 +1007,13 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_118[0x3];
 	u8         log_wq_sz[0x5];
 
-	u8         reserved_at_120[0x4e0];
+	u8         reserved_at_120[0x15];
+	u8         log_wqe_num_of_strides[0x3];
+	u8         two_byte_shift_en[0x1];
+	u8         reserved_at_139[0x4];
+	u8         log_wqe_stride_size[0x3];
+
+	u8         reserved_at_140[0x4c0];
 
 	struct mlx5_ifc_cmd_pas_bits pas[0];
 };
@@ -2199,7 +2212,8 @@ struct mlx5_ifc_sqc_bits {
 	u8         flush_in_error_en[0x1];
 	u8         reserved_at_4[0x4];
 	u8         state[0x4];
-	u8         reserved_at_c[0x14];
+	u8         reg_umr[0x1];
+	u8         reserved_at_d[0x13];
 
 	u8         reserved_at_20[0x8];
 	u8         user_index[0x18];
@@ -2247,7 +2261,8 @@ enum {
 
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
-	u8         reserved_at_1[0x2];
+	u8         reserved_at_1[0x1];
+	u8         scatter_fcs[0x1];
 	u8         vsd[0x1];
 	u8         mem_rq_type[0x4];
 	u8         state[0x4];
@@ -2604,6 +2619,11 @@ enum {
 	MLX5_CQC_ST_FIRED                                 = 0xa,
 };
 
+enum {
+	MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+};
+
 struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
 	u8         reserved_at_4[0x4];
@@ -2612,8 +2632,8 @@ struct mlx5_ifc_cqc_bits {
 	u8         reserved_at_c[0x1];
 	u8         scqe_break_moderation_en[0x1];
 	u8         oi[0x1];
-	u8         reserved_at_f[0x2];
-	u8         cqe_zip_en[0x1];
+	u8         cq_period_mode[0x2];
+	u8         cqe_comp_en[0x1];
 	u8         mini_cqe_res_format[0x2];
 	u8         st[0x4];
 	u8         reserved_at_18[0x8];
@@ -2987,7 +3007,11 @@ struct mlx5_ifc_set_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5181,7 +5205,11 @@ struct mlx5_ifc_destroy_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5208,7 +5236,11 @@ struct mlx5_ifc_destroy_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5349,7 +5381,11 @@ struct mlx5_ifc_delete_fte_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5795,7 +5831,11 @@ struct mlx5_ifc_create_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -5839,7 +5879,11 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -6372,6 +6416,17 @@ struct mlx5_ifc_ptys_reg_bits {
 	u8         reserved_at_1a0[0x60];
 };
 
+struct mlx5_ifc_mlcr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x20];
+
+	u8         beacon_duration[0x10];
+	u8         reserved_at_40[0x10];
+
+	u8         beacon_remain[0x10];
+};
+
 struct mlx5_ifc_ptas_reg_bits {
 	u8         reserved_at_0[0x20];
 
@@ -6781,6 +6836,16 @@ struct mlx5_ifc_pamp_reg_bits {
 	u8         index_data[18][0x10];
 };
 
+struct mlx5_ifc_pcmr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x2e];
+	u8         fcs_cap[0x1];
+	u8         reserved_at_3f[0x1f];
+	u8         fcs_chk[0x1];
+	u8         reserved_at_5f[0x1];
+};
+
 struct mlx5_ifc_lane_2_module_mapping_bits {
 	u8         reserved_at_0[0x6];
 	u8         rx_lane[0x2];
@@ -7117,6 +7182,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pspa_reg_bits pspa_reg;
 	struct mlx5_ifc_ptas_reg_bits ptas_reg;
 	struct mlx5_ifc_ptys_reg_bits ptys_reg;
+	struct mlx5_ifc_mlcr_reg_bits mlcr_reg;
 	struct mlx5_ifc_pude_reg_bits pude_reg;
 	struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
 	struct mlx5_ifc_slrg_reg_bits slrg_reg;
@@ -7150,7 +7216,11 @@ struct mlx5_ifc_set_flow_table_root_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x40];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
 	u8         reserved_at_88[0x18];
@@ -7181,7 +7251,9 @@ struct mlx5_ifc_modify_flow_table_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x20];
+	u8         other_vport[0x1];
+	u8         reserved_at_41[0xf];
+	u8         vport_number[0x10];
 
 	u8         reserved_at_60[0x10];
 	u8         modify_field_select[0x10];
@@ -7247,4 +7319,34 @@ struct mlx5_ifc_qtct_reg_bits {
 	u8         tclass[0x3];
 };
 
+struct mlx5_ifc_mcia_reg_bits {
+	u8         l[0x1];
+	u8         reserved_at_1[0x7];
+	u8         module[0x8];
+	u8         reserved_at_10[0x8];
+	u8         status[0x8];
+
+	u8         i2c_device_address[0x8];
+	u8         page_number[0x8];
+	u8         device_address[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         size[0x10];
+
+	u8         reserved_at_60[0x20];
+
+	u8         dword_0[0x20];
+	u8         dword_1[0x20];
+	u8         dword_2[0x20];
+	u8         dword_3[0x20];
+	u8         dword_4[0x20];
+	u8         dword_5[0x20];
+	u8         dword_6[0x20];
+	u8         dword_7[0x20];
+	u8         dword_8[0x20];
+	u8         dword_9[0x20];
+	u8         dword_10[0x20];
+	u8         dword_11[0x20];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From b3d39032d7b33029373fbce6459aff6ac316e130 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Date: Mon, 28 Mar 2016 20:36:59 -0700
Subject: remoteproc: Add additional crash reasons

The Qualcomm WCNSS can crash by watchdog or a fatal software error. Add
these types to the list of remoteproc crash reasons.

Signed-off-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c | 2 ++
 include/linux/remoteproc.h           | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 31dfc9996389..db3958b3f094 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -57,6 +57,8 @@ static DEFINE_IDA(rproc_dev_index);
 
 static const char * const rproc_crash_names[] = {
 	[RPROC_MMUFAULT]	= "mmufault",
+	[RPROC_WATCHDOG]	= "watchdog",
+	[RPROC_FATAL_ERROR]	= "fatal error",
 };
 
 /* translate rproc_crash_type to string */
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 9c4e1384f636..1c457a8dd5a6 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -365,6 +365,8 @@ enum rproc_state {
 /**
  * enum rproc_crash_type - remote processor crash types
  * @RPROC_MMUFAULT:	iommu fault
+ * @RPROC_WATCHDOG:	watchdog bite
+ * @RPROC_FATAL_ERROR	fatal error
  *
  * Each element of the enum is used as an array index. So that, the value of
  * the elements should be always something sane.
@@ -373,6 +375,8 @@ enum rproc_state {
  */
 enum rproc_crash_type {
 	RPROC_MMUFAULT,
+	RPROC_WATCHDOG,
+	RPROC_FATAL_ERROR,
 };
 
 /**
-- 
cgit v1.2.3


From ae05327a00fd47c34dfe25294b359a3f3fef96e8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 12 May 2016 20:36:01 -0400
Subject: ext4: switch to ->iterate_shared()

Note that we need relax_dir() equivalent for directories
locked shared.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/dir.c      | 4 ++--
 include/linux/fs.h | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 561d7308b393..5d00bf060254 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -266,7 +266,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
 						sb->s_blocksize);
 		}
-		if ((ctx->pos < inode->i_size) && !dir_relax(inode))
+		if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
 			goto done;
 		brelse(bh);
 		bh = NULL;
@@ -644,7 +644,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.iterate	= ext4_readdir,
+	.iterate_shared	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3dc0258a2b64..e87245ac6941 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3134,6 +3134,13 @@ static inline bool dir_relax(struct inode *inode)
 	return !IS_DEADDIR(inode);
 }
 
+static inline bool dir_relax_shared(struct inode *inode)
+{
+	inode_unlock_shared(inode);
+	inode_lock_shared(inode);
+	return !IS_DEADDIR(inode);
+}
+
 extern bool path_noexec(const struct path *path);
 extern void inode_nohighmem(struct inode *inode);
 
-- 
cgit v1.2.3


From 2ab71a02c56f8244ac611b5c6e6603c6fe83b966 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Mon, 25 Jan 2016 09:50:29 +0100
Subject: MIPS: BCM47xx: Move SPROM driver to drivers/firmware/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Broadcom ARM home routers store SPROM content in NVRAM just like MIPS
ones. To share SPROM code we need to move it out of arch/mips/ to some
common place. We already have bcm47xx_nvram in firmware path and SPROM
should fit there as well.
This driver is responsible for parsing SoC configuration data into a
struct shared between ssb and bcma buses.
This was tested with BCM4706 & BCM5357C0 (BCM47XX) and BCM4708A0
(ARCH_BCM_5301X).

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/12210/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/Kconfig                         |   1 +
 arch/mips/bcm47xx/Makefile                |   2 +-
 arch/mips/bcm47xx/bcm47xx_private.h       |   3 -
 arch/mips/bcm47xx/setup.c                 |   2 +-
 arch/mips/bcm47xx/sprom.c                 | 724 -----------------------------
 drivers/firmware/broadcom/Kconfig         |  11 +
 drivers/firmware/broadcom/Makefile        |   1 +
 drivers/firmware/broadcom/bcm47xx_sprom.c | 737 ++++++++++++++++++++++++++++++
 include/linux/bcm47xx_sprom.h             |  24 +
 9 files changed, 776 insertions(+), 729 deletions(-)
 delete mode 100644 arch/mips/bcm47xx/sprom.c
 create mode 100644 drivers/firmware/broadcom/bcm47xx_sprom.c
 create mode 100644 include/linux/bcm47xx_sprom.h

(limited to 'include/linux')

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 2352917a47a1..882e73c3334e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -197,6 +197,7 @@ config BCM47XX
 	select GPIOLIB
 	select LEDS_GPIO_REGISTER
 	select BCM47XX_NVRAM
+	select BCM47XX_SPROM
 	help
 	 Support for BCM47XX based boards
 
diff --git a/arch/mips/bcm47xx/Makefile b/arch/mips/bcm47xx/Makefile
index 66bea4ecf449..6d8615074075 100644
--- a/arch/mips/bcm47xx/Makefile
+++ b/arch/mips/bcm47xx/Makefile
@@ -3,5 +3,5 @@
 # under Linux.
 #
 
-obj-y				+= irq.o prom.o serial.o setup.o time.o sprom.o
+obj-y				+= irq.o prom.o serial.o setup.o time.o
 obj-y				+= board.o buttons.o leds.o workarounds.o
diff --git a/arch/mips/bcm47xx/bcm47xx_private.h b/arch/mips/bcm47xx/bcm47xx_private.h
index 41796befa9df..0367ac7286fe 100644
--- a/arch/mips/bcm47xx/bcm47xx_private.h
+++ b/arch/mips/bcm47xx/bcm47xx_private.h
@@ -10,9 +10,6 @@
 /* prom.c */
 void __init bcm47xx_prom_highmem_init(void);
 
-/* sprom.c */
-void bcm47xx_sprom_register_fallbacks(void);
-
 /* buttons.c */
 int __init bcm47xx_buttons_register(void);
 
diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c
index c807e32d6d81..6054d49e608e 100644
--- a/arch/mips/bcm47xx/setup.c
+++ b/arch/mips/bcm47xx/setup.c
@@ -28,6 +28,7 @@
 
 #include "bcm47xx_private.h"
 
+#include <linux/bcm47xx_sprom.h>
 #include <linux/export.h>
 #include <linux/types.h>
 #include <linux/ethtool.h>
@@ -151,7 +152,6 @@ void __init plat_mem_setup(void)
 		pr_info("Using bcma bus\n");
 #ifdef CONFIG_BCM47XX_BCMA
 		bcm47xx_bus_type = BCM47XX_BUS_TYPE_BCMA;
-		bcm47xx_sprom_register_fallbacks();
 		bcm47xx_register_bcma();
 		bcm47xx_set_system_type(bcm47xx_bus.bcma.bus.chipinfo.id);
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/mips/bcm47xx/sprom.c b/arch/mips/bcm47xx/sprom.c
deleted file mode 100644
index ca7ad131d057..000000000000
--- a/arch/mips/bcm47xx/sprom.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- *  Copyright (C) 2004 Florian Schirmer <jolt@tuxbox.org>
- *  Copyright (C) 2006 Felix Fietkau <nbd@openwrt.org>
- *  Copyright (C) 2006 Michael Buesch <m@bues.ch>
- *  Copyright (C) 2010 Waldemar Brodkorb <wbx@openadk.org>
- *  Copyright (C) 2010-2012 Hauke Mehrtens <hauke@hauke-m.de>
- *
- *  This program is free software; you can redistribute  it and/or modify it
- *  under  the terms of  the GNU General  Public License as published by the
- *  Free Software Foundation;  either version 2 of the  License, or (at your
- *  option) any later version.
- *
- *  THIS  SOFTWARE  IS PROVIDED   ``AS  IS'' AND   ANY  EXPRESS OR IMPLIED
- *  WARRANTIES,   INCLUDING, BUT NOT  LIMITED  TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
- *  NO  EVENT  SHALL   THE AUTHOR  BE    LIABLE FOR ANY   DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- *  NOT LIMITED   TO, PROCUREMENT OF  SUBSTITUTE GOODS  OR SERVICES; LOSS OF
- *  USE, DATA,  OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- *  ANY THEORY OF LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the  GNU General Public License along
- *  with this program; if not, write  to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <bcm47xx.h>
-#include <linux/if_ether.h>
-#include <linux/etherdevice.h>
-
-static void create_key(const char *prefix, const char *postfix,
-		       const char *name, char *buf, int len)
-{
-	if (prefix && postfix)
-		snprintf(buf, len, "%s%s%s", prefix, name, postfix);
-	else if (prefix)
-		snprintf(buf, len, "%s%s", prefix, name);
-	else if (postfix)
-		snprintf(buf, len, "%s%s", name, postfix);
-	else
-		snprintf(buf, len, "%s", name);
-}
-
-static int get_nvram_var(const char *prefix, const char *postfix,
-			 const char *name, char *buf, int len, bool fallback)
-{
-	char key[40];
-	int err;
-
-	create_key(prefix, postfix, name, key, sizeof(key));
-
-	err = bcm47xx_nvram_getenv(key, buf, len);
-	if (fallback && err == -ENOENT && prefix) {
-		create_key(NULL, postfix, name, key, sizeof(key));
-		err = bcm47xx_nvram_getenv(key, buf, len);
-	}
-	return err;
-}
-
-#define NVRAM_READ_VAL(type)						\
-static void nvram_read_ ## type(const char *prefix,			\
-				const char *postfix, const char *name,	\
-				type *val, type allset, bool fallback)	\
-{									\
-	char buf[100];							\
-	int err;							\
-	type var;							\
-									\
-	err = get_nvram_var(prefix, postfix, name, buf, sizeof(buf),	\
-			    fallback);					\
-	if (err < 0)							\
-		return;							\
-	err = kstrto ## type(strim(buf), 0, &var);			\
-	if (err) {							\
-		pr_warn("can not parse nvram name %s%s%s with value %s got %i\n",	\
-			prefix, name, postfix, buf, err);		\
-		return;							\
-	}								\
-	if (allset && var == allset)					\
-		return;							\
-	*val = var;							\
-}
-
-NVRAM_READ_VAL(u8)
-NVRAM_READ_VAL(s8)
-NVRAM_READ_VAL(u16)
-NVRAM_READ_VAL(u32)
-
-#undef NVRAM_READ_VAL
-
-static void nvram_read_u32_2(const char *prefix, const char *name,
-			     u16 *val_lo, u16 *val_hi, bool fallback)
-{
-	char buf[100];
-	int err;
-	u32 val;
-
-	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
-	if (err < 0)
-		return;
-	err = kstrtou32(strim(buf), 0, &val);
-	if (err) {
-		pr_warn("can not parse nvram name %s%s with value %s got %i\n",
-			prefix, name, buf, err);
-		return;
-	}
-	*val_lo = (val & 0x0000FFFFU);
-	*val_hi = (val & 0xFFFF0000U) >> 16;
-}
-
-static void nvram_read_leddc(const char *prefix, const char *name,
-			     u8 *leddc_on_time, u8 *leddc_off_time,
-			     bool fallback)
-{
-	char buf[100];
-	int err;
-	u32 val;
-
-	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
-	if (err < 0)
-		return;
-	err = kstrtou32(strim(buf), 0, &val);
-	if (err) {
-		pr_warn("can not parse nvram name %s%s with value %s got %i\n",
-			prefix, name, buf, err);
-		return;
-	}
-
-	if (val == 0xffff || val == 0xffffffff)
-		return;
-
-	*leddc_on_time = val & 0xff;
-	*leddc_off_time = (val >> 16) & 0xff;
-}
-
-static void bcm47xx_nvram_parse_macaddr(char *buf, u8 macaddr[6])
-{
-	if (strchr(buf, ':'))
-		sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0],
-			&macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4],
-			&macaddr[5]);
-	else if (strchr(buf, '-'))
-		sscanf(buf, "%hhx-%hhx-%hhx-%hhx-%hhx-%hhx", &macaddr[0],
-			&macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4],
-			&macaddr[5]);
-	else
-		pr_warn("Can not parse mac address: %s\n", buf);
-}
-
-static void nvram_read_macaddr(const char *prefix, const char *name,
-			       u8 val[6], bool fallback)
-{
-	char buf[100];
-	int err;
-
-	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
-	if (err < 0)
-		return;
-
-	bcm47xx_nvram_parse_macaddr(buf, val);
-}
-
-static void nvram_read_alpha2(const char *prefix, const char *name,
-			     char val[2], bool fallback)
-{
-	char buf[10];
-	int err;
-
-	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
-	if (err < 0)
-		return;
-	if (buf[0] == '0')
-		return;
-	if (strlen(buf) > 2) {
-		pr_warn("alpha2 is too long %s\n", buf);
-		return;
-	}
-	memcpy(val, buf, 2);
-}
-
-/* This is one-function-only macro, it uses local "sprom" variable! */
-#define ENTRY(_revmask, _type, _prefix, _name, _val, _allset, _fallback) \
-	if (_revmask & BIT(sprom->revision)) \
-		nvram_read_ ## _type(_prefix, NULL, _name, &sprom->_val, \
-				     _allset, _fallback)
-/*
- * Special version of filling function that can be safely called for any SPROM
- * revision. For every NVRAM to SPROM mapping it contains bitmask of revisions
- * for which the mapping is valid.
- * It obviously requires some hexadecimal/bitmasks knowledge, but allows
- * writing cleaner code (easy revisions handling).
- * Note that while SPROM revision 0 was never used, we still keep BIT(0)
- * reserved for it, just to keep numbering sane.
- */
-static void bcm47xx_sprom_fill_auto(struct ssb_sprom *sprom,
-				    const char *prefix, bool fallback)
-{
-	const char *pre = prefix;
-	bool fb = fallback;
-
-	/* Broadcom extracts it for rev 8+ but it was found on 2 and 4 too */
-	ENTRY(0xfffffffe, u16, pre, "devid", dev_id, 0, fallback);
-
-	ENTRY(0xfffffffe, u16, pre, "boardrev", board_rev, 0, true);
-	ENTRY(0xfffffffe, u32, pre, "boardflags", boardflags, 0, fb);
-	ENTRY(0xfffffff0, u32, pre, "boardflags2", boardflags2, 0, fb);
-	ENTRY(0xfffff800, u32, pre, "boardflags3", boardflags3, 0, fb);
-	ENTRY(0x00000002, u16, pre, "boardflags", boardflags_lo, 0, fb);
-	ENTRY(0xfffffffc, u16, pre, "boardtype", board_type, 0, true);
-	ENTRY(0xfffffffe, u16, pre, "boardnum", board_num, 0, fb);
-	ENTRY(0x00000002, u8, pre, "cc", country_code, 0, fb);
-	ENTRY(0xfffffff8, u8, pre, "regrev", regrev, 0, fb);
-
-	ENTRY(0xfffffffe, u8, pre, "ledbh0", gpio0, 0xff, fb);
-	ENTRY(0xfffffffe, u8, pre, "ledbh1", gpio1, 0xff, fb);
-	ENTRY(0xfffffffe, u8, pre, "ledbh2", gpio2, 0xff, fb);
-	ENTRY(0xfffffffe, u8, pre, "ledbh3", gpio3, 0xff, fb);
-
-	ENTRY(0x0000070e, u16, pre, "pa0b0", pa0b0, 0, fb);
-	ENTRY(0x0000070e, u16, pre, "pa0b1", pa0b1, 0, fb);
-	ENTRY(0x0000070e, u16, pre, "pa0b2", pa0b2, 0, fb);
-	ENTRY(0x0000070e, u8, pre, "pa0itssit", itssi_bg, 0, fb);
-	ENTRY(0x0000070e, u8, pre, "pa0maxpwr", maxpwr_bg, 0, fb);
-
-	ENTRY(0x0000070c, u8, pre, "opo", opo, 0, fb);
-	ENTRY(0xfffffffe, u8, pre, "aa2g", ant_available_bg, 0, fb);
-	ENTRY(0xfffffffe, u8, pre, "aa5g", ant_available_a, 0, fb);
-	ENTRY(0x000007fe, s8, pre, "ag0", antenna_gain.a0, 0, fb);
-	ENTRY(0x000007fe, s8, pre, "ag1", antenna_gain.a1, 0, fb);
-	ENTRY(0x000007f0, s8, pre, "ag2", antenna_gain.a2, 0, fb);
-	ENTRY(0x000007f0, s8, pre, "ag3", antenna_gain.a3, 0, fb);
-
-	ENTRY(0x0000070e, u16, pre, "pa1b0", pa1b0, 0, fb);
-	ENTRY(0x0000070e, u16, pre, "pa1b1", pa1b1, 0, fb);
-	ENTRY(0x0000070e, u16, pre, "pa1b2", pa1b2, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1lob0", pa1lob0, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1lob1", pa1lob1, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1lob2", pa1lob2, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1hib0", pa1hib0, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1hib1", pa1hib1, 0, fb);
-	ENTRY(0x0000070c, u16, pre, "pa1hib2", pa1hib2, 0, fb);
-	ENTRY(0x0000070e, u8, pre, "pa1itssit", itssi_a, 0, fb);
-	ENTRY(0x0000070e, u8, pre, "pa1maxpwr", maxpwr_a, 0, fb);
-	ENTRY(0x0000070c, u8, pre, "pa1lomaxpwr", maxpwr_al, 0, fb);
-	ENTRY(0x0000070c, u8, pre, "pa1himaxpwr", maxpwr_ah, 0, fb);
-
-	ENTRY(0x00000708, u8, pre, "bxa2g", bxa2g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssisav2g", rssisav2g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssismc2g", rssismc2g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssismf2g", rssismf2g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "bxa5g", bxa5g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssisav5g", rssisav5g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssismc5g", rssismc5g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "rssismf5g", rssismf5g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "tri2g", tri2g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "tri5g", tri5g, 0, fb);
-	ENTRY(0x00000708, u8, pre, "tri5gl", tri5gl, 0, fb);
-	ENTRY(0x00000708, u8, pre, "tri5gh", tri5gh, 0, fb);
-	ENTRY(0x00000708, s8, pre, "rxpo2g", rxpo2g, 0, fb);
-	ENTRY(0x00000708, s8, pre, "rxpo5g", rxpo5g, 0, fb);
-	ENTRY(0xfffffff0, u8, pre, "txchain", txchain, 0xf, fb);
-	ENTRY(0xfffffff0, u8, pre, "rxchain", rxchain, 0xf, fb);
-	ENTRY(0xfffffff0, u8, pre, "antswitch", antswitch, 0xff, fb);
-	ENTRY(0x00000700, u8, pre, "tssipos2g", fem.ghz2.tssipos, 0, fb);
-	ENTRY(0x00000700, u8, pre, "extpagain2g", fem.ghz2.extpa_gain, 0, fb);
-	ENTRY(0x00000700, u8, pre, "pdetrange2g", fem.ghz2.pdet_range, 0, fb);
-	ENTRY(0x00000700, u8, pre, "triso2g", fem.ghz2.tr_iso, 0, fb);
-	ENTRY(0x00000700, u8, pre, "antswctl2g", fem.ghz2.antswlut, 0, fb);
-	ENTRY(0x00000700, u8, pre, "tssipos5g", fem.ghz5.tssipos, 0, fb);
-	ENTRY(0x00000700, u8, pre, "extpagain5g", fem.ghz5.extpa_gain, 0, fb);
-	ENTRY(0x00000700, u8, pre, "pdetrange5g", fem.ghz5.pdet_range, 0, fb);
-	ENTRY(0x00000700, u8, pre, "triso5g", fem.ghz5.tr_iso, 0, fb);
-	ENTRY(0x00000700, u8, pre, "antswctl5g", fem.ghz5.antswlut, 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid2ga0", txpid2g[0], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid2ga1", txpid2g[1], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid2ga2", txpid2g[2], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid2ga3", txpid2g[3], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5ga0", txpid5g[0], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5ga1", txpid5g[1], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5ga2", txpid5g[2], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5ga3", txpid5g[3], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gla0", txpid5gl[0], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gla1", txpid5gl[1], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gla2", txpid5gl[2], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gla3", txpid5gl[3], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gha0", txpid5gh[0], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gha1", txpid5gh[1], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gha2", txpid5gh[2], 0, fb);
-	ENTRY(0x000000f0, u8, pre, "txpid5gha3", txpid5gh[3], 0, fb);
-
-	ENTRY(0xffffff00, u8, pre, "tempthresh", tempthresh, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "tempoffset", tempoffset, 0, fb);
-	ENTRY(0xffffff00, u16, pre, "rawtempsense", rawtempsense, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "measpower", measpower, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "tempsense_slope", tempsense_slope, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "tempcorrx", tempcorrx, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "tempsense_option", tempsense_option, 0, fb);
-	ENTRY(0x00000700, u8, pre, "freqoffset_corr", freqoffset_corr, 0, fb);
-	ENTRY(0x00000700, u8, pre, "iqcal_swp_dis", iqcal_swp_dis, 0, fb);
-	ENTRY(0x00000700, u8, pre, "hw_iqcal_en", hw_iqcal_en, 0, fb);
-	ENTRY(0x00000700, u8, pre, "elna2g", elna2g, 0, fb);
-	ENTRY(0x00000700, u8, pre, "elna5g", elna5g, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "phycal_tempdelta", phycal_tempdelta, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "temps_period", temps_period, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "temps_hysteresis", temps_hysteresis, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "measpower1", measpower1, 0, fb);
-	ENTRY(0xffffff00, u8, pre, "measpower2", measpower2, 0, fb);
-
-	ENTRY(0x000001f0, u16, pre, "cck2gpo", cck2gpo, 0, fb);
-	ENTRY(0x000001f0, u32, pre, "ofdm2gpo", ofdm2gpo, 0, fb);
-	ENTRY(0x000001f0, u32, pre, "ofdm5gpo", ofdm5gpo, 0, fb);
-	ENTRY(0x000001f0, u32, pre, "ofdm5glpo", ofdm5glpo, 0, fb);
-	ENTRY(0x000001f0, u32, pre, "ofdm5ghpo", ofdm5ghpo, 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo0", mcs2gpo[0], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo1", mcs2gpo[1], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo2", mcs2gpo[2], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo3", mcs2gpo[3], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo4", mcs2gpo[4], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo5", mcs2gpo[5], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo6", mcs2gpo[6], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs2gpo7", mcs2gpo[7], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo0", mcs5gpo[0], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo1", mcs5gpo[1], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo2", mcs5gpo[2], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo3", mcs5gpo[3], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo4", mcs5gpo[4], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo5", mcs5gpo[5], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo6", mcs5gpo[6], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5gpo7", mcs5gpo[7], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo0", mcs5glpo[0], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo1", mcs5glpo[1], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo2", mcs5glpo[2], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo3", mcs5glpo[3], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo4", mcs5glpo[4], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo5", mcs5glpo[5], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo6", mcs5glpo[6], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5glpo7", mcs5glpo[7], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo0", mcs5ghpo[0], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo1", mcs5ghpo[1], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo2", mcs5ghpo[2], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo3", mcs5ghpo[3], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo4", mcs5ghpo[4], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo5", mcs5ghpo[5], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo6", mcs5ghpo[6], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "mcs5ghpo7", mcs5ghpo[7], 0, fb);
-	ENTRY(0x000001f0, u16, pre, "cddpo", cddpo, 0, fb);
-	ENTRY(0x000001f0, u16, pre, "stbcpo", stbcpo, 0, fb);
-	ENTRY(0x000001f0, u16, pre, "bw40po", bw40po, 0, fb);
-	ENTRY(0x000001f0, u16, pre, "bwduppo", bwduppo, 0, fb);
-
-	ENTRY(0xfffffe00, u16, pre, "cckbw202gpo", cckbw202gpo, 0, fb);
-	ENTRY(0xfffffe00, u16, pre, "cckbw20ul2gpo", cckbw20ul2gpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw202gpo", legofdmbw202gpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw20ul2gpo", legofdmbw20ul2gpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw205glpo", legofdmbw205glpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5glpo", legofdmbw20ul5glpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw205gmpo", legofdmbw205gmpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5gmpo", legofdmbw20ul5gmpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw205ghpo", legofdmbw205ghpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5ghpo", legofdmbw20ul5ghpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw202gpo", mcsbw202gpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "mcsbw20ul2gpo", mcsbw20ul2gpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw402gpo", mcsbw402gpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw205glpo", mcsbw205glpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "mcsbw20ul5glpo", mcsbw20ul5glpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw405glpo", mcsbw405glpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw205gmpo", mcsbw205gmpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "mcsbw20ul5gmpo", mcsbw20ul5gmpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw405gmpo", mcsbw405gmpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw205ghpo", mcsbw205ghpo, 0, fb);
-	ENTRY(0x00000600, u32, pre, "mcsbw20ul5ghpo", mcsbw20ul5ghpo, 0, fb);
-	ENTRY(0xfffffe00, u32, pre, "mcsbw405ghpo", mcsbw405ghpo, 0, fb);
-	ENTRY(0x00000600, u16, pre, "mcs32po", mcs32po, 0, fb);
-	ENTRY(0x00000600, u16, pre, "legofdm40duppo", legofdm40duppo, 0, fb);
-	ENTRY(0x00000700, u8, pre, "pcieingress_war", pcieingress_war, 0, fb);
-
-	/* TODO: rev 11 support */
-	ENTRY(0x00000700, u8, pre, "rxgainerr2ga0", rxgainerr2ga[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr2ga1", rxgainerr2ga[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr2ga2", rxgainerr2ga[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gla0", rxgainerr5gla[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gla1", rxgainerr5gla[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gla2", rxgainerr5gla[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gma0", rxgainerr5gma[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gma1", rxgainerr5gma[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gma2", rxgainerr5gma[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gha0", rxgainerr5gha[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gha1", rxgainerr5gha[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gha2", rxgainerr5gha[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gua0", rxgainerr5gua[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gua1", rxgainerr5gua[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "rxgainerr5gua2", rxgainerr5gua[2], 0, fb);
-
-	ENTRY(0xfffffe00, u8, pre, "sar2g", sar2g, 0, fb);
-	ENTRY(0xfffffe00, u8, pre, "sar5g", sar5g, 0, fb);
-
-	/* TODO: rev 11 support */
-	ENTRY(0x00000700, u8, pre, "noiselvl2ga0", noiselvl2ga[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl2ga1", noiselvl2ga[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl2ga2", noiselvl2ga[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gla0", noiselvl5gla[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gla1", noiselvl5gla[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gla2", noiselvl5gla[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gma0", noiselvl5gma[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gma1", noiselvl5gma[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gma2", noiselvl5gma[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gha0", noiselvl5gha[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gha1", noiselvl5gha[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gha2", noiselvl5gha[2], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gua0", noiselvl5gua[0], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gua1", noiselvl5gua[1], 0, fb);
-	ENTRY(0x00000700, u8, pre, "noiselvl5gua2", noiselvl5gua[2], 0, fb);
-}
-#undef ENTRY /* It's specififc, uses local variable, don't use it (again). */
-
-static void bcm47xx_fill_sprom_path_r4589(struct ssb_sprom *sprom,
-					  const char *prefix, bool fallback)
-{
-	char postfix[2];
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) {
-		struct ssb_sprom_core_pwr_info *pwr_info;
-
-		pwr_info = &sprom->core_pwr_info[i];
-
-		snprintf(postfix, sizeof(postfix), "%i", i);
-		nvram_read_u8(prefix, postfix, "maxp2ga",
-			      &pwr_info->maxpwr_2g, 0, fallback);
-		nvram_read_u8(prefix, postfix, "itt2ga",
-			      &pwr_info->itssi_2g, 0, fallback);
-		nvram_read_u8(prefix, postfix, "itt5ga",
-			      &pwr_info->itssi_5g, 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa2gw0a",
-			       &pwr_info->pa_2g[0], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa2gw1a",
-			       &pwr_info->pa_2g[1], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa2gw2a",
-			       &pwr_info->pa_2g[2], 0, fallback);
-		nvram_read_u8(prefix, postfix, "maxp5ga",
-			      &pwr_info->maxpwr_5g, 0, fallback);
-		nvram_read_u8(prefix, postfix, "maxp5gha",
-			      &pwr_info->maxpwr_5gh, 0, fallback);
-		nvram_read_u8(prefix, postfix, "maxp5gla",
-			      &pwr_info->maxpwr_5gl, 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5gw0a",
-			       &pwr_info->pa_5g[0], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5gw1a",
-			       &pwr_info->pa_5g[1], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5gw2a",
-			       &pwr_info->pa_5g[2], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5glw0a",
-			       &pwr_info->pa_5gl[0], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5glw1a",
-			       &pwr_info->pa_5gl[1], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5glw2a",
-			       &pwr_info->pa_5gl[2], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5ghw0a",
-			       &pwr_info->pa_5gh[0], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5ghw1a",
-			       &pwr_info->pa_5gh[1], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5ghw2a",
-			       &pwr_info->pa_5gh[2], 0, fallback);
-	}
-}
-
-static void bcm47xx_fill_sprom_path_r45(struct ssb_sprom *sprom,
-					const char *prefix, bool fallback)
-{
-	char postfix[2];
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) {
-		struct ssb_sprom_core_pwr_info *pwr_info;
-
-		pwr_info = &sprom->core_pwr_info[i];
-
-		snprintf(postfix, sizeof(postfix), "%i", i);
-		nvram_read_u16(prefix, postfix, "pa2gw3a",
-			       &pwr_info->pa_2g[3], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5gw3a",
-			       &pwr_info->pa_5g[3], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5glw3a",
-			       &pwr_info->pa_5gl[3], 0, fallback);
-		nvram_read_u16(prefix, postfix, "pa5ghw3a",
-			       &pwr_info->pa_5gh[3], 0, fallback);
-	}
-}
-
-static bool bcm47xx_is_valid_mac(u8 *mac)
-{
-	return mac && !(mac[0] == 0x00 && mac[1] == 0x90 && mac[2] == 0x4c);
-}
-
-static int bcm47xx_increase_mac_addr(u8 *mac, u8 num)
-{
-	u8 *oui = mac + ETH_ALEN/2 - 1;
-	u8 *p = mac + ETH_ALEN - 1;
-
-	do {
-		(*p) += num;
-		if (*p > num)
-			break;
-		p--;
-		num = 1;
-	} while (p != oui);
-
-	if (p == oui) {
-		pr_err("unable to fetch mac address\n");
-		return -ENOENT;
-	}
-	return 0;
-}
-
-static int mac_addr_used = 2;
-
-static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom,
-					const char *prefix, bool fallback)
-{
-	bool fb = fallback;
-
-	nvram_read_macaddr(prefix, "et0macaddr", sprom->et0mac, fallback);
-	nvram_read_u8(prefix, NULL, "et0mdcport", &sprom->et0mdcport, 0,
-		      fallback);
-	nvram_read_u8(prefix, NULL, "et0phyaddr", &sprom->et0phyaddr, 0,
-		      fallback);
-
-	nvram_read_macaddr(prefix, "et1macaddr", sprom->et1mac, fallback);
-	nvram_read_u8(prefix, NULL, "et1mdcport", &sprom->et1mdcport, 0,
-		      fallback);
-	nvram_read_u8(prefix, NULL, "et1phyaddr", &sprom->et1phyaddr, 0,
-		      fallback);
-
-	nvram_read_macaddr(prefix, "et2macaddr", sprom->et2mac, fb);
-	nvram_read_u8(prefix, NULL, "et2mdcport", &sprom->et2mdcport, 0, fb);
-	nvram_read_u8(prefix, NULL, "et2phyaddr", &sprom->et2phyaddr, 0, fb);
-
-	nvram_read_macaddr(prefix, "macaddr", sprom->il0mac, fallback);
-	nvram_read_macaddr(prefix, "il0macaddr", sprom->il0mac, fallback);
-
-	/* The address prefix 00:90:4C is used by Broadcom in their initial
-	 * configuration. When a mac address with the prefix 00:90:4C is used
-	 * all devices from the same series are sharing the same mac address.
-	 * To prevent mac address collisions we replace them with a mac address
-	 * based on the base address.
-	 */
-	if (!bcm47xx_is_valid_mac(sprom->il0mac)) {
-		u8 mac[6];
-
-		nvram_read_macaddr(NULL, "et0macaddr", mac, false);
-		if (bcm47xx_is_valid_mac(mac)) {
-			int err = bcm47xx_increase_mac_addr(mac, mac_addr_used);
-
-			if (!err) {
-				ether_addr_copy(sprom->il0mac, mac);
-				mac_addr_used++;
-			}
-		}
-	}
-}
-
-static void bcm47xx_fill_board_data(struct ssb_sprom *sprom, const char *prefix,
-				    bool fallback)
-{
-	nvram_read_u32_2(prefix, "boardflags", &sprom->boardflags_lo,
-			 &sprom->boardflags_hi, fallback);
-	nvram_read_u32_2(prefix, "boardflags2", &sprom->boardflags2_lo,
-			 &sprom->boardflags2_hi, fallback);
-}
-
-void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix,
-			bool fallback)
-{
-	bcm47xx_fill_sprom_ethernet(sprom, prefix, fallback);
-	bcm47xx_fill_board_data(sprom, prefix, fallback);
-
-	nvram_read_u8(prefix, NULL, "sromrev", &sprom->revision, 0, fallback);
-
-	/* Entries requiring custom functions */
-	nvram_read_alpha2(prefix, "ccode", sprom->alpha2, fallback);
-	if (sprom->revision >= 3)
-		nvram_read_leddc(prefix, "leddc", &sprom->leddc_on_time,
-				 &sprom->leddc_off_time, fallback);
-
-	switch (sprom->revision) {
-	case 4:
-	case 5:
-		bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback);
-		bcm47xx_fill_sprom_path_r45(sprom, prefix, fallback);
-		break;
-	case 8:
-	case 9:
-		bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback);
-		break;
-	}
-
-	bcm47xx_sprom_fill_auto(sprom, prefix, fallback);
-}
-
-#if defined(CONFIG_BCM47XX_SSB)
-static int bcm47xx_get_sprom_ssb(struct ssb_bus *bus, struct ssb_sprom *out)
-{
-	char prefix[10];
-
-	switch (bus->bustype) {
-	case SSB_BUSTYPE_SSB:
-		bcm47xx_fill_sprom(out, NULL, false);
-		return 0;
-	case SSB_BUSTYPE_PCI:
-		memset(out, 0, sizeof(struct ssb_sprom));
-		snprintf(prefix, sizeof(prefix), "pci/%u/%u/",
-			 bus->host_pci->bus->number + 1,
-			 PCI_SLOT(bus->host_pci->devfn));
-		bcm47xx_fill_sprom(out, prefix, false);
-		return 0;
-	default:
-		pr_warn("Unable to fill SPROM for given bustype.\n");
-		return -EINVAL;
-	}
-}
-#endif
-
-#if defined(CONFIG_BCM47XX_BCMA)
-/*
- * Having many NVRAM entries for PCI devices led to repeating prefixes like
- * pci/1/1/ all the time and wasting flash space. So at some point Broadcom
- * decided to introduce prefixes like 0: 1: 2: etc.
- * If we find e.g. devpath0=pci/2/1 or devpath0=pci/2/1/ we should use 0:
- * instead of pci/2/1/.
- */
-static void bcm47xx_sprom_apply_prefix_alias(char *prefix, size_t prefix_size)
-{
-	size_t prefix_len = strlen(prefix);
-	size_t short_len = prefix_len - 1;
-	char nvram_var[10];
-	char buf[20];
-	int i;
-
-	/* Passed prefix has to end with a slash */
-	if (prefix_len <= 0 || prefix[prefix_len - 1] != '/')
-		return;
-
-	for (i = 0; i < 3; i++) {
-		if (snprintf(nvram_var, sizeof(nvram_var), "devpath%d", i) <= 0)
-			continue;
-		if (bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf)) < 0)
-			continue;
-		if (!strcmp(buf, prefix) ||
-		    (short_len && strlen(buf) == short_len && !strncmp(buf, prefix, short_len))) {
-			snprintf(prefix, prefix_size, "%d:", i);
-			return;
-		}
-	}
-}
-
-static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out)
-{
-	struct bcma_boardinfo *binfo = &bus->boardinfo;
-	struct bcma_device *core;
-	char buf[10];
-	char *prefix;
-	bool fallback = false;
-
-	switch (bus->hosttype) {
-	case BCMA_HOSTTYPE_PCI:
-		memset(out, 0, sizeof(struct ssb_sprom));
-		/* On BCM47XX all PCI buses share the same domain */
-		if (config_enabled(CONFIG_BCM47XX))
-			snprintf(buf, sizeof(buf), "pci/%u/%u/",
-				 bus->host_pci->bus->number + 1,
-				 PCI_SLOT(bus->host_pci->devfn));
-		else
-			snprintf(buf, sizeof(buf), "pci/%u/%u/",
-				 pci_domain_nr(bus->host_pci->bus) + 1,
-				 bus->host_pci->bus->number);
-		bcm47xx_sprom_apply_prefix_alias(buf, sizeof(buf));
-		prefix = buf;
-		break;
-	case BCMA_HOSTTYPE_SOC:
-		memset(out, 0, sizeof(struct ssb_sprom));
-		core = bcma_find_core(bus, BCMA_CORE_80211);
-		if (core) {
-			snprintf(buf, sizeof(buf), "sb/%u/",
-				 core->core_index);
-			prefix = buf;
-			fallback = true;
-		} else {
-			prefix = NULL;
-		}
-		break;
-	default:
-		pr_warn("Unable to fill SPROM for given bustype.\n");
-		return -EINVAL;
-	}
-
-	nvram_read_u16(prefix, NULL, "boardvendor", &binfo->vendor, 0, true);
-	if (!binfo->vendor)
-		binfo->vendor = SSB_BOARDVENDOR_BCM;
-	nvram_read_u16(prefix, NULL, "boardtype", &binfo->type, 0, true);
-
-	bcm47xx_fill_sprom(out, prefix, fallback);
-
-	return 0;
-}
-#endif
-
-/*
- * On bcm47xx we need to register SPROM fallback handler very early, so we can't
- * use anything like platform device / driver for this.
- */
-void bcm47xx_sprom_register_fallbacks(void)
-{
-#if defined(CONFIG_BCM47XX_SSB)
-	if (ssb_arch_register_fallback_sprom(&bcm47xx_get_sprom_ssb))
-		pr_warn("Failed to register ssb SPROM handler\n");
-#endif
-
-#if defined(CONFIG_BCM47XX_BCMA)
-	if (bcma_arch_register_fallback_sprom(&bcm47xx_get_sprom_bcma))
-		pr_warn("Failed to register bcma SPROM handler\n");
-#endif
-}
diff --git a/drivers/firmware/broadcom/Kconfig b/drivers/firmware/broadcom/Kconfig
index 6bed119930dd..3c7e5b741e37 100644
--- a/drivers/firmware/broadcom/Kconfig
+++ b/drivers/firmware/broadcom/Kconfig
@@ -9,3 +9,14 @@ config BCM47XX_NVRAM
 	  This driver provides an easy way to get value of requested parameter.
 	  It simply reads content of NVRAM and parses it. It doesn't control any
 	  hardware part itself.
+
+config BCM47XX_SPROM
+	bool "Broadcom SPROM driver"
+	depends on BCM47XX_NVRAM
+	help
+	  Broadcom devices store configuration data in SPROM. Accessing it is
+	  specific to the bus host type, e.g. PCI(e) devices have it mapped in
+	  a PCI BAR.
+	  In case of SoC devices SPROM content is stored on a flash used by
+	  bootloader firmware CFE. This driver provides method to ssb and bcma
+	  drivers to read SPROM on SoC.
diff --git a/drivers/firmware/broadcom/Makefile b/drivers/firmware/broadcom/Makefile
index d0e683583cd6..f93efc479b8b 100644
--- a/drivers/firmware/broadcom/Makefile
+++ b/drivers/firmware/broadcom/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_BCM47XX_NVRAM)		+= bcm47xx_nvram.o
+obj-$(CONFIG_BCM47XX_SPROM)		+= bcm47xx_sprom.o
diff --git a/drivers/firmware/broadcom/bcm47xx_sprom.c b/drivers/firmware/broadcom/bcm47xx_sprom.c
new file mode 100644
index 000000000000..b6eb875d4af3
--- /dev/null
+++ b/drivers/firmware/broadcom/bcm47xx_sprom.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (C) 2004 Florian Schirmer <jolt@tuxbox.org>
+ *  Copyright (C) 2006 Felix Fietkau <nbd@openwrt.org>
+ *  Copyright (C) 2006 Michael Buesch <m@bues.ch>
+ *  Copyright (C) 2010 Waldemar Brodkorb <wbx@openadk.org>
+ *  Copyright (C) 2010-2012 Hauke Mehrtens <hauke@hauke-m.de>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ *  THIS  SOFTWARE  IS PROVIDED   ``AS  IS'' AND   ANY  EXPRESS OR IMPLIED
+ *  WARRANTIES,   INCLUDING, BUT NOT  LIMITED  TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
+ *  NO  EVENT  SHALL   THE AUTHOR  BE    LIABLE FOR ANY   DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *  NOT LIMITED   TO, PROCUREMENT OF  SUBSTITUTE GOODS  OR SERVICES; LOSS OF
+ *  USE, DATA,  OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ *  ANY THEORY OF LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the  GNU General Public License along
+ *  with this program; if not, write  to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bcm47xx_nvram.h>
+#include <linux/bcma/bcma.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/ssb/ssb.h>
+
+static void create_key(const char *prefix, const char *postfix,
+		       const char *name, char *buf, int len)
+{
+	if (prefix && postfix)
+		snprintf(buf, len, "%s%s%s", prefix, name, postfix);
+	else if (prefix)
+		snprintf(buf, len, "%s%s", prefix, name);
+	else if (postfix)
+		snprintf(buf, len, "%s%s", name, postfix);
+	else
+		snprintf(buf, len, "%s", name);
+}
+
+static int get_nvram_var(const char *prefix, const char *postfix,
+			 const char *name, char *buf, int len, bool fallback)
+{
+	char key[40];
+	int err;
+
+	create_key(prefix, postfix, name, key, sizeof(key));
+
+	err = bcm47xx_nvram_getenv(key, buf, len);
+	if (fallback && err == -ENOENT && prefix) {
+		create_key(NULL, postfix, name, key, sizeof(key));
+		err = bcm47xx_nvram_getenv(key, buf, len);
+	}
+	return err;
+}
+
+#define NVRAM_READ_VAL(type)						\
+static void nvram_read_ ## type(const char *prefix,			\
+				const char *postfix, const char *name,	\
+				type *val, type allset, bool fallback)	\
+{									\
+	char buf[100];							\
+	int err;							\
+	type var;							\
+									\
+	err = get_nvram_var(prefix, postfix, name, buf, sizeof(buf),	\
+			    fallback);					\
+	if (err < 0)							\
+		return;							\
+	err = kstrto ## type(strim(buf), 0, &var);			\
+	if (err) {							\
+		pr_warn("can not parse nvram name %s%s%s with value %s got %i\n",	\
+			prefix, name, postfix, buf, err);		\
+		return;							\
+	}								\
+	if (allset && var == allset)					\
+		return;							\
+	*val = var;							\
+}
+
+NVRAM_READ_VAL(u8)
+NVRAM_READ_VAL(s8)
+NVRAM_READ_VAL(u16)
+NVRAM_READ_VAL(u32)
+
+#undef NVRAM_READ_VAL
+
+static void nvram_read_u32_2(const char *prefix, const char *name,
+			     u16 *val_lo, u16 *val_hi, bool fallback)
+{
+	char buf[100];
+	int err;
+	u32 val;
+
+	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
+	if (err < 0)
+		return;
+	err = kstrtou32(strim(buf), 0, &val);
+	if (err) {
+		pr_warn("can not parse nvram name %s%s with value %s got %i\n",
+			prefix, name, buf, err);
+		return;
+	}
+	*val_lo = (val & 0x0000FFFFU);
+	*val_hi = (val & 0xFFFF0000U) >> 16;
+}
+
+static void nvram_read_leddc(const char *prefix, const char *name,
+			     u8 *leddc_on_time, u8 *leddc_off_time,
+			     bool fallback)
+{
+	char buf[100];
+	int err;
+	u32 val;
+
+	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
+	if (err < 0)
+		return;
+	err = kstrtou32(strim(buf), 0, &val);
+	if (err) {
+		pr_warn("can not parse nvram name %s%s with value %s got %i\n",
+			prefix, name, buf, err);
+		return;
+	}
+
+	if (val == 0xffff || val == 0xffffffff)
+		return;
+
+	*leddc_on_time = val & 0xff;
+	*leddc_off_time = (val >> 16) & 0xff;
+}
+
+static void bcm47xx_nvram_parse_macaddr(char *buf, u8 macaddr[6])
+{
+	if (strchr(buf, ':'))
+		sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0],
+			&macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4],
+			&macaddr[5]);
+	else if (strchr(buf, '-'))
+		sscanf(buf, "%hhx-%hhx-%hhx-%hhx-%hhx-%hhx", &macaddr[0],
+			&macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4],
+			&macaddr[5]);
+	else
+		pr_warn("Can not parse mac address: %s\n", buf);
+}
+
+static void nvram_read_macaddr(const char *prefix, const char *name,
+			       u8 val[6], bool fallback)
+{
+	char buf[100];
+	int err;
+
+	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
+	if (err < 0)
+		return;
+
+	bcm47xx_nvram_parse_macaddr(buf, val);
+}
+
+static void nvram_read_alpha2(const char *prefix, const char *name,
+			     char val[2], bool fallback)
+{
+	char buf[10];
+	int err;
+
+	err = get_nvram_var(prefix, NULL, name, buf, sizeof(buf), fallback);
+	if (err < 0)
+		return;
+	if (buf[0] == '0')
+		return;
+	if (strlen(buf) > 2) {
+		pr_warn("alpha2 is too long %s\n", buf);
+		return;
+	}
+	memcpy(val, buf, 2);
+}
+
+/* This is one-function-only macro, it uses local "sprom" variable! */
+#define ENTRY(_revmask, _type, _prefix, _name, _val, _allset, _fallback) \
+	if (_revmask & BIT(sprom->revision)) \
+		nvram_read_ ## _type(_prefix, NULL, _name, &sprom->_val, \
+				     _allset, _fallback)
+/*
+ * Special version of filling function that can be safely called for any SPROM
+ * revision. For every NVRAM to SPROM mapping it contains bitmask of revisions
+ * for which the mapping is valid.
+ * It obviously requires some hexadecimal/bitmasks knowledge, but allows
+ * writing cleaner code (easy revisions handling).
+ * Note that while SPROM revision 0 was never used, we still keep BIT(0)
+ * reserved for it, just to keep numbering sane.
+ */
+static void bcm47xx_sprom_fill_auto(struct ssb_sprom *sprom,
+				    const char *prefix, bool fallback)
+{
+	const char *pre = prefix;
+	bool fb = fallback;
+
+	/* Broadcom extracts it for rev 8+ but it was found on 2 and 4 too */
+	ENTRY(0xfffffffe, u16, pre, "devid", dev_id, 0, fallback);
+
+	ENTRY(0xfffffffe, u16, pre, "boardrev", board_rev, 0, true);
+	ENTRY(0xfffffffe, u32, pre, "boardflags", boardflags, 0, fb);
+	ENTRY(0xfffffff0, u32, pre, "boardflags2", boardflags2, 0, fb);
+	ENTRY(0xfffff800, u32, pre, "boardflags3", boardflags3, 0, fb);
+	ENTRY(0x00000002, u16, pre, "boardflags", boardflags_lo, 0, fb);
+	ENTRY(0xfffffffc, u16, pre, "boardtype", board_type, 0, true);
+	ENTRY(0xfffffffe, u16, pre, "boardnum", board_num, 0, fb);
+	ENTRY(0x00000002, u8, pre, "cc", country_code, 0, fb);
+	ENTRY(0xfffffff8, u8, pre, "regrev", regrev, 0, fb);
+
+	ENTRY(0xfffffffe, u8, pre, "ledbh0", gpio0, 0xff, fb);
+	ENTRY(0xfffffffe, u8, pre, "ledbh1", gpio1, 0xff, fb);
+	ENTRY(0xfffffffe, u8, pre, "ledbh2", gpio2, 0xff, fb);
+	ENTRY(0xfffffffe, u8, pre, "ledbh3", gpio3, 0xff, fb);
+
+	ENTRY(0x0000070e, u16, pre, "pa0b0", pa0b0, 0, fb);
+	ENTRY(0x0000070e, u16, pre, "pa0b1", pa0b1, 0, fb);
+	ENTRY(0x0000070e, u16, pre, "pa0b2", pa0b2, 0, fb);
+	ENTRY(0x0000070e, u8, pre, "pa0itssit", itssi_bg, 0, fb);
+	ENTRY(0x0000070e, u8, pre, "pa0maxpwr", maxpwr_bg, 0, fb);
+
+	ENTRY(0x0000070c, u8, pre, "opo", opo, 0, fb);
+	ENTRY(0xfffffffe, u8, pre, "aa2g", ant_available_bg, 0, fb);
+	ENTRY(0xfffffffe, u8, pre, "aa5g", ant_available_a, 0, fb);
+	ENTRY(0x000007fe, s8, pre, "ag0", antenna_gain.a0, 0, fb);
+	ENTRY(0x000007fe, s8, pre, "ag1", antenna_gain.a1, 0, fb);
+	ENTRY(0x000007f0, s8, pre, "ag2", antenna_gain.a2, 0, fb);
+	ENTRY(0x000007f0, s8, pre, "ag3", antenna_gain.a3, 0, fb);
+
+	ENTRY(0x0000070e, u16, pre, "pa1b0", pa1b0, 0, fb);
+	ENTRY(0x0000070e, u16, pre, "pa1b1", pa1b1, 0, fb);
+	ENTRY(0x0000070e, u16, pre, "pa1b2", pa1b2, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1lob0", pa1lob0, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1lob1", pa1lob1, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1lob2", pa1lob2, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1hib0", pa1hib0, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1hib1", pa1hib1, 0, fb);
+	ENTRY(0x0000070c, u16, pre, "pa1hib2", pa1hib2, 0, fb);
+	ENTRY(0x0000070e, u8, pre, "pa1itssit", itssi_a, 0, fb);
+	ENTRY(0x0000070e, u8, pre, "pa1maxpwr", maxpwr_a, 0, fb);
+	ENTRY(0x0000070c, u8, pre, "pa1lomaxpwr", maxpwr_al, 0, fb);
+	ENTRY(0x0000070c, u8, pre, "pa1himaxpwr", maxpwr_ah, 0, fb);
+
+	ENTRY(0x00000708, u8, pre, "bxa2g", bxa2g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssisav2g", rssisav2g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssismc2g", rssismc2g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssismf2g", rssismf2g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "bxa5g", bxa5g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssisav5g", rssisav5g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssismc5g", rssismc5g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "rssismf5g", rssismf5g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "tri2g", tri2g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "tri5g", tri5g, 0, fb);
+	ENTRY(0x00000708, u8, pre, "tri5gl", tri5gl, 0, fb);
+	ENTRY(0x00000708, u8, pre, "tri5gh", tri5gh, 0, fb);
+	ENTRY(0x00000708, s8, pre, "rxpo2g", rxpo2g, 0, fb);
+	ENTRY(0x00000708, s8, pre, "rxpo5g", rxpo5g, 0, fb);
+	ENTRY(0xfffffff0, u8, pre, "txchain", txchain, 0xf, fb);
+	ENTRY(0xfffffff0, u8, pre, "rxchain", rxchain, 0xf, fb);
+	ENTRY(0xfffffff0, u8, pre, "antswitch", antswitch, 0xff, fb);
+	ENTRY(0x00000700, u8, pre, "tssipos2g", fem.ghz2.tssipos, 0, fb);
+	ENTRY(0x00000700, u8, pre, "extpagain2g", fem.ghz2.extpa_gain, 0, fb);
+	ENTRY(0x00000700, u8, pre, "pdetrange2g", fem.ghz2.pdet_range, 0, fb);
+	ENTRY(0x00000700, u8, pre, "triso2g", fem.ghz2.tr_iso, 0, fb);
+	ENTRY(0x00000700, u8, pre, "antswctl2g", fem.ghz2.antswlut, 0, fb);
+	ENTRY(0x00000700, u8, pre, "tssipos5g", fem.ghz5.tssipos, 0, fb);
+	ENTRY(0x00000700, u8, pre, "extpagain5g", fem.ghz5.extpa_gain, 0, fb);
+	ENTRY(0x00000700, u8, pre, "pdetrange5g", fem.ghz5.pdet_range, 0, fb);
+	ENTRY(0x00000700, u8, pre, "triso5g", fem.ghz5.tr_iso, 0, fb);
+	ENTRY(0x00000700, u8, pre, "antswctl5g", fem.ghz5.antswlut, 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid2ga0", txpid2g[0], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid2ga1", txpid2g[1], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid2ga2", txpid2g[2], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid2ga3", txpid2g[3], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5ga0", txpid5g[0], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5ga1", txpid5g[1], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5ga2", txpid5g[2], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5ga3", txpid5g[3], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gla0", txpid5gl[0], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gla1", txpid5gl[1], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gla2", txpid5gl[2], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gla3", txpid5gl[3], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gha0", txpid5gh[0], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gha1", txpid5gh[1], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gha2", txpid5gh[2], 0, fb);
+	ENTRY(0x000000f0, u8, pre, "txpid5gha3", txpid5gh[3], 0, fb);
+
+	ENTRY(0xffffff00, u8, pre, "tempthresh", tempthresh, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "tempoffset", tempoffset, 0, fb);
+	ENTRY(0xffffff00, u16, pre, "rawtempsense", rawtempsense, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "measpower", measpower, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "tempsense_slope", tempsense_slope, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "tempcorrx", tempcorrx, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "tempsense_option", tempsense_option, 0, fb);
+	ENTRY(0x00000700, u8, pre, "freqoffset_corr", freqoffset_corr, 0, fb);
+	ENTRY(0x00000700, u8, pre, "iqcal_swp_dis", iqcal_swp_dis, 0, fb);
+	ENTRY(0x00000700, u8, pre, "hw_iqcal_en", hw_iqcal_en, 0, fb);
+	ENTRY(0x00000700, u8, pre, "elna2g", elna2g, 0, fb);
+	ENTRY(0x00000700, u8, pre, "elna5g", elna5g, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "phycal_tempdelta", phycal_tempdelta, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "temps_period", temps_period, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "temps_hysteresis", temps_hysteresis, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "measpower1", measpower1, 0, fb);
+	ENTRY(0xffffff00, u8, pre, "measpower2", measpower2, 0, fb);
+
+	ENTRY(0x000001f0, u16, pre, "cck2gpo", cck2gpo, 0, fb);
+	ENTRY(0x000001f0, u32, pre, "ofdm2gpo", ofdm2gpo, 0, fb);
+	ENTRY(0x000001f0, u32, pre, "ofdm5gpo", ofdm5gpo, 0, fb);
+	ENTRY(0x000001f0, u32, pre, "ofdm5glpo", ofdm5glpo, 0, fb);
+	ENTRY(0x000001f0, u32, pre, "ofdm5ghpo", ofdm5ghpo, 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo0", mcs2gpo[0], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo1", mcs2gpo[1], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo2", mcs2gpo[2], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo3", mcs2gpo[3], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo4", mcs2gpo[4], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo5", mcs2gpo[5], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo6", mcs2gpo[6], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs2gpo7", mcs2gpo[7], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo0", mcs5gpo[0], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo1", mcs5gpo[1], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo2", mcs5gpo[2], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo3", mcs5gpo[3], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo4", mcs5gpo[4], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo5", mcs5gpo[5], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo6", mcs5gpo[6], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5gpo7", mcs5gpo[7], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo0", mcs5glpo[0], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo1", mcs5glpo[1], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo2", mcs5glpo[2], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo3", mcs5glpo[3], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo4", mcs5glpo[4], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo5", mcs5glpo[5], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo6", mcs5glpo[6], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5glpo7", mcs5glpo[7], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo0", mcs5ghpo[0], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo1", mcs5ghpo[1], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo2", mcs5ghpo[2], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo3", mcs5ghpo[3], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo4", mcs5ghpo[4], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo5", mcs5ghpo[5], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo6", mcs5ghpo[6], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "mcs5ghpo7", mcs5ghpo[7], 0, fb);
+	ENTRY(0x000001f0, u16, pre, "cddpo", cddpo, 0, fb);
+	ENTRY(0x000001f0, u16, pre, "stbcpo", stbcpo, 0, fb);
+	ENTRY(0x000001f0, u16, pre, "bw40po", bw40po, 0, fb);
+	ENTRY(0x000001f0, u16, pre, "bwduppo", bwduppo, 0, fb);
+
+	ENTRY(0xfffffe00, u16, pre, "cckbw202gpo", cckbw202gpo, 0, fb);
+	ENTRY(0xfffffe00, u16, pre, "cckbw20ul2gpo", cckbw20ul2gpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw202gpo", legofdmbw202gpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw20ul2gpo", legofdmbw20ul2gpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw205glpo", legofdmbw205glpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5glpo", legofdmbw20ul5glpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw205gmpo", legofdmbw205gmpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5gmpo", legofdmbw20ul5gmpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw205ghpo", legofdmbw205ghpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "legofdmbw20ul5ghpo", legofdmbw20ul5ghpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw202gpo", mcsbw202gpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "mcsbw20ul2gpo", mcsbw20ul2gpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw402gpo", mcsbw402gpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw205glpo", mcsbw205glpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "mcsbw20ul5glpo", mcsbw20ul5glpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw405glpo", mcsbw405glpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw205gmpo", mcsbw205gmpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "mcsbw20ul5gmpo", mcsbw20ul5gmpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw405gmpo", mcsbw405gmpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw205ghpo", mcsbw205ghpo, 0, fb);
+	ENTRY(0x00000600, u32, pre, "mcsbw20ul5ghpo", mcsbw20ul5ghpo, 0, fb);
+	ENTRY(0xfffffe00, u32, pre, "mcsbw405ghpo", mcsbw405ghpo, 0, fb);
+	ENTRY(0x00000600, u16, pre, "mcs32po", mcs32po, 0, fb);
+	ENTRY(0x00000600, u16, pre, "legofdm40duppo", legofdm40duppo, 0, fb);
+	ENTRY(0x00000700, u8, pre, "pcieingress_war", pcieingress_war, 0, fb);
+
+	/* TODO: rev 11 support */
+	ENTRY(0x00000700, u8, pre, "rxgainerr2ga0", rxgainerr2ga[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr2ga1", rxgainerr2ga[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr2ga2", rxgainerr2ga[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gla0", rxgainerr5gla[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gla1", rxgainerr5gla[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gla2", rxgainerr5gla[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gma0", rxgainerr5gma[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gma1", rxgainerr5gma[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gma2", rxgainerr5gma[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gha0", rxgainerr5gha[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gha1", rxgainerr5gha[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gha2", rxgainerr5gha[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gua0", rxgainerr5gua[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gua1", rxgainerr5gua[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "rxgainerr5gua2", rxgainerr5gua[2], 0, fb);
+
+	ENTRY(0xfffffe00, u8, pre, "sar2g", sar2g, 0, fb);
+	ENTRY(0xfffffe00, u8, pre, "sar5g", sar5g, 0, fb);
+
+	/* TODO: rev 11 support */
+	ENTRY(0x00000700, u8, pre, "noiselvl2ga0", noiselvl2ga[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl2ga1", noiselvl2ga[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl2ga2", noiselvl2ga[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gla0", noiselvl5gla[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gla1", noiselvl5gla[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gla2", noiselvl5gla[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gma0", noiselvl5gma[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gma1", noiselvl5gma[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gma2", noiselvl5gma[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gha0", noiselvl5gha[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gha1", noiselvl5gha[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gha2", noiselvl5gha[2], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gua0", noiselvl5gua[0], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gua1", noiselvl5gua[1], 0, fb);
+	ENTRY(0x00000700, u8, pre, "noiselvl5gua2", noiselvl5gua[2], 0, fb);
+}
+#undef ENTRY /* It's specififc, uses local variable, don't use it (again). */
+
+static void bcm47xx_fill_sprom_path_r4589(struct ssb_sprom *sprom,
+					  const char *prefix, bool fallback)
+{
+	char postfix[2];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) {
+		struct ssb_sprom_core_pwr_info *pwr_info;
+
+		pwr_info = &sprom->core_pwr_info[i];
+
+		snprintf(postfix, sizeof(postfix), "%i", i);
+		nvram_read_u8(prefix, postfix, "maxp2ga",
+			      &pwr_info->maxpwr_2g, 0, fallback);
+		nvram_read_u8(prefix, postfix, "itt2ga",
+			      &pwr_info->itssi_2g, 0, fallback);
+		nvram_read_u8(prefix, postfix, "itt5ga",
+			      &pwr_info->itssi_5g, 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa2gw0a",
+			       &pwr_info->pa_2g[0], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa2gw1a",
+			       &pwr_info->pa_2g[1], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa2gw2a",
+			       &pwr_info->pa_2g[2], 0, fallback);
+		nvram_read_u8(prefix, postfix, "maxp5ga",
+			      &pwr_info->maxpwr_5g, 0, fallback);
+		nvram_read_u8(prefix, postfix, "maxp5gha",
+			      &pwr_info->maxpwr_5gh, 0, fallback);
+		nvram_read_u8(prefix, postfix, "maxp5gla",
+			      &pwr_info->maxpwr_5gl, 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5gw0a",
+			       &pwr_info->pa_5g[0], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5gw1a",
+			       &pwr_info->pa_5g[1], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5gw2a",
+			       &pwr_info->pa_5g[2], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5glw0a",
+			       &pwr_info->pa_5gl[0], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5glw1a",
+			       &pwr_info->pa_5gl[1], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5glw2a",
+			       &pwr_info->pa_5gl[2], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5ghw0a",
+			       &pwr_info->pa_5gh[0], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5ghw1a",
+			       &pwr_info->pa_5gh[1], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5ghw2a",
+			       &pwr_info->pa_5gh[2], 0, fallback);
+	}
+}
+
+static void bcm47xx_fill_sprom_path_r45(struct ssb_sprom *sprom,
+					const char *prefix, bool fallback)
+{
+	char postfix[2];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sprom->core_pwr_info); i++) {
+		struct ssb_sprom_core_pwr_info *pwr_info;
+
+		pwr_info = &sprom->core_pwr_info[i];
+
+		snprintf(postfix, sizeof(postfix), "%i", i);
+		nvram_read_u16(prefix, postfix, "pa2gw3a",
+			       &pwr_info->pa_2g[3], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5gw3a",
+			       &pwr_info->pa_5g[3], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5glw3a",
+			       &pwr_info->pa_5gl[3], 0, fallback);
+		nvram_read_u16(prefix, postfix, "pa5ghw3a",
+			       &pwr_info->pa_5gh[3], 0, fallback);
+	}
+}
+
+static bool bcm47xx_is_valid_mac(u8 *mac)
+{
+	return mac && !(mac[0] == 0x00 && mac[1] == 0x90 && mac[2] == 0x4c);
+}
+
+static int bcm47xx_increase_mac_addr(u8 *mac, u8 num)
+{
+	u8 *oui = mac + ETH_ALEN/2 - 1;
+	u8 *p = mac + ETH_ALEN - 1;
+
+	do {
+		(*p) += num;
+		if (*p > num)
+			break;
+		p--;
+		num = 1;
+	} while (p != oui);
+
+	if (p == oui) {
+		pr_err("unable to fetch mac address\n");
+		return -ENOENT;
+	}
+	return 0;
+}
+
+static int mac_addr_used = 2;
+
+static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom,
+					const char *prefix, bool fallback)
+{
+	bool fb = fallback;
+
+	nvram_read_macaddr(prefix, "et0macaddr", sprom->et0mac, fallback);
+	nvram_read_u8(prefix, NULL, "et0mdcport", &sprom->et0mdcport, 0,
+		      fallback);
+	nvram_read_u8(prefix, NULL, "et0phyaddr", &sprom->et0phyaddr, 0,
+		      fallback);
+
+	nvram_read_macaddr(prefix, "et1macaddr", sprom->et1mac, fallback);
+	nvram_read_u8(prefix, NULL, "et1mdcport", &sprom->et1mdcport, 0,
+		      fallback);
+	nvram_read_u8(prefix, NULL, "et1phyaddr", &sprom->et1phyaddr, 0,
+		      fallback);
+
+	nvram_read_macaddr(prefix, "et2macaddr", sprom->et2mac, fb);
+	nvram_read_u8(prefix, NULL, "et2mdcport", &sprom->et2mdcport, 0, fb);
+	nvram_read_u8(prefix, NULL, "et2phyaddr", &sprom->et2phyaddr, 0, fb);
+
+	nvram_read_macaddr(prefix, "macaddr", sprom->il0mac, fallback);
+	nvram_read_macaddr(prefix, "il0macaddr", sprom->il0mac, fallback);
+
+	/* The address prefix 00:90:4C is used by Broadcom in their initial
+	 * configuration. When a mac address with the prefix 00:90:4C is used
+	 * all devices from the same series are sharing the same mac address.
+	 * To prevent mac address collisions we replace them with a mac address
+	 * based on the base address.
+	 */
+	if (!bcm47xx_is_valid_mac(sprom->il0mac)) {
+		u8 mac[6];
+
+		nvram_read_macaddr(NULL, "et0macaddr", mac, false);
+		if (bcm47xx_is_valid_mac(mac)) {
+			int err = bcm47xx_increase_mac_addr(mac, mac_addr_used);
+
+			if (!err) {
+				ether_addr_copy(sprom->il0mac, mac);
+				mac_addr_used++;
+			}
+		}
+	}
+}
+
+static void bcm47xx_fill_board_data(struct ssb_sprom *sprom, const char *prefix,
+				    bool fallback)
+{
+	nvram_read_u32_2(prefix, "boardflags", &sprom->boardflags_lo,
+			 &sprom->boardflags_hi, fallback);
+	nvram_read_u32_2(prefix, "boardflags2", &sprom->boardflags2_lo,
+			 &sprom->boardflags2_hi, fallback);
+}
+
+void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix,
+			bool fallback)
+{
+	bcm47xx_fill_sprom_ethernet(sprom, prefix, fallback);
+	bcm47xx_fill_board_data(sprom, prefix, fallback);
+
+	nvram_read_u8(prefix, NULL, "sromrev", &sprom->revision, 0, fallback);
+
+	/* Entries requiring custom functions */
+	nvram_read_alpha2(prefix, "ccode", sprom->alpha2, fallback);
+	if (sprom->revision >= 3)
+		nvram_read_leddc(prefix, "leddc", &sprom->leddc_on_time,
+				 &sprom->leddc_off_time, fallback);
+
+	switch (sprom->revision) {
+	case 4:
+	case 5:
+		bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback);
+		bcm47xx_fill_sprom_path_r45(sprom, prefix, fallback);
+		break;
+	case 8:
+	case 9:
+		bcm47xx_fill_sprom_path_r4589(sprom, prefix, fallback);
+		break;
+	}
+
+	bcm47xx_sprom_fill_auto(sprom, prefix, fallback);
+}
+
+#if IS_BUILTIN(CONFIG_SSB) && IS_ENABLED(CONFIG_SSB_SPROM)
+static int bcm47xx_get_sprom_ssb(struct ssb_bus *bus, struct ssb_sprom *out)
+{
+	char prefix[10];
+
+	switch (bus->bustype) {
+	case SSB_BUSTYPE_SSB:
+		bcm47xx_fill_sprom(out, NULL, false);
+		return 0;
+	case SSB_BUSTYPE_PCI:
+		memset(out, 0, sizeof(struct ssb_sprom));
+		snprintf(prefix, sizeof(prefix), "pci/%u/%u/",
+			 bus->host_pci->bus->number + 1,
+			 PCI_SLOT(bus->host_pci->devfn));
+		bcm47xx_fill_sprom(out, prefix, false);
+		return 0;
+	default:
+		pr_warn("Unable to fill SPROM for given bustype.\n");
+		return -EINVAL;
+	}
+}
+#endif
+
+#if IS_BUILTIN(CONFIG_BCMA)
+/*
+ * Having many NVRAM entries for PCI devices led to repeating prefixes like
+ * pci/1/1/ all the time and wasting flash space. So at some point Broadcom
+ * decided to introduce prefixes like 0: 1: 2: etc.
+ * If we find e.g. devpath0=pci/2/1 or devpath0=pci/2/1/ we should use 0:
+ * instead of pci/2/1/.
+ */
+static void bcm47xx_sprom_apply_prefix_alias(char *prefix, size_t prefix_size)
+{
+	size_t prefix_len = strlen(prefix);
+	size_t short_len = prefix_len - 1;
+	char nvram_var[10];
+	char buf[20];
+	int i;
+
+	/* Passed prefix has to end with a slash */
+	if (prefix_len <= 0 || prefix[prefix_len - 1] != '/')
+		return;
+
+	for (i = 0; i < 3; i++) {
+		if (snprintf(nvram_var, sizeof(nvram_var), "devpath%d", i) <= 0)
+			continue;
+		if (bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf)) < 0)
+			continue;
+		if (!strcmp(buf, prefix) ||
+		    (short_len && strlen(buf) == short_len && !strncmp(buf, prefix, short_len))) {
+			snprintf(prefix, prefix_size, "%d:", i);
+			return;
+		}
+	}
+}
+
+static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out)
+{
+	struct bcma_boardinfo *binfo = &bus->boardinfo;
+	struct bcma_device *core;
+	char buf[10];
+	char *prefix;
+	bool fallback = false;
+
+	switch (bus->hosttype) {
+	case BCMA_HOSTTYPE_PCI:
+		memset(out, 0, sizeof(struct ssb_sprom));
+		/* On BCM47XX all PCI buses share the same domain */
+		if (config_enabled(CONFIG_BCM47XX))
+			snprintf(buf, sizeof(buf), "pci/%u/%u/",
+				 bus->host_pci->bus->number + 1,
+				 PCI_SLOT(bus->host_pci->devfn));
+		else
+			snprintf(buf, sizeof(buf), "pci/%u/%u/",
+				 pci_domain_nr(bus->host_pci->bus) + 1,
+				 bus->host_pci->bus->number);
+		bcm47xx_sprom_apply_prefix_alias(buf, sizeof(buf));
+		prefix = buf;
+		break;
+	case BCMA_HOSTTYPE_SOC:
+		memset(out, 0, sizeof(struct ssb_sprom));
+		core = bcma_find_core(bus, BCMA_CORE_80211);
+		if (core) {
+			snprintf(buf, sizeof(buf), "sb/%u/",
+				 core->core_index);
+			prefix = buf;
+			fallback = true;
+		} else {
+			prefix = NULL;
+		}
+		break;
+	default:
+		pr_warn("Unable to fill SPROM for given bustype.\n");
+		return -EINVAL;
+	}
+
+	nvram_read_u16(prefix, NULL, "boardvendor", &binfo->vendor, 0, true);
+	if (!binfo->vendor)
+		binfo->vendor = SSB_BOARDVENDOR_BCM;
+	nvram_read_u16(prefix, NULL, "boardtype", &binfo->type, 0, true);
+
+	bcm47xx_fill_sprom(out, prefix, fallback);
+
+	return 0;
+}
+#endif
+
+static unsigned int bcm47xx_sprom_registered;
+
+/*
+ * On bcm47xx we need to register SPROM fallback handler very early, so we can't
+ * use anything like platform device / driver for this.
+ */
+int bcm47xx_sprom_register_fallbacks(void)
+{
+	if (bcm47xx_sprom_registered)
+		return 0;
+
+#if IS_BUILTIN(CONFIG_SSB) && IS_ENABLED(CONFIG_SSB_SPROM)
+	if (ssb_arch_register_fallback_sprom(&bcm47xx_get_sprom_ssb))
+		pr_warn("Failed to register ssb SPROM handler\n");
+#endif
+
+#if IS_BUILTIN(CONFIG_BCMA)
+	if (bcma_arch_register_fallback_sprom(&bcm47xx_get_sprom_bcma))
+		pr_warn("Failed to register bcma SPROM handler\n");
+#endif
+
+	bcm47xx_sprom_registered = 1;
+
+	return 0;
+}
+
+fs_initcall(bcm47xx_sprom_register_fallbacks);
diff --git a/include/linux/bcm47xx_sprom.h b/include/linux/bcm47xx_sprom.h
new file mode 100644
index 000000000000..c06b47c84e1a
--- /dev/null
+++ b/include/linux/bcm47xx_sprom.h
@@ -0,0 +1,24 @@
+/*
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ */
+
+#ifndef __BCM47XX_SPROM_H
+#define __BCM47XX_SPROM_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_BCM47XX_SPROM
+int bcm47xx_sprom_register_fallbacks(void);
+#else
+static inline int bcm47xx_sprom_register_fallbacks(void)
+{
+	return -ENOTSUPP;
+};
+#endif
+
+#endif /* __BCM47XX_SPROM_H */
-- 
cgit v1.2.3


From 835d2b452969820fd67a755a2c01fb6e12822448 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Wed, 3 Feb 2016 03:15:28 +0000
Subject: irqchip: mips-gic: Provide VP ID accessor

Provide a gic_read_local_vp_id() function to read the VCNUM field of the
GICs local VP_IDENT register. This will be used by a further patch to
check that the value reported by the GIC matches up with the kernels
calculation.

Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Acked-by: Jason Cooper <jason@lakedaemon.net>
Cc: Andrew Bresticker <abrestic@chromium.org>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/12334/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 drivers/irqchip/irq-mips-gic.c   |  8 ++++++++
 include/linux/irqchip/mips-gic.h | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index e28311f237f5..c089f49b63fb 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -246,6 +246,14 @@ void gic_stop_count(void)
 
 #endif
 
+unsigned gic_read_local_vp_id(void)
+{
+	unsigned long ident;
+
+	ident = gic_read(GIC_REG(VPE_LOCAL, GIC_VP_IDENT));
+	return ident & GIC_VP_IDENT_VCNUM_MSK;
+}
+
 static bool gic_local_irq_is_routable(int intr)
 {
 	u32 vpe_ctl;
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index 80f89e4a29ac..81f930b0bca9 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -103,6 +103,7 @@
 #define GIC_VPE_SWINT0_MAP_OFS		0x0054
 #define GIC_VPE_SWINT1_MAP_OFS		0x0058
 #define GIC_VPE_OTHER_ADDR_OFS		0x0080
+#define GIC_VP_IDENT_OFS		0x0088
 #define GIC_VPE_WD_CONFIG0_OFS		0x0090
 #define GIC_VPE_WD_COUNT0_OFS		0x0094
 #define GIC_VPE_WD_INITIAL0_OFS		0x0098
@@ -211,6 +212,10 @@
 #define GIC_VPE_SMASK_FDC_SHF		6
 #define GIC_VPE_SMASK_FDC_MSK		(MSK(1) << GIC_VPE_SMASK_FDC_SHF)
 
+/* GIC_VP_IDENT fields */
+#define GIC_VP_IDENT_VCNUM_SHF		0
+#define GIC_VP_IDENT_VCNUM_MSK		(MSK(6) << GIC_VP_IDENT_VCNUM_SHF)
+
 /* GIC nomenclature for Core Interrupt Pins. */
 #define GIC_CPU_INT0		0 /* Core Interrupt 2 */
 #define GIC_CPU_INT1		1 /* .		      */
@@ -278,4 +283,16 @@ static inline int gic_get_usm_range(struct resource *gic_usm_res)
 
 #endif /* CONFIG_MIPS_GIC */
 
+/**
+ * gic_read_local_vp_id() - read the local VPs VCNUM
+ *
+ * Read the VCNUM of the local VP from the GIC_VP_IDENT register and
+ * return it to the caller. This ID should be used to refer to the VP
+ * via the GICs VP-other region, or when calculating an offset to a
+ * bit representing the VP in interrupt masks.
+ *
+ * Return: The VCNUM value for the local VP.
+ */
+extern unsigned gic_read_local_vp_id(void);
+
 #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */
-- 
cgit v1.2.3


From e55d5312444087eb6bfb34c1cd5f6e0bf626cf26 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 11 Feb 2016 13:36:54 +0100
Subject: crash_dump: Add vmcore_elf32_check_arch

parse_crash_elf{32|64}_headers will check the headers via the
elf_check_arch respectively vmcore_elf64_check_arch macro.

The MIPS architecture implements those two macros differently.
In order to make the differentiation more explicit, let's introduce
an vmcore_elf32_check_arch to allow the archs to overwrite it.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Suggested-by: Maciej W. Rozycki <macro@imgtec.com>
Reviewed-by: Maciej W. Rozycki <macro@imgtec.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/12535/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 fs/proc/vmcore.c           | 2 +-
 include/linux/crash_dump.h | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 8afe10cf7df8..8ab782d8b33d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void)
 	/* Do some basic Verification. */
 	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
 		(ehdr.e_type != ET_CORE) ||
-		!elf_check_arch(&ehdr) ||
+		!vmcore_elf32_check_arch(&ehdr) ||
 		ehdr.e_ident[EI_CLASS] != ELFCLASS32||
 		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
 		ehdr.e_version != EV_CURRENT ||
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 3849fce7ecfe..3873697ba21c 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -34,9 +34,13 @@ void vmcore_cleanup(void);
 
 /*
  * Architecture code can redefine this if there are any special checks
- * needed for 64-bit ELF vmcores. In case of 32-bit only architecture,
- * this can be set to zero.
+ * needed for 32-bit ELF or 64-bit ELF vmcores.  In case of 32-bit
+ * only architecture, vmcore_elf64_check_arch can be set to zero.
  */
+#ifndef vmcore_elf32_check_arch
+#define vmcore_elf32_check_arch(x) elf_check_arch(x)
+#endif
+
 #ifndef vmcore_elf64_check_arch
 #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 #endif
-- 
cgit v1.2.3


From ca9eb49aa9562eaadf3cea071ec7018ad6800425 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 8 Feb 2016 18:43:50 +0000
Subject: SIGNAL: Move generic copy_siginfo() to signal.h

The generic copy_siginfo() is currently defined in
asm-generic/siginfo.h, after including uapi/asm-generic/siginfo.h which
defines the generic struct siginfo. However this makes it awkward for an
architecture to use it if it has to define its own struct siginfo (e.g.
MIPS and potentially IA64), since it means that asm-generic/siginfo.h
can only be included after defining the arch-specific siginfo, which may
be problematic if the arch-specific definition needs definitions from
uapi/asm-generic/siginfo.h.

It is possible to work around this by first including
uapi/asm-generic/siginfo.h to get the constants before defining the
arch-specific siginfo, and include asm-generic/siginfo.h after. However
uapi headers can't be included by other uapi headers, so that first
include has to be in an ifdef __kernel__, with the non __kernel__ case
including the non-UAPI header instead.

Instead of that mess, move the generic copy_siginfo() definition into
linux/signal.h, which allows an arch-specific uapi/asm/siginfo.h to
include asm-generic/siginfo.h and define the arch-specific siginfo, and
for the generic copy_siginfo() to see that arch-specific definition.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Petr Malat <oss@malat.biz>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Christopher Ferris <cferris@google.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org> # 4.0-
Patchwork: https://patchwork.linux-mips.org/patch/12478/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 include/asm-generic/siginfo.h | 15 ---------------
 include/linux/signal.h        | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 3d1a3af5cf59..a2508a8f9a9c 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -17,21 +17,6 @@
 struct siginfo;
 void do_schedule_next_timer(struct siginfo *info);
 
-#ifndef HAVE_ARCH_COPY_SIGINFO
-
-#include <linux/string.h>
-
-static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
-{
-	if (from->si_code < 0)
-		memcpy(to, from, sizeof(*to));
-	else
-		/* _sigchld is currently the largest know union member */
-		memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
-}
-
-#endif
-
 extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from);
 
 #endif
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 92557bbce7e7..d80259afb9e5 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -28,6 +28,21 @@ struct sigpending {
 	sigset_t signal;
 };
 
+#ifndef HAVE_ARCH_COPY_SIGINFO
+
+#include <linux/string.h>
+
+static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
+{
+	if (from->si_code < 0)
+		memcpy(to, from, sizeof(*to));
+	else
+		/* _sigchld is currently the largest know union member */
+		memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
+}
+
+#endif
+
 /*
  * Define some primitives to manipulate sigset_t.
  */
-- 
cgit v1.2.3


From 3491caf2755e9f312666712510d80b00c81ff247 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 13 May 2016 12:16:35 +0200
Subject: KVM: halt_polling: provide a way to qualify wakeups during poll
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some wakeups should not be considered a sucessful poll. For example on
s390 I/O interrupts are usually floating, which means that _ALL_ CPUs
would be considered runnable - letting all vCPUs poll all the time for
transactional like workload, even if one vCPU would be enough.
This can result in huge CPU usage for large guests.
This patch lets architectures provide a way to qualify wakeups if they
should be considered a good/bad wakeups in regard to polls.

For s390 the implementation will fence of halt polling for anything but
known good, single vCPU events. The s390 implementation for floating
interrupts does a wakeup for one vCPU, but the interrupt will be delivered
by whatever CPU checks first for a pending interrupt. We prefer the
woken up CPU by marking the poll of this CPU as "good" poll.
This code will also mark several other wakeup reasons like IPI or
expired timers as "good". This will of course also mark some events as
not sucessful. As  KVM on z runs always as a 2nd level hypervisor,
we prefer to not poll, unless we are really sure, though.

This patch successfully limits the CPU usage for cases like uperf 1byte
transactional ping pong workload or wakeup heavy workload like OLTP
while still providing a proper speedup.

This also introduced a new vcpu stat "halt_poll_no_tuning" that marks
wakeups that are considered not good for polling.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Radim Krčmář <rkrcmar@redhat.com> (for an earlier version)
Cc: David Matlack <dmatlack@google.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
[Rename config symbol. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/include/asm/kvm_host.h     |  2 ++
 arch/arm64/include/asm/kvm_host.h   |  2 ++
 arch/mips/include/asm/kvm_host.h    |  2 ++
 arch/mips/kvm/mips.c                |  1 +
 arch/powerpc/include/asm/kvm_host.h |  2 ++
 arch/powerpc/kvm/book3s.c           |  1 +
 arch/powerpc/kvm/booke.c            |  1 +
 arch/s390/include/asm/kvm_host.h    |  3 +++
 arch/s390/kvm/Kconfig               |  1 +
 arch/s390/kvm/interrupt.c           |  5 +++++
 arch/s390/kvm/kvm-s390.c            |  6 ++++++
 arch/x86/include/asm/kvm_host.h     |  2 ++
 arch/x86/kvm/x86.c                  |  1 +
 include/linux/kvm_host.h            | 15 +++++++++++++++
 include/trace/events/kvm.h          | 11 +++++++----
 virt/kvm/Kconfig                    |  3 +++
 virt/kvm/kvm_main.c                 |  8 ++++++--
 17 files changed, 60 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 385070180c25..4cd8732796ab 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -187,6 +187,7 @@ struct kvm_vm_stat {
 struct kvm_vcpu_stat {
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 	u32 hvc_exit_stat;
 	u64 wfe_exit_stat;
@@ -282,6 +283,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 static inline void kvm_arm_init_debug(void) {}
 static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f5c6bd2541ef..d49399d9890d 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -293,6 +293,7 @@ struct kvm_vm_stat {
 struct kvm_vcpu_stat {
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 	u32 hvc_exit_stat;
 	u64 wfe_exit_stat;
@@ -357,6 +358,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 942b8f6bf35b..9a37a1044032 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -122,6 +122,7 @@ struct kvm_vcpu_stat {
 	u32 flush_dcache_exits;
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 };
 
@@ -812,5 +813,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 #endif /* __MIPS_KVM_HOST_H__ */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 23b209463238..dc052fb5c7a2 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -56,6 +56,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
+	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
 	{ "halt_wakeup",  VCPU_STAT(halt_wakeup),	 KVM_STAT_VCPU },
 	{NULL}
 };
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a07645c17818..ec35af34a3fb 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -116,6 +116,7 @@ struct kvm_vcpu_stat {
 	u32 ext_intr_exits;
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 	u32 dbell_exits;
 	u32 gdbell_exits;
@@ -727,5 +728,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b34220d2aa42..47018fcbf7d6 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -54,6 +54,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "queue_intr",  VCPU_STAT(queue_intr) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "pf_storage",  VCPU_STAT(pf_storage) },
 	{ "sp_storage",  VCPU_STAT(sp_storage) },
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4d66f44a1657..4afae695899a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -64,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "doorbell", VCPU_STAT(dbell_exits) },
 	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9282ccf1d136..53d794538067 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -247,6 +247,7 @@ struct kvm_vcpu_stat {
 	u32 exit_instruction;
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 	u32 instruction_lctl;
 	u32 instruction_lctlg;
@@ -696,4 +697,6 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
+void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu);
+
 #endif
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 5ea5af3c7db7..b1900239b0ab 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_INVALID_WAKEUPS
 	select SRCU
 	select KVM_VFIO
 	---help---
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index e55040467eb5..5a80af740d3e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -977,6 +977,11 @@ no_timer:
 
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 {
+	/*
+	 * We cannot move this into the if, as the CPU might be already
+	 * in kvm_vcpu_block without having the waitqueue set (polling)
+	 */
+	vcpu->valid_wakeup = true;
 	if (swait_active(&vcpu->wq)) {
 		/*
 		 * The vcpu gave up the cpu voluntarily, mark it as a good
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c597201a5ca9..6d8ec3ac9dd8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -65,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
 	{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
@@ -2992,6 +2993,11 @@ static inline unsigned long nonhyp_mask(int i)
 	return 0x0000ffffffffffffUL >> (nonhyp_fai << 4);
 }
 
+void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu)
+{
+	vcpu->valid_wakeup = false;
+}
+
 static int __init kvm_s390_init(void)
 {
 	int i;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c66e26280707..c99494b4bdf7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -803,6 +803,7 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_successful_poll;
 	u32 halt_attempted_poll;
+	u32 halt_poll_invalid;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
@@ -1342,5 +1343,6 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c774cdf553c..bcef92fc41d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -161,6 +161,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_exits", VCPU_STAT(halt_exits) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 92a0229044fb..bbcd921d7cb0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -229,6 +229,7 @@ struct kvm_vcpu {
 	sigset_t sigset;
 	struct kvm_vcpu_stat stat;
 	unsigned int halt_poll_ns;
+	bool valid_wakeup;
 
 #ifdef CONFIG_HAS_IOMEM
 	int mmio_needed;
@@ -1196,4 +1197,18 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				  uint32_t guest_irq, bool set);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
 
+#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
+/* If we wakeup during the poll time, was it a sucessful poll? */
+static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
+{
+	return vcpu->valid_wakeup;
+}
+
+#else
+static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
+{
+	return true;
+}
+#endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */
+
 #endif
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index aa69253ecc7d..526fb3d2e43a 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -38,22 +38,25 @@ TRACE_EVENT(kvm_userspace_exit,
 );
 
 TRACE_EVENT(kvm_vcpu_wakeup,
-	    TP_PROTO(__u64 ns, bool waited),
-	    TP_ARGS(ns, waited),
+	    TP_PROTO(__u64 ns, bool waited, bool valid),
+	    TP_ARGS(ns, waited, valid),
 
 	TP_STRUCT__entry(
 		__field(	__u64,		ns		)
 		__field(	bool,		waited		)
+		__field(	bool,		valid		)
 	),
 
 	TP_fast_assign(
 		__entry->ns		= ns;
 		__entry->waited		= waited;
+		__entry->valid		= valid;
 	),
 
-	TP_printk("%s time %lld ns",
+	TP_printk("%s time %lld ns, polling %s",
 		  __entry->waited ? "wait" : "poll",
-		  __entry->ns)
+		  __entry->ns,
+		  __entry->valid ? "valid" : "invalid")
 );
 
 #if defined(CONFIG_HAVE_KVM_IRQFD)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7a79b6853583..e5d6108f5e85 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -41,6 +41,9 @@ config KVM_VFIO
 config HAVE_KVM_ARCH_TLB_FLUSH_ALL
        bool
 
+config HAVE_KVM_INVALID_WAKEUPS
+       bool
+
 config KVM_GENERIC_DIRTYLOG_READ_PROTECT
        bool
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ed3d9bb18a56..21f6498d52e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2028,6 +2028,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 			 */
 			if (kvm_vcpu_check_block(vcpu) < 0) {
 				++vcpu->stat.halt_successful_poll;
+				if (!vcpu_valid_wakeup(vcpu))
+					++vcpu->stat.halt_poll_invalid;
 				goto out;
 			}
 			cur = ktime_get();
@@ -2057,7 +2059,8 @@ out:
 		if (block_ns <= vcpu->halt_poll_ns)
 			;
 		/* we had a long block, shrink polling */
-		else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+		else if (!vcpu_valid_wakeup(vcpu) ||
+			(vcpu->halt_poll_ns && block_ns > halt_poll_ns))
 			shrink_halt_poll_ns(vcpu);
 		/* we had a short halt and our poll time is too small */
 		else if (vcpu->halt_poll_ns < halt_poll_ns &&
@@ -2066,7 +2069,8 @@ out:
 	} else
 		vcpu->halt_poll_ns = 0;
 
-	trace_kvm_vcpu_wakeup(block_ns, waited);
+	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
+	kvm_arch_vcpu_block_finish(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
-- 
cgit v1.2.3


From d9e4084f6c9746e51a78a4d7ebf4983023289b32 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 4 May 2016 10:53:47 -0400
Subject: svcrdma: Generalize svc_rdma_xdr_decode_req()

Clean up: Pass in just the piece of the svc_rqst that is needed
here.

While we're in the area, add an informative documenting comment.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h         |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_marshal.c  | 32 ++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  2 +-
 3 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 3081339968c3..d6917b896d3a 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -199,7 +199,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
 				    struct xdr_buf *rcvbuf);
 
 /* svc_rdma_marshal.c */
-extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg *, struct svc_rqst *);
+extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
 				     struct rpcrdma_msg *,
 				     enum rpcrdma_errcode, __be32 *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 765bca47c74d..0ba9887f3e22 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
 	return (__be32 *)&ary->wc_array[nchunks];
 }
 
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
+	struct rpcrdma_msg *rmsgp;
 	__be32 *va, *vaend;
 	unsigned int len;
 	u32 hdr_len;
 
 	/* Verify that there's enough bytes for header + something */
-	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+	if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
 		dprintk("svcrdma: header too short = %d\n",
-			rqstp->rq_arg.len);
+			rq_arg->len);
 		return -EINVAL;
 	}
 
+	rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
 	if (rmsgp->rm_vers != rpcrdma_version) {
 		dprintk("%s: bad version %u\n", __func__,
 			be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
 
 		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-		rqstp->rq_arg.head[0].iov_base = va;
+		rq_arg->head[0].iov_base = va;
 		len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-		rqstp->rq_arg.head[0].iov_len -= len;
-		if (len > rqstp->rq_arg.len)
+		rq_arg->head[0].iov_len -= len;
+		if (len > rq_arg->len)
 			return -EINVAL;
 		return len;
 	default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 	 * chunk list and a reply chunk list.
 	 */
 	va = &rmsgp->rm_body.rm_chunks[0];
-	vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
 	va = decode_read_list(va, vaend);
 	if (!va) {
 		dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 		return -EINVAL;
 	}
 
-	rqstp->rq_arg.head[0].iov_base = va;
+	rq_arg->head[0].iov_base = va;
 	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
-	rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+	rq_arg->head[0].iov_len -= hdr_len;
 	return hdr_len;
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 1b72f351fbd3..c984b0aaecb1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -636,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	/* Decode the RDMA header. */
 	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-	ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+	ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
 	if (ret < 0)
 		goto out_err;
 	if (ret == 0)
-- 
cgit v1.2.3


From c4263233f30e72f2645ff83c9074c994f88b015a Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 3 May 2016 23:22:50 +1000
Subject: drivers/of: Specify parent node in of_fdt_unflatten_tree()

This adds one more argument to of_fdt_unflatten_tree() to specify
the parent node of the FDT blob that is going to be unflattened.
In the result, the function can be used to unflatten FDT blob that
represents device sub-tree in PowerNV PCI hotplug driver.

Cc: Jyri Sarha <jsarha@ti.com>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Jyri Sarha <jsarha@ti.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c |  2 +-
 drivers/of/fdt.c                             | 14 ++++++++++----
 drivers/of/unittest.c                        |  2 +-
 include/linux/of_fdt.h                       |  1 +
 4 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
index 106679bca6cb..f9c79dabce20 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
@@ -157,7 +157,7 @@ struct device_node * __init tilcdc_get_overlay(struct kfree_table *kft)
 	if (!overlay_data || kfree_table_add(kft, overlay_data))
 		return NULL;
 
-	of_fdt_unflatten_tree(overlay_data, &overlay);
+	of_fdt_unflatten_tree(overlay_data, NULL, &overlay);
 	if (!overlay) {
 		pr_warn("%s: Unfattening overlay tree failed\n", __func__);
 		return NULL;
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 9c3e52d0d570..0d53687d4b8c 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -450,11 +450,13 @@ static int unflatten_dt_nodes(const void *blob,
  * pointers of the nodes so the normal device-tree walking functions
  * can be used.
  * @blob: The blob to expand
+ * @dad: Parent device node
  * @mynodes: The device_node tree created by the call
  * @dt_alloc: An allocator that provides a virtual address to memory
  * for the resulting tree
  */
 static void __unflatten_device_tree(const void *blob,
+			     struct device_node *dad,
 			     struct device_node **mynodes,
 			     void * (*dt_alloc)(u64 size, u64 align))
 {
@@ -479,7 +481,7 @@ static void __unflatten_device_tree(const void *blob,
 	}
 
 	/* First pass, scan for size */
-	size = unflatten_dt_nodes(blob, NULL, NULL, NULL);
+	size = unflatten_dt_nodes(blob, NULL, dad, NULL);
 	if (size < 0)
 		return;
 
@@ -495,7 +497,7 @@ static void __unflatten_device_tree(const void *blob,
 	pr_debug("  unflattening %p...\n", mem);
 
 	/* Second pass, do actual unflattening */
-	unflatten_dt_nodes(blob, mem, NULL, mynodes);
+	unflatten_dt_nodes(blob, mem, dad, mynodes);
 	if (be32_to_cpup(mem + size) != 0xdeadbeef)
 		pr_warning("End of tree marker overwritten: %08x\n",
 			   be32_to_cpup(mem + size));
@@ -512,6 +514,9 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex);
 
 /**
  * of_fdt_unflatten_tree - create tree of device_nodes from flat blob
+ * @blob: Flat device tree blob
+ * @dad: Parent device node
+ * @mynodes: The device tree created by the call
  *
  * unflattens the device-tree passed by the firmware, creating the
  * tree of struct device_node. It also fills the "name" and "type"
@@ -519,10 +524,11 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex);
  * can be used.
  */
 void of_fdt_unflatten_tree(const unsigned long *blob,
+			struct device_node *dad,
 			struct device_node **mynodes)
 {
 	mutex_lock(&of_fdt_unflatten_mutex);
-	__unflatten_device_tree(blob, mynodes, &kernel_tree_alloc);
+	__unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc);
 	mutex_unlock(&of_fdt_unflatten_mutex);
 }
 EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
@@ -1195,7 +1201,7 @@ bool __init early_init_dt_scan(void *params)
  */
 void __init unflatten_device_tree(void)
 {
-	__unflatten_device_tree(initial_boot_params, &of_root,
+	__unflatten_device_tree(initial_boot_params, NULL, &of_root,
 				early_init_dt_alloc_memory_arch);
 
 	/* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index e986e6ee52e0..8c0f11c3153c 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -921,7 +921,7 @@ static int __init unittest_data_add(void)
 			"not running tests\n", __func__);
 		return -ENOMEM;
 	}
-	of_fdt_unflatten_tree(unittest_data, &unittest_data_node);
+	of_fdt_unflatten_tree(unittest_data, NULL, &unittest_data_node);
 	if (!unittest_data_node) {
 		pr_warn("%s: No tree to attach; not running tests\n", __func__);
 		return -ENODATA;
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 2fbe8682a66f..1bffcbd56f8e 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -38,6 +38,7 @@ extern bool of_fdt_is_big_endian(const void *blob,
 extern int of_fdt_match(const void *blob, unsigned long node,
 			const char *const *compat);
 extern void of_fdt_unflatten_tree(const unsigned long *blob,
+			       struct device_node *dad,
 			       struct device_node **mynodes);
 
 /* TBD: Temporary export of fdt globals - remove when code fully merged */
-- 
cgit v1.2.3


From 83262418b0ef8bda66eca7c72d4c24ae6f7b230e Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 3 May 2016 23:22:51 +1000
Subject: drivers/of: Return allocated memory from of_fdt_unflatten_tree()

This returns the allocate memory chunk, storing the unflattened device
tree, from of_fdt_unflatten_tree() so that memory chunk can be released
on demand in PowerNV PCI hotplug driver.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/fdt.c       | 33 ++++++++++++++++++++++-----------
 include/linux/of_fdt.h |  6 +++---
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 0d53687d4b8c..ef1ccee51af8 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -454,11 +454,14 @@ static int unflatten_dt_nodes(const void *blob,
  * @mynodes: The device_node tree created by the call
  * @dt_alloc: An allocator that provides a virtual address to memory
  * for the resulting tree
+ *
+ * Returns NULL on failure or the memory chunk containing the unflattened
+ * device tree on success.
  */
-static void __unflatten_device_tree(const void *blob,
-			     struct device_node *dad,
-			     struct device_node **mynodes,
-			     void * (*dt_alloc)(u64 size, u64 align))
+static void *__unflatten_device_tree(const void *blob,
+				     struct device_node *dad,
+				     struct device_node **mynodes,
+				     void *(*dt_alloc)(u64 size, u64 align))
 {
 	int size;
 	void *mem;
@@ -467,7 +470,7 @@ static void __unflatten_device_tree(const void *blob,
 
 	if (!blob) {
 		pr_debug("No device tree pointer\n");
-		return;
+		return NULL;
 	}
 
 	pr_debug("Unflattening device tree:\n");
@@ -477,13 +480,13 @@ static void __unflatten_device_tree(const void *blob,
 
 	if (fdt_check_header(blob)) {
 		pr_err("Invalid device tree blob header\n");
-		return;
+		return NULL;
 	}
 
 	/* First pass, scan for size */
 	size = unflatten_dt_nodes(blob, NULL, dad, NULL);
 	if (size < 0)
-		return;
+		return NULL;
 
 	size = ALIGN(size, 4);
 	pr_debug("  size is %d, allocating...\n", size);
@@ -503,6 +506,7 @@ static void __unflatten_device_tree(const void *blob,
 			   be32_to_cpup(mem + size));
 
 	pr_debug(" <- unflatten_device_tree()\n");
+	return mem;
 }
 
 static void *kernel_tree_alloc(u64 size, u64 align)
@@ -522,14 +526,21 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex);
  * tree of struct device_node. It also fills the "name" and "type"
  * pointers of the nodes so the normal device-tree walking functions
  * can be used.
+ *
+ * Returns NULL on failure or the memory chunk containing the unflattened
+ * device tree on success.
  */
-void of_fdt_unflatten_tree(const unsigned long *blob,
-			struct device_node *dad,
-			struct device_node **mynodes)
+void *of_fdt_unflatten_tree(const unsigned long *blob,
+			    struct device_node *dad,
+			    struct device_node **mynodes)
 {
+	void *mem;
+
 	mutex_lock(&of_fdt_unflatten_mutex);
-	__unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc);
+	mem = __unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc);
 	mutex_unlock(&of_fdt_unflatten_mutex);
+
+	return mem;
 }
 EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
 
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 1bffcbd56f8e..901ec01c9fba 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -37,9 +37,9 @@ extern bool of_fdt_is_big_endian(const void *blob,
 				 unsigned long node);
 extern int of_fdt_match(const void *blob, unsigned long node,
 			const char *const *compat);
-extern void of_fdt_unflatten_tree(const unsigned long *blob,
-			       struct device_node *dad,
-			       struct device_node **mynodes);
+extern void *of_fdt_unflatten_tree(const unsigned long *blob,
+				   struct device_node *dad,
+				   struct device_node **mynodes);
 
 /* TBD: Temporary export of fdt globals - remove when code fully merged */
 extern int __initdata dt_root_addr_cells;
-- 
cgit v1.2.3


From 9a99425f0736a416442525ac7b15903173888b86 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 25 Feb 2016 09:56:04 -0800
Subject: iscsi_ibft: Add prefix-len attr and display netmask

The iBFT table only specifies a prefix length, not a netmask.
And the netmask is pretty much pointless for IPv6.
So introduce a new attribute 'prefix-len'.

Some older user-space code might rely on the netmask attribute
being present, so we should always display it.

Changes from v1:
 - Combined two patches into one

Changes from v2:
 - Cleaned up/corrected wording for patch description

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Lee Duncan <lduncan@suse.com>
Reviewed-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Konrad Rzeszutek Wilk <konrad@kernel.org>
---
 drivers/firmware/iscsi_ibft.c    | 4 ++++
 drivers/scsi/iscsi_boot_sysfs.c  | 5 +++++
 include/linux/iscsi_boot_sysfs.h | 1 +
 3 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index 72791232e46b..81037e5fe301 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -319,6 +319,9 @@ static ssize_t ibft_attr_show_nic(void *data, int type, char *buf)
 		val = cpu_to_be32(~((1 << (32-nic->subnet_mask_prefix))-1));
 		str += sprintf(str, "%pI4", &val);
 		break;
+	case ISCSI_BOOT_ETH_PREFIX_LEN:
+		str += sprintf(str, "%d\n", nic->subnet_mask_prefix);
+		break;
 	case ISCSI_BOOT_ETH_ORIGIN:
 		str += sprintf(str, "%d\n", nic->origin);
 		break;
@@ -460,6 +463,7 @@ static umode_t ibft_check_nic_for(void *data, int type)
 		if (address_not_null(nic->ip_addr))
 			rc = S_IRUGO;
 		break;
+	case ISCSI_BOOT_ETH_PREFIX_LEN:
 	case ISCSI_BOOT_ETH_SUBNET_MASK:
 		if (nic->subnet_mask_prefix)
 			rc = S_IRUGO;
diff --git a/drivers/scsi/iscsi_boot_sysfs.c b/drivers/scsi/iscsi_boot_sysfs.c
index 680bf6f0ce76..8f0ea97cf31f 100644
--- a/drivers/scsi/iscsi_boot_sysfs.c
+++ b/drivers/scsi/iscsi_boot_sysfs.c
@@ -166,6 +166,7 @@ static struct attribute_group iscsi_boot_target_attr_group = {
 iscsi_boot_rd_attr(eth_index, index, ISCSI_BOOT_ETH_INDEX);
 iscsi_boot_rd_attr(eth_flags, flags, ISCSI_BOOT_ETH_FLAGS);
 iscsi_boot_rd_attr(eth_ip, ip-addr, ISCSI_BOOT_ETH_IP_ADDR);
+iscsi_boot_rd_attr(eth_prefix, prefix-len, ISCSI_BOOT_ETH_PREFIX_LEN);
 iscsi_boot_rd_attr(eth_subnet, subnet-mask, ISCSI_BOOT_ETH_SUBNET_MASK);
 iscsi_boot_rd_attr(eth_origin, origin, ISCSI_BOOT_ETH_ORIGIN);
 iscsi_boot_rd_attr(eth_gateway, gateway, ISCSI_BOOT_ETH_GATEWAY);
@@ -181,6 +182,7 @@ static struct attribute *ethernet_attrs[] = {
 	&iscsi_boot_attr_eth_index.attr,
 	&iscsi_boot_attr_eth_flags.attr,
 	&iscsi_boot_attr_eth_ip.attr,
+	&iscsi_boot_attr_eth_prefix.attr,
 	&iscsi_boot_attr_eth_subnet.attr,
 	&iscsi_boot_attr_eth_origin.attr,
 	&iscsi_boot_attr_eth_gateway.attr,
@@ -208,6 +210,9 @@ static umode_t iscsi_boot_eth_attr_is_visible(struct kobject *kobj,
 	else if (attr ==  &iscsi_boot_attr_eth_ip.attr)
 		return boot_kobj->is_visible(boot_kobj->data,
 					     ISCSI_BOOT_ETH_IP_ADDR);
+	else if (attr ==  &iscsi_boot_attr_eth_prefix.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_PREFIX_LEN);
 	else if (attr ==  &iscsi_boot_attr_eth_subnet.attr)
 		return boot_kobj->is_visible(boot_kobj->data,
 					     ISCSI_BOOT_ETH_SUBNET_MASK);
diff --git a/include/linux/iscsi_boot_sysfs.h b/include/linux/iscsi_boot_sysfs.h
index 2a8b1659bf35..548d55395488 100644
--- a/include/linux/iscsi_boot_sysfs.h
+++ b/include/linux/iscsi_boot_sysfs.h
@@ -23,6 +23,7 @@ enum iscsi_boot_eth_properties_enum {
 	ISCSI_BOOT_ETH_INDEX,
 	ISCSI_BOOT_ETH_FLAGS,
 	ISCSI_BOOT_ETH_IP_ADDR,
+	ISCSI_BOOT_ETH_PREFIX_LEN,
 	ISCSI_BOOT_ETH_SUBNET_MASK,
 	ISCSI_BOOT_ETH_ORIGIN,
 	ISCSI_BOOT_ETH_GATEWAY,
-- 
cgit v1.2.3


From b3c8eb50383178f3a4dcf1dc867001156da6473d Mon Sep 17 00:00:00 2001
From: David Bond <dbond@suse.com>
Date: Wed, 23 Mar 2016 21:49:26 -0400
Subject: ibft: Expose iBFT acpi header via sysfs

Some ethernet adapter vendors are supplying products which support optional
(payed license) features. On some adapters this includes a hardware iscsi
initiator.  The same adapters in a normal (no extra licenses) mode of
operation can be used as a software iscsi initiator.  In addition, software
iscsi boot initiators are becoming a standard part of many vendors uefi
implementations.  This is creating difficulties during early boot/install
determining the proper configuration method for these adapters when they
are used as a boot device.

The attached patch creates sysfs entries to expose information from the
acpi header of the ibft table.  This information allows for a method to
easily determining if an ibft table was created by a ethernet card's
firmware or the system uefi/bios.  In the case of a hardware initiator this
information in combination with the pci vendor and device id can be used
to ascertain any vendor specific behaviors that need to be accommodated.

Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: David Bond <dbond@suse.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 Documentation/ABI/testing/sysfs-ibft | 10 ++++++
 drivers/firmware/iscsi_ibft.c        | 66 +++++++++++++++++++++++++++++++++++-
 drivers/scsi/iscsi_boot_sysfs.c      | 62 +++++++++++++++++++++++++++++++++
 include/linux/iscsi_boot_sysfs.h     | 13 +++++++
 4 files changed, 150 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-ibft b/Documentation/ABI/testing/sysfs-ibft
index cac3930bdb04..7d6725fe6143 100644
--- a/Documentation/ABI/testing/sysfs-ibft
+++ b/Documentation/ABI/testing/sysfs-ibft
@@ -21,3 +21,13 @@ Contact:	Konrad Rzeszutek <ketuzsezr@darnok.org>
 Description:	The /sys/firmware/ibft/ethernetX directory will contain
 		files that expose the iSCSI Boot Firmware Table NIC data.
 		Usually this contains the IP address, MAC, and gateway of the NIC.
+
+What:		/sys/firmware/ibft/acpi_header
+Date:		March 2016
+Contact:	David Bond <dbond@suse.com>
+Description:	The /sys/firmware/ibft/acpi_header directory will contain files
+		that expose the SIGNATURE, OEM_ID, and OEM_TABLE_ID fields of the
+		acpi table header of the iBFT structure.  This will allow for
+		identification of the creator of the table which is useful in
+		determining quirks associated with some adapters when used in
+		hardware vs software iscsi initiator mode.
diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index 81037e5fe301..14042a64bdd5 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -418,6 +418,31 @@ static ssize_t ibft_attr_show_target(void *data, int type, char *buf)
 	return str - buf;
 }
 
+static ssize_t ibft_attr_show_acpitbl(void *data, int type, char *buf)
+{
+	struct ibft_kobject *entry = data;
+	char *str = buf;
+
+	switch (type) {
+	case ISCSI_BOOT_ACPITBL_SIGNATURE:
+		str += sprintf_string(str, ACPI_NAME_SIZE,
+				      entry->header->header.signature);
+		break;
+	case ISCSI_BOOT_ACPITBL_OEM_ID:
+		str += sprintf_string(str, ACPI_OEM_ID_SIZE,
+				      entry->header->header.oem_id);
+		break;
+	case ISCSI_BOOT_ACPITBL_OEM_TABLE_ID:
+		str += sprintf_string(str, ACPI_OEM_TABLE_ID_SIZE,
+				      entry->header->header.oem_table_id);
+		break;
+	default:
+		break;
+	}
+
+	return str - buf;
+}
+
 static int __init ibft_check_device(void)
 {
 	int len;
@@ -576,6 +601,24 @@ static umode_t __init ibft_check_initiator_for(void *data, int type)
 	return rc;
 }
 
+static umode_t __init ibft_check_acpitbl_for(void *data, int type)
+{
+
+	umode_t rc = 0;
+
+	switch (type) {
+	case ISCSI_BOOT_ACPITBL_SIGNATURE:
+	case ISCSI_BOOT_ACPITBL_OEM_ID:
+	case ISCSI_BOOT_ACPITBL_OEM_TABLE_ID:
+		rc = S_IRUGO;
+		break;
+	default:
+		break;
+	}
+
+	return rc;
+}
+
 static void ibft_kobj_release(void *data)
 {
 	kfree(data);
@@ -699,6 +742,8 @@ free_ibft_obj:
 static int __init ibft_register_kobjects(struct acpi_table_ibft *header)
 {
 	struct ibft_control *control = NULL;
+	struct iscsi_boot_kobj *boot_kobj;
+	struct ibft_kobject *ibft_kobj;
 	void *ptr, *end;
 	int rc = 0;
 	u16 offset;
@@ -726,6 +771,25 @@ static int __init ibft_register_kobjects(struct acpi_table_ibft *header)
 				break;
 		}
 	}
+	if (rc)
+		return rc;
+
+	ibft_kobj = kzalloc(sizeof(*ibft_kobj), GFP_KERNEL);
+	if (!ibft_kobj)
+		return -ENOMEM;
+
+	ibft_kobj->header = header;
+	ibft_kobj->hdr = NULL; /*for ibft_unregister*/
+
+	boot_kobj = iscsi_boot_create_acpitbl(boot_kset, 0,
+					ibft_kobj,
+					ibft_attr_show_acpitbl,
+					ibft_check_acpitbl_for,
+					ibft_kobj_release);
+	if (!boot_kobj)  {
+		kfree(ibft_kobj);
+		rc = -ENOMEM;
+	}
 
 	return rc;
 }
@@ -738,7 +802,7 @@ static void ibft_unregister(void)
 	list_for_each_entry_safe(boot_kobj, tmp_kobj,
 				 &boot_kset->kobj_list, list) {
 		ibft_kobj = boot_kobj->data;
-		if (ibft_kobj->hdr->id == id_nic)
+		if (ibft_kobj->hdr && ibft_kobj->hdr->id == id_nic)
 			sysfs_remove_link(&boot_kobj->kobj, "device");
 	};
 }
diff --git a/drivers/scsi/iscsi_boot_sysfs.c b/drivers/scsi/iscsi_boot_sysfs.c
index 8f0ea97cf31f..d453667612f8 100644
--- a/drivers/scsi/iscsi_boot_sysfs.c
+++ b/drivers/scsi/iscsi_boot_sysfs.c
@@ -306,6 +306,42 @@ static struct attribute_group iscsi_boot_initiator_attr_group = {
 	.is_visible = iscsi_boot_ini_attr_is_visible,
 };
 
+/* iBFT ACPI Table attributes */
+iscsi_boot_rd_attr(acpitbl_signature, signature, ISCSI_BOOT_ACPITBL_SIGNATURE);
+iscsi_boot_rd_attr(acpitbl_oem_id, oem_id, ISCSI_BOOT_ACPITBL_OEM_ID);
+iscsi_boot_rd_attr(acpitbl_oem_table_id, oem_table_id,
+		   ISCSI_BOOT_ACPITBL_OEM_TABLE_ID);
+
+static struct attribute *acpitbl_attrs[] = {
+	&iscsi_boot_attr_acpitbl_signature.attr,
+	&iscsi_boot_attr_acpitbl_oem_id.attr,
+	&iscsi_boot_attr_acpitbl_oem_table_id.attr,
+	NULL
+};
+
+static umode_t iscsi_boot_acpitbl_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int i)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+
+	if (attr ==  &iscsi_boot_attr_acpitbl_signature.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ACPITBL_SIGNATURE);
+	if (attr ==  &iscsi_boot_attr_acpitbl_oem_id.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ACPITBL_OEM_ID);
+	if (attr ==  &iscsi_boot_attr_acpitbl_oem_table_id.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ACPITBL_OEM_TABLE_ID);
+	return 0;
+}
+
+static struct attribute_group iscsi_boot_acpitbl_attr_group = {
+	.attrs = acpitbl_attrs,
+	.is_visible = iscsi_boot_acpitbl_attr_is_visible,
+};
+
 static struct iscsi_boot_kobj *
 iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset,
 		       struct attribute_group *attr_group,
@@ -435,6 +471,32 @@ iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index,
 }
 EXPORT_SYMBOL_GPL(iscsi_boot_create_ethernet);
 
+/**
+ * iscsi_boot_create_acpitbl() - create boot acpi table sysfs dir
+ * @boot_kset: boot kset
+ * @index: not used
+ * @data: driver specific data
+ * @show: attr show function
+ * @is_visible: attr visibility function
+ * @release: release function
+ *
+ * Note: The boot sysfs lib will free the data passed in for the caller
+ * when all refs to the acpitbl kobject have been released.
+ */
+struct iscsi_boot_kobj *
+iscsi_boot_create_acpitbl(struct iscsi_boot_kset *boot_kset, int index,
+			   void *data,
+			   ssize_t (*show)(void *data, int type, char *buf),
+			   umode_t (*is_visible)(void *data, int type),
+			   void (*release)(void *data))
+{
+	return iscsi_boot_create_kobj(boot_kset,
+				      &iscsi_boot_acpitbl_attr_group,
+				      "acpi_header", index, data, show,
+				      is_visible, release);
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_acpitbl);
+
 /**
  * iscsi_boot_create_kset() - creates root sysfs tree
  * @set_name: name of root dir
diff --git a/include/linux/iscsi_boot_sysfs.h b/include/linux/iscsi_boot_sysfs.h
index 548d55395488..10923d730486 100644
--- a/include/linux/iscsi_boot_sysfs.h
+++ b/include/linux/iscsi_boot_sysfs.h
@@ -64,6 +64,12 @@ enum iscsi_boot_initiator_properties_enum {
 	ISCSI_BOOT_INI_END_MARKER,
 };
 
+enum iscsi_boot_acpitbl_properties_enum {
+	ISCSI_BOOT_ACPITBL_SIGNATURE,
+	ISCSI_BOOT_ACPITBL_OEM_ID,
+	ISCSI_BOOT_ACPITBL_OEM_TABLE_ID,
+};
+
 struct attribute_group;
 
 struct iscsi_boot_kobj {
@@ -127,6 +133,13 @@ iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index,
 			 umode_t (*is_visible) (void *data, int type),
 			 void (*release) (void *data));
 
+struct iscsi_boot_kobj *
+iscsi_boot_create_acpitbl(struct iscsi_boot_kset *boot_kset, int index,
+			  void *data,
+			  ssize_t (*show)(void *data, int type, char *buf),
+			  umode_t (*is_visible)(void *data, int type),
+			  void (*release)(void *data));
+
 struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name);
 struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno);
 void iscsi_boot_destroy_kset(struct iscsi_boot_kset *boot_kset);
-- 
cgit v1.2.3


From 9dc0b289c4c09bc1a92bdcc055cb37af9b72eb28 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:39 +0000
Subject: net/mlx5_core: Firmware commands to support flow counters

Getting packet/byte statistics on flows is done through flow counters.
Implement the firmware commands to alloc, free and query flow counters.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c    |  6 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 66 ++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h |  5 ++
 include/linux/mlx5/mlx5_ifc.h                    | 99 +++++++++++++++++++++++-
 4 files changed, 173 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 63cac841cfa9..dcd2df6518de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -294,6 +294,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
 	case MLX5_CMD_OP_DESTROY_FLOW_GROUP:
 	case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -395,6 +396,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -539,6 +542,9 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY);
 	MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY);
 	MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 9797768891ee..ccb63a0bb54a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -323,3 +323,69 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
 	return err;
 }
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(alloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					 sizeof(out));
+	if (err)
+		return err;
+
+	*id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+
+	return 0;
+}
+
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(dealloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
+	MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id,
+		      u64 *packets, u64 *bytes)
+{
+	u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter)];
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)];
+	void *stats;
+	int err = 0;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, id);
+
+	err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics);
+	*packets = MLX5_GET64(traffic_counter, stats, packets);
+	*bytes = MLX5_GET64(traffic_counter, stats, octets);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index c97b4a03eeed..18c111a4691f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -70,4 +70,9 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 			    struct mlx5_flow_table *ft);
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id);
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id);
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id,
+		      u64 *packets, u64 *bytes);
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4ce4ea422a10..614c795eadea 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -202,6 +202,9 @@ enum {
 	MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY          = 0x936,
 	MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY        = 0x937,
 	MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY       = 0x938,
+	MLX5_CMD_OP_ALLOC_FLOW_COUNTER            = 0x939,
+	MLX5_CMD_OP_DEALLOC_FLOW_COUNTER          = 0x93a,
+	MLX5_CMD_OP_QUERY_FLOW_COUNTER            = 0x93b,
 	MLX5_CMD_OP_MODIFY_FLOW_TABLE             = 0x93c
 };
 
@@ -265,7 +268,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 
 struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         ft_support[0x1];
-	u8         reserved_at_1[0x2];
+	u8         reserved_at_1[0x1];
+	u8         flow_counter[0x1];
 	u8	   flow_modify_en[0x1];
 	u8         modify_root[0x1];
 	u8         identified_miss_table_mode[0x1];
@@ -941,6 +945,19 @@ struct mlx5_ifc_dest_format_struct_bits {
 	u8         reserved_at_20[0x20];
 };
 
+struct mlx5_ifc_flow_counter_list_bits {
+	u8         reserved_at_0[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_20[0x20];
+};
+
+union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits {
+	struct mlx5_ifc_dest_format_struct_bits dest_format_struct;
+	struct mlx5_ifc_flow_counter_list_bits flow_counter_list;
+	u8         reserved_at_0[0x40];
+};
+
 struct mlx5_ifc_fte_match_param_bits {
 	struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers;
 
@@ -2006,6 +2023,7 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_ALLOW     = 0x1,
 	MLX5_FLOW_CONTEXT_ACTION_DROP      = 0x2,
 	MLX5_FLOW_CONTEXT_ACTION_FWD_DEST  = 0x4,
+	MLX5_FLOW_CONTEXT_ACTION_COUNT     = 0x8,
 };
 
 struct mlx5_ifc_flow_context_bits {
@@ -2022,13 +2040,16 @@ struct mlx5_ifc_flow_context_bits {
 	u8         reserved_at_80[0x8];
 	u8         destination_list_size[0x18];
 
-	u8         reserved_at_a0[0x160];
+	u8         reserved_at_a0[0x8];
+	u8         flow_counter_list_size[0x18];
+
+	u8         reserved_at_c0[0x140];
 
 	struct mlx5_ifc_fte_match_param_bits match_value;
 
 	u8         reserved_at_1200[0x600];
 
-	struct mlx5_ifc_dest_format_struct_bits destination[0];
+	union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[0];
 };
 
 enum {
@@ -3937,6 +3958,34 @@ struct mlx5_ifc_query_flow_group_in_bits {
 	u8         reserved_at_e0[0x120];
 };
 
+struct mlx5_ifc_query_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	struct mlx5_ifc_traffic_counter_bits flow_statistics[0];
+};
+
+struct mlx5_ifc_query_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x80];
+
+	u8         clear[0x1];
+	u8         reserved_at_c1[0xf];
+	u8         num_of_counters[0x10];
+
+	u8         reserved_at_e0[0x10];
+	u8         flow_counter_id[0x10];
+};
+
 struct mlx5_ifc_query_esw_vport_context_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -5510,6 +5559,28 @@ struct mlx5_ifc_dealloc_pd_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_dealloc_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
 struct mlx5_ifc_create_xrc_srq_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -6237,6 +6308,28 @@ struct mlx5_ifc_alloc_pd_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_alloc_flow_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x10];
+	u8         flow_counter_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_flow_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_add_vxlan_udp_dport_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From bd5251dbf156b6bc0661a9409d46e47160df61dd Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:40 +0000
Subject: net/mlx5_core: Introduce flow steering destination of type counter

When adding a flow steering rule with a counter, need to supply a
destination of type MLX5_FLOW_DESTINATION_TYPE_COUNTER, with a pointer
to a struct mlx5_fc.
Also, MLX5_FLOW_CONTEXT_ACTION_COUNT bit should be set in the action.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  | 36 +++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 52 +++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 23 ++++++++++
 include/linux/mlx5/fs.h                           |  2 +
 include/linux/mlx5/mlx5_ifc.h                     |  2 +
 6 files changed, 106 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index ccb63a0bb54a..a5bb6b695242 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -241,17 +241,20 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
 	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->flow_tag);
 	MLX5_SET(flow_context, in_flow_context, action, fte->action);
-	MLX5_SET(flow_context, in_flow_context, destination_list_size,
-		 fte->dests_size);
 	in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context,
 				      match_value);
 	memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param));
 
+	in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination);
 	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination);
+		int list_size = 0;
+
 		list_for_each_entry(dst, &fte->node.children, node.list) {
 			unsigned int id;
 
+			if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
 			MLX5_SET(dest_format_struct, in_dests, destination_type,
 				 dst->dest_attr.type);
 			if (dst->dest_attr.type ==
@@ -262,8 +265,31 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			}
 			MLX5_SET(dest_format_struct, in_dests, destination_id, id);
 			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
+		}
+
+		MLX5_SET(flow_context, in_flow_context, destination_list_size,
+			 list_size);
+	}
+
+	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		int list_size = 0;
+
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			if (dst->dest_attr.type !=
+			    MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			MLX5_SET(flow_counter_list, in_dests, flow_counter_id,
+				 dst->dest_attr.counter->id);
+			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
 		}
+
+		MLX5_SET(flow_context, in_flow_context, flow_counter_list_size,
+			 list_size);
 	}
+
 	memset(out, 0, sizeof(out));
 	err = mlx5_cmd_exec_check_status(dev, in, inlen, out,
 					 sizeof(out));
@@ -283,18 +309,16 @@ int mlx5_cmd_create_fte(struct mlx5_core_dev *dev,
 int mlx5_cmd_update_fte(struct mlx5_core_dev *dev,
 			struct mlx5_flow_table *ft,
 			unsigned group_id,
+			int modify_mask,
 			struct fs_fte *fte)
 {
 	int opmod;
-	int modify_mask;
 	int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev,
 						flow_table_properties_nic_receive.
 						flow_modify_en);
 	if (!atomic_mod_cap)
 		return -ENOTSUPP;
 	opmod = 1;
-	modify_mask = 1 <<
-		MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST;
 
 	return	mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, group_id, fte);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 18c111a4691f..fc4f7b83fe0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -62,6 +62,7 @@ int mlx5_cmd_create_fte(struct mlx5_core_dev *dev,
 int mlx5_cmd_update_fte(struct mlx5_core_dev *dev,
 			struct mlx5_flow_table *ft,
 			unsigned group_id,
+			int modify_mask,
 			struct fs_fte *fte);
 
 int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 659a6980cda2..9420def3a2fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -344,6 +344,7 @@ static void del_rule(struct fs_node *node)
 	struct mlx5_flow_group *fg;
 	struct fs_fte *fte;
 	u32	*match_value;
+	int modify_mask;
 	struct mlx5_core_dev *dev = get_dev(node);
 	int match_len = MLX5_ST_SZ_BYTES(fte_match_param);
 	int err;
@@ -367,8 +368,11 @@ static void del_rule(struct fs_node *node)
 	}
 	if ((fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
 	    --fte->dests_size) {
+		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST),
 		err = mlx5_cmd_update_fte(dev, ft,
-					  fg->id, fte);
+					  fg->id,
+					  modify_mask,
+					  fte);
 		if (err)
 			pr_warn("%s can't del rule fg id=%d fte_index=%d\n",
 				__func__, fg->id, fte->index);
@@ -615,6 +619,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
 	struct fs_fte *fte;
+	int modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
 	int err = 0;
 
 	fs_get_obj(fte, rule->node.parent);
@@ -626,7 +631,9 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 
 	memcpy(&rule->dest_attr, dest, sizeof(*dest));
 	err = mlx5_cmd_update_fte(get_dev(&ft->node),
-				  ft, fg->id, fte);
+				  ft, fg->id,
+				  modify_mask,
+				  fte);
 	unlock_ref_node(&fte->node);
 
 	return err;
@@ -877,6 +884,7 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_rule *rule;
+	int modify_mask = 0;
 	int err;
 
 	rule = alloc_rule(dest);
@@ -892,14 +900,20 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 		list_add(&rule->node.list, &fte->node.children);
 	else
 		list_add_tail(&rule->node.list, &fte->node.children);
-	if (dest)
+	if (dest) {
 		fte->dests_size++;
+
+		modify_mask |= dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER ?
+			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS) :
+			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	}
+
 	if (fte->dests_size == 1 || !dest)
 		err = mlx5_cmd_create_fte(get_dev(&ft->node),
 					  ft, fg->id, fte);
 	else
 		err = mlx5_cmd_update_fte(get_dev(&ft->node),
-					  ft, fg->id, fte);
+					  ft, fg->id, modify_mask, fte);
 	if (err)
 		goto free_rule;
 
@@ -1092,10 +1106,40 @@ unlock_fg:
 	return rule;
 }
 
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule)
+{
+	struct mlx5_flow_rule *dst;
+	struct fs_fte *fte;
+
+	fs_get_obj(fte, rule->node.parent);
+
+	fs_for_each_dst(dst, fte) {
+		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+			return dst->dest_attr.counter;
+	}
+
+	return NULL;
+}
+
+static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
+{
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
+		return !counter;
+
+	if (!counter)
+		return false;
+
+	/* Hardware support counter for a drop action only */
+	return action == (MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT);
+}
+
 static bool dest_is_valid(struct mlx5_flow_destination *dest,
 			  u32 action,
 			  struct mlx5_flow_table *ft)
 {
+	if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER))
+		return counter_is_valid(dest->counter, action);
+
 	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
 		return true;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 8e76cc505f5a..1989048ebdfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -96,6 +96,28 @@ struct mlx5_flow_table {
 	struct list_head		fwd_rules;
 };
 
+struct mlx5_fc_cache {
+	u64 packets;
+	u64 bytes;
+	u64 lastuse;
+};
+
+struct mlx5_fc {
+	struct list_head list;
+
+	/* last{packets,bytes} members are used when calculating the delta since
+	 * last reading
+	 */
+	u64 lastpackets;
+	u64 lastbytes;
+
+	u16 id;
+	bool deleted;
+	bool aging;
+
+	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+};
+
 /* Type of children is mlx5_flow_rule */
 struct fs_fte {
 	struct fs_node			node;
@@ -105,6 +127,7 @@ struct fs_fte {
 	u32				index;
 	u32				action;
 	enum fs_fte_status		status;
+	struct mlx5_fc			*counter;
 };
 
 /* Type of children is mlx5_flow_table/namespace */
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6467569ad76e..c8b9ede1c20a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -73,6 +73,7 @@ struct mlx5_flow_destination {
 		u32			tir_num;
 		struct mlx5_flow_table	*ft;
 		u32			vport_num;
+		struct mlx5_fc		*counter;
 	};
 };
 
@@ -125,4 +126,5 @@ void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
 int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 				 struct mlx5_flow_destination *dest);
 
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 614c795eadea..9a05cd7e5890 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -936,6 +936,8 @@ enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_VPORT        = 0x0,
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE   = 0x1,
 	MLX5_FLOW_DESTINATION_TYPE_TIR          = 0x2,
+
+	MLX5_FLOW_DESTINATION_TYPE_COUNTER      = 0x100,
 };
 
 struct mlx5_ifc_dest_format_struct_bits {
-- 
cgit v1.2.3


From 43a335e055bb7ebdc8a68ce7362ef26ef5bda92b Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirva@mellanox.com>
Date: Fri, 13 May 2016 12:55:41 +0000
Subject: net/mlx5_core: Flow counters infrastructure

If a counter has the aging flag set when created, it is added to a list
of counters that will be queried periodically from a workqueue.  query
result and last use timestamp are cached.
add/del counter must be very efficient since thousands of such
operations might be issued in a second.
There is only a single reference to counters without aging, therefore
no need for locks.
But, counters with aging enabled are stored in a list. In order to make
code as lockless as possible, all the list manipulation and access to
hardware is done from a single context - the periodic counters query
thread.

The hardware supports multiple counters per FTE, however currently we
are using one counter for each FTE.

Signed-off-by: Amir Vadai <amirva@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   7 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   3 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 226 +++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  14 ++
 include/linux/mlx5/fs.h                            |   5 +
 6 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index b531d4f3c00b..9ea7b583096a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
 
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
-		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o
+		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o fs_counters.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \
 		en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9420def3a2fe..8b5f0b2c0d5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1771,6 +1771,7 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns);
 	cleanup_single_prio_root_ns(dev, dev->priv.esw_egress_root_ns);
 	cleanup_single_prio_root_ns(dev, dev->priv.esw_ingress_root_ns);
+	mlx5_cleanup_fc_stats(dev);
 }
 
 static int init_fdb_root_ns(struct mlx5_core_dev *dev)
@@ -1827,10 +1828,14 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 {
 	int err = 0;
 
+	err = mlx5_init_fc_stats(dev);
+	if (err)
+		return err;
+
 	if (MLX5_CAP_GEN(dev, nic_flow_table)) {
 		err = init_root_ns(dev);
 		if (err)
-			return err;
+			goto err;
 	}
 	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
 		err = init_fdb_root_ns(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 1989048ebdfd..aa41a7314691 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -169,6 +169,9 @@ struct mlx5_flow_root_namespace {
 	struct mutex			chain_lock;
 };
 
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
+
 int mlx5_init_fs(struct mlx5_core_dev *dev);
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
new file mode 100644
index 000000000000..164dc37fda72
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_core.h"
+#include "fs_core.h"
+#include "fs_cmd.h"
+
+#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
+
+/* locking scheme:
+ *
+ * It is the responsibility of the user to prevent concurrent calls or bad
+ * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference
+ * to struct mlx5_fc.
+ * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a
+ * dump (access to struct mlx5_fc) after a counter is destroyed.
+ *
+ * access to counter list:
+ * - create (user context)
+ *   - mlx5_fc_create() only adds to an addlist to be used by
+ *     mlx5_fc_stats_query_work(). addlist is protected by a spinlock.
+ *   - spawn thread to do the actual destroy
+ *
+ * - destroy (user context)
+ *   - mark a counter as deleted
+ *   - spawn thread to do the actual del
+ *
+ * - dump (user context)
+ *   user should not call dump after destroy
+ *
+ * - query (single thread workqueue context)
+ *   destroy/dump - no conflict (see destroy)
+ *   query/dump - packets and bytes might be inconsistent (since update is not
+ *                atomic)
+ *   query/create - no conflict (see create)
+ *   since every create/destroy spawn the work, only after necessary time has
+ *   elapsed, the thread will actually query the hardware.
+ */
+
+static void mlx5_fc_stats_work(struct work_struct *work)
+{
+	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
+						 priv.fc_stats.work.work);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	unsigned long now = jiffies;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+	int err = 0;
+
+	spin_lock(&fc_stats->addlist_lock);
+
+	list_splice_tail_init(&fc_stats->addlist, &fc_stats->list);
+
+	if (!list_empty(&fc_stats->list))
+		queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD);
+
+	spin_unlock(&fc_stats->addlist_lock);
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->list, list) {
+		struct mlx5_fc_cache *c = &counter->cache;
+		u64 packets;
+		u64 bytes;
+
+		if (counter->deleted) {
+			list_del(&counter->list);
+
+			mlx5_cmd_fc_free(dev, counter->id);
+
+			kfree(counter);
+			continue;
+		}
+
+		if (time_before(now, fc_stats->next_query))
+			continue;
+
+		err = mlx5_cmd_fc_query(dev, counter->id, &packets, &bytes);
+		if (err) {
+			pr_err("Error querying stats for counter id %d\n",
+			       counter->id);
+			continue;
+		}
+
+		if (packets == c->packets)
+			continue;
+
+		c->lastuse = jiffies;
+		c->packets = packets;
+		c->bytes   = bytes;
+	}
+
+	if (time_after_eq(now, fc_stats->next_query))
+		fc_stats->next_query = now + MLX5_FC_STATS_PERIOD;
+}
+
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	int err;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_cmd_fc_alloc(dev, &counter->id);
+	if (err)
+		goto err_out;
+
+	if (aging) {
+		counter->aging = true;
+
+		spin_lock(&fc_stats->addlist_lock);
+		list_add(&counter->list, &fc_stats->addlist);
+		spin_unlock(&fc_stats->addlist_lock);
+
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+	}
+
+	return counter;
+
+err_out:
+	kfree(counter);
+
+	return ERR_PTR(err);
+}
+
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (!counter)
+		return;
+
+	if (counter->aging) {
+		counter->deleted = true;
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+		return;
+	}
+
+	mlx5_cmd_fc_free(dev, counter->id);
+	kfree(counter);
+}
+
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	INIT_LIST_HEAD(&fc_stats->list);
+	INIT_LIST_HEAD(&fc_stats->addlist);
+	spin_lock_init(&fc_stats->addlist_lock);
+
+	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
+	if (!fc_stats->wq)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
+
+	return 0;
+}
+
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+
+	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
+	destroy_workqueue(dev->priv.fc_stats.wq);
+	dev->priv.fc_stats.wq = NULL;
+
+	list_splice_tail_init(&fc_stats->addlist, &fc_stats->list);
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->list, list) {
+		list_del(&counter->list);
+
+		mlx5_cmd_fc_free(dev, counter->id);
+
+		kfree(counter);
+	}
+}
+
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse)
+{
+	struct mlx5_fc_cache c;
+
+	c = counter->cache;
+
+	*bytes = c.bytes - counter->lastbytes;
+	*packets = c.packets - counter->lastpackets;
+	*lastuse = c.lastuse;
+
+	counter->lastbytes = c.bytes;
+	counter->lastpackets = c.packets;
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9613143f0561..07b504f7eb84 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/radix-tree.h>
+#include <linux/workqueue.h>
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
@@ -457,6 +458,17 @@ struct mlx5_irq_info {
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
+struct mlx5_fc_stats {
+	struct list_head list;
+	struct list_head addlist;
+	/* protect addlist add/splice operations */
+	spinlock_t addlist_lock;
+
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	unsigned long next_query;
+};
+
 struct mlx5_eswitch;
 
 struct mlx5_priv {
@@ -520,6 +532,8 @@ struct mlx5_priv {
 	struct mlx5_flow_root_namespace *fdb_root_ns;
 	struct mlx5_flow_root_namespace *esw_egress_root_ns;
 	struct mlx5_flow_root_namespace *esw_ingress_root_ns;
+
+	struct mlx5_fc_stats		fc_stats;
 };
 
 enum mlx5_device_state {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index c8b9ede1c20a..4b7a107d9c19 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -127,4 +127,9 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 				 struct mlx5_flow_destination *dest);
 
 struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse);
+
 #endif
-- 
cgit v1.2.3


From c94987e40ebbae3b7b6c3ece37b6f8338830f6b1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:27 +0200
Subject: bpf: move bpf_jit_enable declaration

Move the bpf_jit_enable declaration to the filter.h file where
most other core code is declared, also since we're going to add
a second knob there.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h    | 2 ++
 include/linux/netdevice.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ec1411c89105..4ff0e647598f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,6 +496,8 @@ void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
 #ifdef CONFIG_BPF_JIT
+extern int bpf_jit_enable;
+
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
 struct bpf_binary_header *
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c2f5112f08f7..c148edfe4965 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3759,7 +3759,6 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
-extern int		bpf_jit_enable;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
-- 
cgit v1.2.3


From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:30 +0200
Subject: bpf: add bpf_patch_insn_single helper

Move the functionality to patch instructions out of the verifier
code and into the core as the new bpf_patch_insn_single() helper
will be needed later on for blinding as well. No changes in
functionality.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c      | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c  | 53 +++++++------------------------------
 3 files changed, 83 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ff0e647598f..c4aae496f376 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -495,6 +495,9 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len);
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5313d09d4b62..49b5538a5301 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_JMP  &&
+	       /* Call and Exit are both special jumps with no
+		* target inside the BPF instruction image.
+		*/
+	       BPF_OP(insn->code) != BPF_CALL &&
+	       BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+	struct bpf_insn *insn = prog->insnsi;
+	u32 i, insn_cnt = prog->len;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_is_jmp_and_has_target(insn))
+			continue;
+
+		/* Adjust offset of jmps if we cross boundaries. */
+		if (i < pos && i + insn->off + 1 > pos)
+			insn->off += delta;
+		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+			insn->off -= delta;
+	}
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len)
+{
+	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	struct bpf_prog *prog_adj;
+
+	/* Since our patchlet doesn't expand the image, we're done. */
+	if (insn_delta == 0) {
+		memcpy(prog->insnsi + off, patch, sizeof(*patch));
+		return prog;
+	}
+
+	insn_adj_cnt = prog->len + insn_delta;
+
+	/* Several new instructions need to be inserted. Make room
+	 * for them. Likely, there's no need for a new allocation as
+	 * last page could have large enough tailroom.
+	 */
+	prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+				    GFP_USER);
+	if (!prog_adj)
+		return NULL;
+
+	prog_adj->len = insn_adj_cnt;
+
+	/* Patching happens in 3 steps:
+	 *
+	 * 1) Move over tail of insnsi from next instruction onwards,
+	 *    so we can patch the single target insn with one or more
+	 *    new ones (patching is always from 1 to n insns, n > 0).
+	 * 2) Inject new instructions at the target location.
+	 * 3) Adjust branch offsets if necessary.
+	 */
+	insn_rest = insn_adj_cnt - off - len;
+
+	memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+		sizeof(*patch) * insn_rest);
+	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+	bpf_adj_branches(prog_adj, off, insn_delta);
+
+	return prog_adj;
+}
+
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 84bff68cf80e..a08d66215245 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 			insn->src_reg = 0;
 }
 
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
-	struct bpf_insn *insn = prog->insnsi;
-	int insn_cnt = prog->len;
-	int i;
-
-	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (BPF_CLASS(insn->code) != BPF_JMP ||
-		    BPF_OP(insn->code) == BPF_CALL ||
-		    BPF_OP(insn->code) == BPF_EXIT)
-			continue;
-
-		/* adjust offset of jmps if necessary */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
-	}
-}
-
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
@@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
-	u32 cnt;
-	int i;
 	enum bpf_access_type type;
+	int i;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		u32 insn_delta, cnt;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
 			type = BPF_READ;
 		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (cnt == 1) {
-			memcpy(insn, insn_buf, sizeof(*insn));
-			continue;
-		}
-
-		/* several new insns need to be inserted. Make room for them */
-		insn_cnt += cnt - 1;
-		new_prog = bpf_prog_realloc(env->prog,
-					    bpf_prog_size(insn_cnt),
-					    GFP_USER);
+		new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
-		new_prog->len = insn_cnt;
-
-		memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
-			sizeof(*insn) * (insn_cnt - i - cnt));
-
-		/* copy substitute insns in place of load instruction */
-		memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
-
-		/* adjust branches in the whole program */
-		adjust_branches(new_prog, i, cnt - 1);
+		insn_delta = cnt - 1;
 
 		/* keep walking new program and skip insns we just inserted */
 		env->prog = new_prog;
-		insn = new_prog->insnsi + i + cnt - 1;
-		i += cnt - 1;
+		insn      = new_prog->insnsi + i + insn_delta;
+
+		insn_cnt += insn_delta;
+		i        += insn_delta;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:31 +0200
Subject: bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis

Since the blinding is strictly only called from inside eBPF JITs,
we need to change signatures for bpf_int_jit_compile() and
bpf_prog_select_runtime() first in order to prepare that the
eBPF program we're dealing with can change underneath. Hence,
for call sites, we need to return the latest prog. No functional
change in this patch.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c |  7 ++++---
 arch/s390/net/bpf_jit_comp.c  |  8 +++++---
 arch/x86/net/bpf_jit_comp.c   |  7 ++++---
 include/linux/filter.h        |  5 +++--
 kernel/bpf/core.c             | 18 ++++++++++++++----
 kernel/bpf/syscall.c          |  2 +-
 lib/test_bpf.c                |  5 ++++-
 net/core/filter.c             |  6 +++++-
 8 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index ef35e866caf7..dd428807cb30 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -762,7 +762,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 	/* Nothing to do here. We support Internal BPF. */
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header;
 	struct jit_ctx ctx;
@@ -770,14 +770,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	u8 *image_ptr;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.prog = prog;
 
 	ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
 	if (ctx.offset == NULL)
-		return;
+		return prog;
 
 	/* 1. Initial fake pass to compute ctx->idx. */
 
@@ -828,6 +828,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	prog->jited = 1;
 out:
 	kfree(ctx.offset);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *prog)
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 3c0bfc1f2694..fcf301a889e7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1262,18 +1262,19 @@ void bpf_jit_compile(struct bpf_prog *fp)
 /*
  * Compile eBPF program "fp"
  */
-void bpf_int_jit_compile(struct bpf_prog *fp)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	struct bpf_binary_header *header;
 	struct bpf_jit jit;
 	int pass;
 
 	if (!bpf_jit_enable)
-		return;
+		return fp;
+
 	memset(&jit, 0, sizeof(jit));
 	jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
 	if (jit.addrs == NULL)
-		return;
+		return fp;
 	/*
 	 * Three initial passes:
 	 *   - 1/2: Determine clobbered registers
@@ -1305,6 +1306,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp)
 	}
 free_addrs:
 	kfree(jit.addrs);
+	return fp;
 }
 
 /*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f5bfd4fd28dd..6b2d23ea3590 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1073,7 +1073,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 {
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
 	int proglen, oldproglen = 0;
@@ -1084,11 +1084,11 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	int i;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
 	if (!addrs)
-		return;
+		return prog;
 
 	/* Before first pass, make a rough estimation of addrs[]
 	 * each bpf instruction is translated to less than 64 bytes
@@ -1140,6 +1140,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 out:
 	kfree(addrs);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *fp)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c4aae496f376..891852cf7716 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-int bpf_prog_select_runtime(struct bpf_prog *fp);
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
@@ -492,7 +492,8 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-void bpf_int_jit_compile(struct bpf_prog *fp);
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_skb_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49b5538a5301..70f0821aca47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 /**
  *	bpf_prog_select_runtime - select exec runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
+ *	@err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
  * The BPF program will be executed via BPF_PROG_RUN() macro.
  */
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
-	bpf_int_jit_compile(fp);
+	/* eBPF JITs can rewrite the program in case constant
+	 * blinding is active. However, in case of error during
+	 * blinding, bpf_int_jit_compile() must always return a
+	 * valid program, which in this case would simply not
+	 * be JITed, but falls back to the interpreter.
+	 */
+	fp = bpf_int_jit_compile(fp);
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
 	 * with JITed or non JITed program concatenations and not
 	 * all eBPF JITs might immediately support all features.
 	 */
-	return bpf_check_tail_call(fp);
+	*err = bpf_check_tail_call(fp);
+
+	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
@@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = {
 };
 
 /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	return prog;
 }
 
 bool __weak bpf_helper_changes_skb_data(void *func)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5e9f7ad13a..46ecce4b79ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	err = bpf_prog_select_runtime(prog);
+	prog = bpf_prog_select_runtime(prog, &err);
 	if (err < 0)
 		goto free_used_maps;
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8f22fbedc3a6..93f45011a59d 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		fp->type = BPF_PROG_TYPE_SOCKET_FILTER;
 		memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
 
-		bpf_prog_select_runtime(fp);
+		/* We cannot error here as we don't need type compatibility
+		 * checks.
+		 */
+		fp = bpf_prog_select_runtime(fp, err);
 		break;
 	}
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ea51b479cf02..68adb5f52110 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 		 */
 		goto out_err_free;
 
-	bpf_prog_select_runtime(fp);
+	/* We are guaranteed to never error here with cBPF to eBPF
+	 * transitions, since there's no issue with type compatibility
+	 * checks on program arrays.
+	 */
+	fp = bpf_prog_select_runtime(fp, &err);
 
 	kfree(old_prog);
 	return fp;
-- 
cgit v1.2.3


From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:32 +0200
Subject: bpf: add generic constant blinding for use in jits

This work adds a generic facility for use from eBPF JIT compilers
that allows for further hardening of JIT generated images through
blinding constants. In response to the original work on BPF JIT
spraying published by Keegan McAllister [1], most BPF JITs were
changed to make images read-only and start at a randomized offset
in the page, where the rest was filled with trap instructions. We
have this nowadays in x86, arm, arm64 and s390 JIT compilers.
Additionally, later work also made eBPF interpreter images read
only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86,
arm, arm64 and s390 archs as well currently. This is done by
default for mentioned JITs when JITing is enabled. Furthermore,
we had a generic and configurable constant blinding facility on our
todo for quite some time now to further make spraying harder, and
first implementation since around netconf 2016.

We found that for systems where untrusted users can load cBPF/eBPF
code where JIT is enabled, start offset randomization helps a bit
to make jumps into crafted payload harder, but in case where larger
programs that cross page boundary are injected, we again have some
part of the program opcodes at a page start offset. With improved
guessing and more reliable payload injection, chances can increase
to jump into such payload. Elena Reshetova recently wrote a test
case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which
can leave some more room for payloads. Note that for all this,
additional bugs in the kernel are still required to make the jump
(and of course to guess right, to not jump into a trap) and naturally
the JIT must be enabled, which is disabled by default.

For helping mitigation, the general idea is to provide an option
bpf_jit_harden that admins can tweak along with bpf_jit_enable, so
that for cases where JIT should be enabled for performance reasons,
the generated image can be further hardened with blinding constants
for unpriviledged users (bpf_jit_harden == 1), with trading off
performance for these, but not for privileged ones. We also added
the option of blinding for all users (bpf_jit_harden == 2), which
is quite helpful for testing f.e. with test_bpf.ko. There are no
further e.g. hardening levels of bpf_jit_harden switch intended,
rationale is to have it dead simple to use as on/off. Since this
functionality would need to be duplicated over and over for JIT
compilers to use, which are already complex enough, we provide a
generic eBPF byte-code level based blinding implementation, which is
then just transparently JITed. JIT compilers need to make only a few
changes to integrate this facility and can be migrated one by one.

This option is for eBPF JITs and will be used in x86, arm64, s390
without too much effort, and soon ppc64 JITs, thus that native eBPF
can be blinded as well as cBPF to eBPF migrations, so that both can
be covered with a single implementation. The rule for JITs is that
bpf_jit_blind_constants() must be called from bpf_int_jit_compile(),
and in case blinding is disabled, we follow normally with JITing the
passed program. In case blinding is enabled and we fail during the
process of blinding itself, we must return with the interpreter.
Similarly, in case the JITing process after the blinding failed, we
return normally to the interpreter with the non-blinded code. Meaning,
interpreter doesn't change in any way and operates on eBPF code as
usual. For doing this pre-JIT blinding step, we need to make use of
a helper/auxiliary register, here BPF_REG_AX. This is strictly internal
to the JIT and not in any way part of the eBPF architecture. Just like
in the same way as JITs internally make use of some helper registers
when emitting code, only that here the helper register is one
abstraction level higher in eBPF bytecode, but nevertheless in JIT
phase. That helper register is needed since f.e. manually written
program can issue loads to all registers of eBPF architecture.

The core concept with the additional register is: blind out all 32
and 64 bit constants by converting BPF_K based instructions into a
small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this
is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND,
and REG <OP> BPF_REG_AX, so actual operation on the target register
is translated from BPF_K into BPF_X one that is operating on
BPF_REG_AX's content. During rewriting phase when blinding, RND is
newly generated via prandom_u32() for each processed instruction.
64 bit loads are split into two 32 bit loads to make translation and
patching not too complex. Only basic thing required by JITs is to
call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair, and to map BPF_REG_AX into an unused register.

Small bpf_jit_disasm extract from [2] when applied to x86 JIT:

echo 0 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f5e9 + <x>:
  [...]
  39:   mov    $0xa8909090,%eax
  3e:   mov    $0xa8909090,%eax
  43:   mov    $0xa8ff3148,%eax
  48:   mov    $0xa89081b4,%eax
  4d:   mov    $0xa8900bb0,%eax
  52:   mov    $0xa810e0c1,%eax
  57:   mov    $0xa8908eb4,%eax
  5c:   mov    $0xa89020b0,%eax
  [...]

echo 1 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f1e5 + <x>:
  [...]
  39:   mov    $0xe1192563,%r10d
  3f:   xor    $0x4989b5f3,%r10d
  46:   mov    %r10d,%eax
  49:   mov    $0xb8296d93,%r10d
  4f:   xor    $0x10b9fd03,%r10d
  56:   mov    %r10d,%eax
  59:   mov    $0x8c381146,%r10d
  5f:   xor    $0x24c7200e,%r10d
  66:   mov    %r10d,%eax
  69:   mov    $0xeb2a830e,%r10d
  6f:   xor    $0x43ba02ba,%r10d
  76:   mov    %r10d,%eax
  79:   mov    $0xd9730af,%r10d
  7f:   xor    $0xa5073b1f,%r10d
  86:   mov    %r10d,%eax
  89:   mov    $0x9a45662b,%r10d
  8f:   xor    $0x325586ea,%r10d
  96:   mov    %r10d,%eax
  [...]

As can be seen, original constants that carry payload are hidden
when enabled, actual operations are transformed from constant-based
to register-based ones, making jumps into constants ineffective.
Above extract/example uses single BPF load instruction over and
over, but of course all instructions with constants are blinded.

Performance wise, JIT with blinding performs a bit slower than just
JIT and faster than interpreter case. This is expected, since we
still get all the performance benefits from JITing and in normal
use-cases not every single instruction needs to be blinded. Summing
up all 296 test cases averaged over multiple runs from test_bpf.ko
suite, interpreter was 55% slower than JIT only and JIT with blinding
was 8% slower than JIT only. Since there are also some extremes in
the test suite, I expect for ordinary workloads that the performance
for the JIT with blinding case is even closer to JIT only case,
f.e. nmap test case from suite has averaged timings in ns 29 (JIT),
35 (+ blinding), and 151 (interpreter).

BPF test suite, seccomp test suite, eBPF sample code and various
bigger networking eBPF programs have been tested with this and were
running fine. For testing purposes, I also adapted interpreter and
redirected blinded eBPF image to interpreter and also here all tests
pass.

  [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html
  [2] https://github.com/01org/jit-spray-poc-for-ksp/
  [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Elena Reshetova <elena.reshetova@intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt |  11 +++
 include/linux/filter.h       |  42 +++++++++
 kernel/bpf/core.c            | 203 +++++++++++++++++++++++++++++++++++++++++++
 net/Kconfig                  |   7 +-
 net/core/sysctl_net_core.c   |   9 ++
 5 files changed, 270 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 809ab6efcc74..f0480f7ea740 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -43,6 +43,17 @@ Values :
 	1 - enable the JIT
 	2 - enable the JIT and ask the compiler to emit traces on kernel log.
 
+bpf_jit_harden
+--------------
+
+This enables hardening for the Berkeley Packet Filter Just in Time compiler.
+Supported are eBPF JIT backends. Enabling hardening trades off performance,
+but can mitigate JIT spraying.
+Values :
+	0 - disable JIT hardening (default value)
+	1 - enable JIT hardening for unprivileged users only
+	2 - enable JIT hardening for all users
+
 dev_weight
 --------------
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 891852cf7716..6fc31ef1da2d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -13,6 +13,8 @@
 #include <linux/printk.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
+
 #include <net/sch_generic.h>
 
 #include <asm/cacheflush.h>
@@ -42,6 +44,15 @@ struct bpf_prog_aux;
 #define BPF_REG_X	BPF_REG_7
 #define BPF_REG_TMP	BPF_REG_8
 
+/* Kernel hidden auxiliary/helper register for hardening step.
+ * Only used by eBPF JITs. It's nothing more than a temporary
+ * register that JITs use internally, only that here it's part
+ * of eBPF instructions that have been rewritten for blinding
+ * constants. See JIT pre-step in bpf_jit_blind_constants().
+ */
+#define BPF_REG_AX		MAX_BPF_REG
+#define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
+
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
@@ -501,6 +512,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
+extern int bpf_jit_harden;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
@@ -513,6 +525,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr);
 void bpf_jit_compile(struct bpf_prog *fp);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);
+
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
 {
@@ -523,6 +538,33 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
 }
+
+static inline bool bpf_jit_is_ebpf(void)
+{
+# ifdef CONFIG_HAVE_EBPF_JIT
+	return true;
+# else
+	return false;
+# endif
+}
+
+static inline bool bpf_jit_blinding_enabled(void)
+{
+	/* These are the prerequisites, should someone ever have the
+	 * idea to call blinding outside of them, we make sure to
+	 * bail out.
+	 */
+	if (!bpf_jit_is_ebpf())
+		return false;
+	if (!bpf_jit_enable)
+		return false;
+	if (!bpf_jit_harden)
+		return false;
+	if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN))
+		return false;
+
+	return true;
+}
 #else
 static inline void bpf_jit_compile(struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 70f0821aca47..f1e8a0def99b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	module_memfree(hdr);
 }
+
+int bpf_jit_harden __read_mostly;
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+			      const struct bpf_insn *aux,
+			      struct bpf_insn *to_buff)
+{
+	struct bpf_insn *to = to_buff;
+	u32 imm_rnd = prandom_u32();
+	s16 off;
+
+	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+	if (from->imm == 0 &&
+	    (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
+	     from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+		*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+		goto out;
+	}
+
+	switch (from->code) {
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU | BPF_OR  | BPF_K:
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_OR  | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_JMP | BPF_JEQ  | BPF_K:
+	case BPF_JMP | BPF_JNE  | BPF_K:
+	case BPF_JMP | BPF_JGT  | BPF_K:
+	case BPF_JMP | BPF_JGE  | BPF_K:
+	case BPF_JMP | BPF_JSGT | BPF_K:
+	case BPF_JMP | BPF_JSGE | BPF_K:
+	case BPF_JMP | BPF_JSET | BPF_K:
+		/* Accommodate for extra offset in case of a backjump. */
+		off = from->off;
+		if (off < 0)
+			off -= 2;
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+		break;
+
+	case BPF_LD | BPF_ABS | BPF_W:
+	case BPF_LD | BPF_ABS | BPF_H:
+	case BPF_LD | BPF_ABS | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IND | BPF_W:
+	case BPF_LD | BPF_IND | BPF_H:
+	case BPF_LD | BPF_IND | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IMM | BPF_DW:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+		*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+		break;
+	case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_DW:
+	case BPF_ST | BPF_MEM | BPF_W:
+	case BPF_ST | BPF_MEM | BPF_H:
+	case BPF_ST | BPF_MEM | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+		break;
+	}
+out:
+	return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+					      gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_prog *fp;
+
+	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
+		/* aux->prog still points to the fp_other one, so
+		 * when promoting the clone to the real program,
+		 * this still needs to be adapted.
+		 */
+		memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+	}
+
+	return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+	/* aux was stolen by the other clone, so we cannot free
+	 * it from this path! It will be freed eventually by the
+	 * other program on release.
+	 *
+	 * At this point, we don't need a deferred release since
+	 * clone is guaranteed to not be locked.
+	 */
+	fp->aux = NULL;
+	__bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+	/* We have to repoint aux->prog to self, as we don't
+	 * know whether fp here is the clone or the original.
+	 */
+	fp->aux->prog = fp;
+	bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+	struct bpf_insn insn_buff[16], aux[2];
+	struct bpf_prog *clone, *tmp;
+	int insn_delta, insn_cnt;
+	struct bpf_insn *insn;
+	int i, rewritten;
+
+	if (!bpf_jit_blinding_enabled())
+		return prog;
+
+	clone = bpf_prog_clone_create(prog, GFP_USER);
+	if (!clone)
+		return ERR_PTR(-ENOMEM);
+
+	insn_cnt = clone->len;
+	insn = clone->insnsi;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		/* We temporarily need to hold the original ld64 insn
+		 * so that we can still access the first part in the
+		 * second blinding run.
+		 */
+		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    insn[1].code == 0)
+			memcpy(aux, insn, sizeof(aux));
+
+		rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+		if (!rewritten)
+			continue;
+
+		tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+		if (!tmp) {
+			/* Patching may have repointed aux->prog during
+			 * realloc from the original one, so we need to
+			 * fix it up here on error.
+			 */
+			bpf_jit_prog_release_other(prog, clone);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		clone = tmp;
+		insn_delta = rewritten - 1;
+
+		/* Walk new program and skip insns we just inserted. */
+		insn = clone->insnsi + i + insn_delta;
+		insn_cnt += insn_delta;
+		i        += insn_delta;
+	}
+
+	return clone;
+}
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
diff --git a/net/Kconfig b/net/Kconfig
index f7148f24f114..ff40562a782c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -295,8 +295,11 @@ config BPF_JIT
 	  Berkeley Packet Filter filtering capabilities are normally handled
 	  by an interpreter. This option allows kernel to generate a native
 	  code when filter is loaded in memory. This should speedup
-	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
-	  this feature changing /proc/sys/net/core/bpf_jit_enable
+	  packet sniffing (libpcap/tcpdump).
+
+	  Note, admin should enable this feature changing:
+	  /proc/sys/net/core/bpf_jit_enable
+	  /proc/sys/net/core/bpf_jit_harden (optional)
 
 config NET_FLOW_LIMIT
 	bool
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a6beb7b6ae55..0df2aa652530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+# ifdef CONFIG_HAVE_EBPF_JIT
+	{
+		.procname	= "bpf_jit_harden",
+		.data		= &bpf_jit_harden,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
+# endif
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-- 
cgit v1.2.3


From 92efb1bd9bcbdf83cc0e6cfead752d0c82f63677 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 16 May 2016 15:12:02 -0500
Subject: PCI: Identify Enhanced Allocation (EA) BAR Equivalent resources in
 sysfs

Resource flags are exposed to userspace via the sysfs "resource" file.
lspci reads the sysfs file to determine resource properties.

Add a "BAR Equivalent Indicator" flag so lspci can distinguish between
[virtual] and [enhanced] resources.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Sean O. Stalley <sean.stalley@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c      | 2 +-
 include/linux/ioport.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 25e0327d4429..7b008c7cac35 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2228,7 +2228,7 @@ void pci_pm_init(struct pci_dev *dev)
 
 static unsigned long pci_ea_flags(struct pci_dev *dev, u8 prop)
 {
-	unsigned long flags = IORESOURCE_PCI_FIXED;
+	unsigned long flags = IORESOURCE_PCI_FIXED | IORESOURCE_PCI_EA_BEI;
 
 	switch (prop) {
 	case PCI_EA_P_MEM:
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 0b65543dc6cf..6230064d7f95 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -26,6 +26,9 @@ struct resource {
 
 /*
  * IO resources have these defined flags.
+ *
+ * PCI devices expose these flags to userspace in the "resource" sysfs file,
+ * so don't move them.
  */
 #define IORESOURCE_BITS		0x000000ff	/* Bus-specific bits */
 
@@ -110,6 +113,7 @@ struct resource {
 
 /* PCI control bits.  Shares IORESOURCE_BITS with above PCI ROM.  */
 #define IORESOURCE_PCI_FIXED		(1<<4)	/* Do not move resource */
+#define IORESOURCE_PCI_EA_BEI		(1<<5)	/* BAR Equivalent Indicator */
 
 /*
  * I/O Resource Descriptors
-- 
cgit v1.2.3


From e4b2749158631e6d74bf14d2ef3804d780e2f770 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 11 May 2016 11:58:47 +0200
Subject: DAX: move RADIX_DAX_ definitions to dax.c

These don't belong in radix-tree.c any more than PAGECACHE_TAG_* do.
Let's try to maintain the idea that radix-tree simply implements an
abstract data type.

Acked-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/dax.c                   | 9 +++++++++
 include/linux/radix-tree.h | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 2494255c5785..7ef5aef78442 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,15 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
+#define RADIX_DAX_MASK	0xf
+#define RADIX_DAX_SHIFT	4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 51a97ac8bfbf..d08d6ec3bf53 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -52,15 +52,6 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
-
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
-- 
cgit v1.2.3


From 02fbd139759feb1f331cebd858523b5d774082e6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 11 May 2016 11:58:48 +0200
Subject: dax: Remove complete_unwritten argument

Fault handlers currently take complete_unwritten argument to convert
unwritten extents after PTEs are updated. However no filesystem uses
this anymore as the code is racy. Remove the unused argument.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/block_dev.c      |  4 ++--
 fs/dax.c            | 43 +++++++++----------------------------------
 fs/ext2/file.c      |  4 ++--
 fs/ext4/file.c      |  4 ++--
 fs/xfs/xfs_file.c   |  7 +++----
 include/linux/dax.h | 17 +++++++----------
 include/linux/fs.h  |  1 -
 7 files changed, 25 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..b25bb230b28a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1746,7 +1746,7 @@ static const struct address_space_operations def_blk_aops = {
  */
 static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+	return __dax_fault(vma, vmf, blkdev_get_block);
 }
 
 static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
@@ -1758,7 +1758,7 @@ static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
 static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, unsigned int flags)
 {
-	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block);
 }
 
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
diff --git a/fs/dax.c b/fs/dax.c
index 7ef5aef78442..83e7894d86d8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -612,19 +612,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- *	to written so the data written to them is exposed. This is required for
- *	required by write faults for filesystems that will return unwritten
- *	extent mappings from @get_block, but it is optional for reads as
- *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
- *	not support unwritten extents, the it should pass NULL.
  *
  * When a page fault occurs, filesystems may call this helper in their
  * fault handler for DAX files. __dax_fault() assumes the caller has done all
  * the necessary locking for the page fault to proceed successfully.
  */
 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block, dax_iodone_t complete_unwritten)
+			get_block_t get_block)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -727,23 +721,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page = NULL;
 	}
 
-	/*
-	 * If we successfully insert the new mapping over an unwritten extent,
-	 * we need to ensure we convert the unwritten extent. If there is an
-	 * error inserting the mapping, the filesystem needs to leave it as
-	 * unwritten to prevent exposure of the stale underlying data to
-	 * userspace, but we still need to call the completion function so
-	 * the private resources on the mapping buffer can be released. We
-	 * indicate what the callback should do via the uptodate variable, same
-	 * as for normal BH based IO completions.
-	 */
+	/* Filesystem should not return unwritten buffers to us! */
+	WARN_ON_ONCE(buffer_unwritten(&bh));
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
-	if (buffer_unwritten(&bh)) {
-		if (complete_unwritten)
-			complete_unwritten(&bh, !error);
-		else
-			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
-	}
 
  out:
 	if (error == -ENOMEM)
@@ -772,7 +752,7 @@ EXPORT_SYMBOL(__dax_fault);
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-	      get_block_t get_block, dax_iodone_t complete_unwritten)
+	      get_block_t get_block)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +761,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+	result = __dax_fault(vma, vmf, get_block);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
@@ -815,8 +795,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
 
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, get_block_t get_block,
-		dax_iodone_t complete_unwritten)
+		pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -875,6 +854,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		if (get_block(inode, block, &bh, 1) != 0)
 			return VM_FAULT_SIGBUS;
 		alloc = true;
+		WARN_ON_ONCE(buffer_unwritten(&bh));
 	}
 
 	bdev = bh.b_bdev;
@@ -1020,9 +1000,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
  out:
 	i_mmap_unlock_read(mapping);
 
-	if (buffer_unwritten(&bh))
-		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
 	return result;
 
  fallback:
@@ -1042,8 +1019,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
  * pmd_fault handler for DAX files.
  */
 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-			pmd_t *pmd, unsigned int flags, get_block_t get_block,
-			dax_iodone_t complete_unwritten)
+			pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +1028,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
-				complete_unwritten);
+	result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
 	if (flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+	ret = __dax_fault(vma, vmf, ext2_get_block);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	}
 	down_read(&ei->dax_sem);
 
-	ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
 
 	up_read(&ei->dax_sem);
 	if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 37e28082885a..7a6398867ff2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
 	else
-		result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
+		result = __dax_fault(vma, vmf, ext4_dax_get_block);
 
 	if (write) {
 		if (!IS_ERR(handle))
@@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		result = VM_FAULT_SIGBUS;
 	else
 		result = __dax_pmd_fault(vma, addr, pmd, flags,
-					 ext4_dax_get_block, NULL);
+					 ext4_dax_get_block);
 
 	if (write) {
 		if (!IS_ERR(handle))
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 569938a4a357..c2946f436a3a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1558,7 +1558,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 	} else {
 		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
 		ret = block_page_mkwrite_return(ret);
@@ -1592,7 +1592,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1629,8 +1629,7 @@ xfs_filemap_pmd_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
-			      NULL);
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 636dd59ab505..7c45ac7ea1d1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -10,10 +10,8 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-		dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-		dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
@@ -27,21 +25,20 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-				unsigned int flags, get_block_t, dax_iodone_t);
+				unsigned int flags, get_block_t);
 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-				unsigned int flags, get_block_t, dax_iodone_t);
+				unsigned int flags, get_block_t);
 #else
 static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-				pmd_t *pmd, unsigned int flags, get_block_t gb,
-				dax_iodone_t di)
+				pmd_t *pmd, unsigned int flags, get_block_t gb)
 {
 	return VM_FAULT_FALLBACK;
 }
 #define __dax_pmd_fault dax_pmd_fault
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb)	__dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..9f2813090d1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC		0x00000001
 #define MAY_WRITE		0x00000002
-- 
cgit v1.2.3


From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 Apr 2016 12:30:53 -0300
Subject: perf core: Pass max stack as a perf_callchain_entry context

This makes perf_callchain_{user,kernel}() receive the max stack
as context for the perf_callchain_entry, instead of accessing
the global sysctl_perf_event_max_stack.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/arc/kernel/perf_event.c       |  6 +++---
 arch/arm/kernel/perf_callchain.c   | 10 +++++-----
 arch/arm64/kernel/perf_callchain.c | 14 +++++++-------
 arch/metag/kernel/perf_callchain.c | 10 +++++-----
 arch/mips/kernel/perf_event.c      | 12 ++++++------
 arch/powerpc/perf/callchain.c      | 14 +++++++-------
 arch/s390/kernel/perf_event.c      |  4 ++--
 arch/sh/kernel/perf_callchain.c    |  4 ++--
 arch/sparc/kernel/perf_event.c     | 14 +++++++-------
 arch/tile/kernel/perf_event.c      |  6 +++---
 arch/x86/events/core.c             | 14 +++++++-------
 arch/xtensa/kernel/perf_event.c    | 10 +++++-----
 include/linux/perf_event.h         | 16 +++++++++++-----
 kernel/bpf/stackmap.c              |  3 ++-
 kernel/events/callchain.c          | 20 ++++++++++++--------
 15 files changed, 84 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c
index 8b134cfe5e1f..6fd48021324b 100644
--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
@@ -48,7 +48,7 @@ struct arc_callchain_trace {
 static int callchain_trace(unsigned int addr, void *data)
 {
 	struct arc_callchain_trace *ctrl = data;
-	struct perf_callchain_entry *entry = ctrl->perf_stuff;
+	struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff;
 	perf_callchain_store(entry, addr);
 
 	if (ctrl->depth++ < 3)
@@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct arc_callchain_trace ctrl = {
 		.depth = 0,
@@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	/*
 	 * User stack can't be unwound trivially with kernel dwarf unwinder
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 27563befa8a2..bc552e813e7b 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct frame_tail buftail;
 	unsigned long err;
@@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct frame_tail __user *tail;
 
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < sysctl_perf_event_max_stack) &&
+	while ((entry->entry->nr < entry->max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
@@ -89,13 +89,13 @@ static int
 callchain_trace(struct stackframe *fr,
 		void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, fr->pc);
 	return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 32c3c6e70119..0d60150057cf 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct frame_tail buftail;
 	unsigned long err;
@@ -76,7 +76,7 @@ struct compat_frame_tail {
 
 static struct compat_frame_tail __user *
 compat_user_backtrace(struct compat_frame_tail __user *tail,
-		      struct perf_callchain_entry *entry)
+		      struct perf_callchain_entry_ctx *entry)
 {
 	struct compat_frame_tail buftail;
 	unsigned long err;
@@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 }
 #endif /* CONFIG_COMPAT */
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 			 struct pt_regs *regs)
 {
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < sysctl_perf_event_max_stack &&
+		while (entry->entry->nr < entry->max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < sysctl_perf_event_max_stack) &&
+		while ((entry->entry->nr < entry->max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
@@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
  */
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, frame->pc);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	struct stackframe frame;
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 252abc12a5a3..b3261a98b15b 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr)
 
 static struct metag_frame __user *
 user_backtrace(struct metag_frame __user *user_frame,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct metag_frame frame;
 	unsigned long calladdr;
@@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	unsigned long sp = regs->ctx.AX[0].U0;
 	struct metag_frame __user *frame;
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
+	while ((entry->entry->nr < entry->max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
@@ -78,13 +78,13 @@ static int
 callchain_trace(struct stackframe *fr,
 		void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, fr->pc);
 	return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index 5021c546ad07..22395c7d7030 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -25,8 +25,8 @@
  * the user stack callchains, we will add it here.
  */
 
-static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
-	unsigned long reg29)
+static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
+				    unsigned long reg29)
 {
 	unsigned long *sp = (unsigned long *)reg29;
 	unsigned long addr;
@@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= sysctl_perf_event_max_stack)
+			if (entry->entry->nr >= entry->max_stack)
 				break;
 		}
 	}
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
-		      struct pt_regs *regs)
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
+			   struct pt_regs *regs)
 {
 	unsigned long sp = regs->regs[29];
 #ifdef CONFIG_KALLSYMS
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= sysctl_perf_event_max_stack)
+		if (entry->entry->nr >= entry->max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 22d9015c1acc..c9260c1dfdbc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
 	unsigned long next_ip;
@@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp)
 		puc == (unsigned long) &sf->uc;
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
 	return rc;
 }
 
-static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 					  struct pt_regs *regs)
 {
 }
@@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
 	return mctx->mc_gregs;
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned int sp, next_sp;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
@@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	if (current_is_64bit())
 		perf_callchain_user_64(entry, regs);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index c3e4099b60a5..87035fa58bbe 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register);
 
 static int __perf_callchain_kernel(void *data, unsigned long address)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	perf_callchain_store(entry, address);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	if (user_mode(regs))
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c
index cc80b614b5fa..fa2c0cd23eaa 100644
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name)
 
 static void callchain_address(void *data, unsigned long addr, int reliable)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	if (reliable)
 		perf_callchain_store(entry, addr);
@@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	perf_callchain_store(entry, regs->pc);
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index a4b8b5aed21c..bcc5376db74b 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void)
 }
 pure_initcall(init_hw_perf_events);
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	unsigned long ksp, fp;
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size)
 	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long ufp;
@@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long ufp;
@@ -1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	u64 saved_fault_address = current_thread_info()->fault_address;
 	u8 saved_fault_code = get_thread_fault_code();
diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c
index 8767060d70fb..6394c1ccb68e 100644
--- a/arch/tile/kernel/perf_event.c
+++ b/arch/tile/kernel/perf_event.c
@@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events);
 /*
  * Tile specific backtracing code for perf_events.
  */
-static inline void perf_callchain(struct perf_callchain_entry *entry,
+static inline void perf_callchain(struct perf_callchain_entry_ctx *entry,
 		    struct pt_regs *regs)
 {
 	struct KBacktraceIterator kbt;
@@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry,
 	}
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 		    struct pt_regs *regs)
 {
 	perf_callchain(entry, regs);
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 		      struct pt_regs *regs)
 {
 	perf_callchain(entry, regs);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5e5e76a52f58..07f2b01cfb72 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name)
 
 static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	return perf_callchain_store(entry, addr);
 }
@@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
 		/* TODO: We don't support guest os callchain now */
@@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment)
 #include <asm/compat.h>
 
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
 	unsigned long ss_base, cs_base;
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #else
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
     return 0;
 }
 #endif
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stack_frame frame;
 	const void __user *fp;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index a6b00b3af429..ef90479e0397 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event)
 
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	perf_callchain_store(entry, frame->pc);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
+	xtensa_backtrace_kernel(regs, entry->max_stack,
 				callchain_trace, NULL, entry);
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
+	xtensa_backtrace_user(regs, entry->max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9e1c3ada91c4..dbd18246b36e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -61,6 +61,11 @@ struct perf_callchain_entry {
 	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
+struct perf_callchain_entry_ctx {
+	struct perf_callchain_entry *entry;
+	u32			    max_stack;
+};
+
 struct perf_raw_record {
 	u32				size;
 	void				*data;
@@ -1063,19 +1068,20 @@ extern void perf_event_fork(struct task_struct *tsk);
 /* Callchains */
 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
-extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
-extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
 
-static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	if (entry->nr < sysctl_perf_event_max_stack) {
+	struct perf_callchain_entry *entry = ctx->entry;
+	if (entry->nr < ctx->max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f5a19548be12..a82d7605db3f 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
-	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 7fc89939ede9..af95ad92893a 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex);
 static struct callchain_cpus_entries *callchain_cpus_entries;
 
 
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 				  struct pt_regs *regs)
 {
 }
 
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 				struct pt_regs *regs)
 {
 }
@@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return NULL;
 
-	return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark)
 {
 	struct perf_callchain_entry *entry;
+	struct perf_callchain_entry_ctx ctx;
 	int rctx;
 
 	entry = get_callchain_entry(&rctx);
@@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	if (!entry)
 		goto exit_put;
 
+	ctx.entry     = entry;
+	ctx.max_stack = max_stack;
+
 	entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(entry, regs);
+			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
@@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(entry, PERF_CONTEXT_USER);
-			perf_callchain_user(entry, regs);
+				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+			perf_callchain_user(&ctx, regs);
 		}
 	}
 
-- 
cgit v1.2.3


From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 May 2016 18:08:32 -0300
Subject: perf core: Add a 'nr' field to perf_event_callchain_context

We will use it to count how many addresses are in the entry->ip[] array,
excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really
return the number of entries specified by the user via the relevant
sysctl, kernel.perf_event_max_contexts, or via the per event
perf_event_attr.sample_max_stack knob.

This way we keep the perf_sample->ip_callchain->nr meaning, that is the
number of entries, be it real addresses or PERF_CONTEXT_ entries, while
honouring the max_stack knobs, i.e. the end result will be max_stack
entries if we have at least that many entries in a given stack trace.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/arm/kernel/perf_callchain.c   | 2 +-
 arch/arm64/kernel/perf_callchain.c | 4 ++--
 arch/metag/kernel/perf_callchain.c | 2 +-
 arch/mips/kernel/perf_event.c      | 4 ++--
 arch/powerpc/perf/callchain.c      | 4 ++--
 arch/sparc/kernel/perf_event.c     | 6 +++---
 arch/x86/events/core.c             | 4 ++--
 include/linux/perf_event.h         | 6 ++++--
 kernel/events/callchain.c          | 3 +--
 9 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index bc552e813e7b..22bf1f64d99a 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->entry->nr < entry->max_stack) &&
+	while ((entry->nr < entry->max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 0d60150057cf..713ca824f266 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->entry->nr < entry->max_stack &&
+		while (entry->nr < entry->max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->entry->nr < entry->max_stack) &&
+		while ((entry->nr < entry->max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index b3261a98b15b..3e8e048040df 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	--frame;
 
-	while ((entry->entry->nr < entry->max_stack) && frame)
+	while ((entry->nr < entry->max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index 22395c7d7030..d64056e0bb56 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->entry->nr >= entry->max_stack)
+			if (entry->nr >= entry->max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->entry->nr >= entry->max_stack)
+		if (entry->nr >= entry->max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index c9260c1dfdbc..f68f213dc36c 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index bcc5376db74b..710f3278d448 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			}
 		}
 #endif
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 07f2b01cfb72..5de96a18cd9c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 		return;
 
 	pagefault_disable();
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dbd18246b36e..3803bb1a862b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -64,6 +64,7 @@ struct perf_callchain_entry {
 struct perf_callchain_entry_ctx {
 	struct perf_callchain_entry *entry;
 	u32			    max_stack;
+	u32			    nr;
 };
 
 struct perf_raw_record {
@@ -1080,9 +1081,10 @@ extern int sysctl_perf_event_max_stack;
 
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	struct perf_callchain_entry *entry = ctx->entry;
-	if (entry->nr < ctx->max_stack) {
+	if (ctx->nr < ctx->max_stack) {
+		struct perf_callchain_entry *entry = ctx->entry;
 		entry->ip[entry->nr++] = ip;
+		++ctx->nr;
 		return 0;
 	} else {
 		return -1; /* no more room, stop walking the stack */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index af95ad92893a..8774ff86debb 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
-
-	entry->nr = init_nr;
+	ctx.nr	      = entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-- 
cgit v1.2.3


From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:01:50 -0300
Subject: perf core: Add perf_callchain_store_context() helper

We need have different helpers to account how many contexts we have in
the sample and for real addresses, so do it now as a prep patch, to
ease review.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/powerpc/perf/callchain.c | 6 +++---
 include/linux/perf_event.h    | 2 ++
 kernel/events/callchain.c     | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index f68f213dc36c..f62597dbd757 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 			next_ip = regs->nip;
 			lr = regs->link;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+			perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL);
 
 		} else {
 			if (level == 0)
@@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 			    read_user_stack_64(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store_context(entry, PERF_CONTEXT_USER);
 			perf_callchain_store(entry, next_ip);
 			continue;
 		}
@@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 			    read_user_stack_32(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store_context(entry, PERF_CONTEXT_USER);
 			perf_callchain_store(entry, next_ip);
 			continue;
 		}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3803bb1a862b..2024b14cc2b1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1079,6 +1079,8 @@ extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
 
+#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context)
+
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
 	if (ctx->nr < ctx->max_stack) {
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 8774ff86debb..ca645736a983 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
 		perf_callchain_kernel(&ctx, regs);
 	}
 
@@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 			perf_callchain_user(&ctx, regs);
 		}
 	}
-- 
cgit v1.2.3


From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:06:21 -0300
Subject: perf core: Separate accounting of contexts and real addresses in a
 stack trace

The perf_sample->ip_callchain->nr value includes all the entries in the
ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc},
while what the user expects is that what is in the kernel.perf_event_max_stack
sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be
honoured in terms of IP addresses in the stack trace.

So allocate a bunch of extra entries for contexts, and do the accounting
via perf_callchain_entry_ctx struct members.

A new sysctl, kernel.perf_event_max_contexts_per_stack is also
introduced for investigating possible bugs in the callchain
implementation by some arch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 include/linux/perf_event.h      | 18 ++++++++++++++++--
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/callchain.c       | 10 +++++++++-
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index daabdd7ee543..a3683ce2a2f3 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
+- perf_event_max_contexts_per_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -668,6 +669,19 @@ The default value is 127.
 
 ==============================================================
 
+perf_event_max_contexts_per_stack:
+
+Controls maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2024b14cc2b1..6b87be908790 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -65,6 +65,8 @@ struct perf_callchain_entry_ctx {
 	struct perf_callchain_entry *entry;
 	u32			    max_stack;
 	u32			    nr;
+	short			    contexts;
+	bool			    contexts_maxed;
 };
 
 struct perf_raw_record {
@@ -1078,12 +1080,24 @@ extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
 
-#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context)
+static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
+{
+	if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
+		struct perf_callchain_entry *entry = ctx->entry;
+		entry->ip[entry->nr++] = ip;
+		++ctx->contexts;
+		return 0;
+	} else {
+		ctx->contexts_maxed = true;
+		return -1; /* no more room, stop walking the stack */
+	}
+}
 
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	if (ctx->nr < ctx->max_stack) {
+	if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
 		struct perf_callchain_entry *entry = ctx->entry;
 		entry->ip[entry->nr++] = ip;
 		++ctx->nr;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 43fc8d213472..36ce552cf6a9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -862,6 +862,7 @@ enum perf_event_type {
 };
 
 #define PERF_MAX_STACK_DEPTH		127
+#define PERF_MAX_CONTEXTS_PER_STACK	  8
 
 enum perf_callchain_context {
 	PERF_CONTEXT_HV			= (__u64)-32,
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index ca645736a983..179ef4640964 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
 };
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
 
 static inline size_t perf_callchain_entry__sizeof(void)
 {
 	return (sizeof(struct perf_callchain_entry) +
-		sizeof(__u64) * sysctl_perf_event_max_stack);
+		sizeof(__u64) * (sysctl_perf_event_max_stack +
+				 sysctl_perf_event_max_contexts_per_stack));
 }
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
 	ctx.nr	      = entry->nr = init_nr;
+	ctx.contexts       = 0;
+	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
@@ -228,6 +232,10 @@ exit_put:
 	return entry;
 }
 
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ec6907a16b3..bec4c11c47d6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &six_hundred_forty_kb,
 	},
+	{
+		.procname	= "perf_event_max_contexts_per_stack",
+		.data		= &sysctl_perf_event_max_contexts_per_stack,
+		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &one_thousand,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
cgit v1.2.3


From 2af3a8159cd204fc8437ed2f75863f0fb930f0d0 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 10 May 2016 10:23:52 -0600
Subject: block: Add vfs_msg() interface

In preparation of moving DAX capability checks to the block layer
from filesystem code, add a VFS message interface that aligns with
filesystem's message format.

For instance, a vfs_msg() message followed by XFS messages in case
of a dax mount error may look like:

  VFS (pmem0p1): error: unaligned partition for dax
  XFS (pmem0p1): DAX unsupported by block device. Turning off DAX.
  XFS (pmem0p1): Mounting V5 Filesystem
   :

vfs_msg() is largely based on ext4_msg().

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@fb.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/block_dev.c         | 12 ++++++++++++
 include/linux/blkdev.h | 11 +++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b25bb230b28a..91e0ec0233c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,6 +50,18 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+	va_end(args);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
 	struct inode *inode = bdev->bd_inode;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 669e419d6234..78c48ab22f46 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -767,6 +767,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
 }
 #endif
 
+#ifdef CONFIG_PRINTK
+#define vfs_msg(sb, level, fmt, ...)				\
+	__vfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#else
+#define vfs_msg(sb, level, fmt, ...)				\
+do {								\
+	no_printk(fmt, ##__VA_ARGS__);				\
+	__vfs_msg(sb, "", " ");					\
+} while (0)
+#endif
+
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
-- 
cgit v1.2.3


From 2d96afc8f70ef86c66a0b5d80c24a27d6dd13df3 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 10 May 2016 10:23:53 -0600
Subject: block: Add bdev_dax_supported() for dax mount checks

DAX imposes additional requirements to a device.  Add
bdev_dax_supported() which performs all the precondition checks
necessary for filesystem to mount the device with dax option.

Also add a new check to verify if a partition is aligned by 4KB.
When a partition is unaligned, any dax read/write access fails,
except for metadata update.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@fb.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/block_dev.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  1 +
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 91e0ec0233c0..518cde62c01c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -509,6 +509,51 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+	struct blk_dax_ctl dax = {
+		.sector = 0,
+		.size = PAGE_SIZE,
+	};
+	int err;
+
+	if (blocksize != PAGE_SIZE) {
+		vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+		return -EINVAL;
+	}
+
+	err = bdev_direct_access(sb->s_bdev, &dax);
+	if (err < 0) {
+		switch (err) {
+		case -EOPNOTSUPP:
+			vfs_msg(sb, KERN_ERR,
+				"error: device does not support dax");
+			break;
+		case -EINVAL:
+			vfs_msg(sb, KERN_ERR,
+				"error: unaligned partition for dax");
+			break;
+		default:
+			vfs_msg(sb, KERN_ERR,
+				"error: dax access failed (%d)", err);
+		}
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
 /*
  * pseudo-fs
  */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 78c48ab22f46..71231a55debd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1688,6 +1688,7 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+extern int bdev_dax_supported(struct super_block *, int);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.2.3


From a8078b1fc616da6112eb95f0063cd34531d4ccf0 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 10 May 2016 10:23:57 -0600
Subject: block: Update blkdev_dax_capable() for consistency

blkdev_dax_capable() is similar to bdev_dax_supported(), but needs
to remain as a separate interface for checking dax capability of
a raw block device.

Rename and relocate blkdev_dax_capable() to keep them maintained
consistently, and call bdev_direct_access() for the dax capability
check.

There is no change in the behavior.

Link: https://lkml.org/lkml/2016/5/9/950
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@fb.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 block/ioctl.c          | 30 ------------------------------
 fs/block_dev.c         | 39 +++++++++++++++++++++++++++++++++++++--
 include/linux/blkdev.h |  1 +
 include/linux/fs.h     |  8 --------
 4 files changed, 38 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 4ff1f92f89ca..7eeda072dc70 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,7 +4,6 @@
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
-#include <linux/badblocks.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/blktrace_api.h>
@@ -407,35 +406,6 @@ static inline int is_unrecognized_ioctl(int ret)
 		ret == -ENOIOCTLCMD;
 }
 
-#ifdef CONFIG_FS_DAX
-bool blkdev_dax_capable(struct block_device *bdev)
-{
-	struct gendisk *disk = bdev->bd_disk;
-
-	if (!disk->fops->direct_access)
-		return false;
-
-	/*
-	 * If the partition is not aligned on a page boundary, we can't
-	 * do dax I/O to it.
-	 */
-	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-		return false;
-
-	/*
-	 * If the device has known bad blocks, force all I/O through the
-	 * driver / page cache.
-	 *
-	 * TODO: support finer grained dax error handling
-	 */
-	if (disk->bb && disk->bb->count)
-		return false;
-
-	return true;
-}
-#endif
-
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
 		unsigned cmd, unsigned long arg)
 {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 518cde62c01c..8477d4501b1e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/cleancache.h>
 #include <linux/dax.h>
+#include <linux/badblocks.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -554,6 +555,40 @@ int bdev_dax_supported(struct super_block *sb, int blocksize)
 }
 EXPORT_SYMBOL_GPL(bdev_dax_supported);
 
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct blk_dax_ctl dax = {
+		.size = PAGE_SIZE,
+	};
+
+	if (!IS_ENABLED(CONFIG_FS_DAX))
+		return false;
+
+	dax.sector = 0;
+	if (bdev_direct_access(bdev, &dax) < 0)
+		return false;
+
+	dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+	if (bdev_direct_access(bdev, &dax) < 0)
+		return false;
+
+	/*
+	 * If the device has known bad blocks, force all I/O through the
+	 * driver / page cache.
+	 *
+	 * TODO: support finer grained dax error handling
+	 */
+	if (disk->bb && disk->bb->count)
+		return false;
+
+	return true;
+}
+
 /*
  * pseudo-fs
  */
@@ -1295,7 +1330,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-				if (!blkdev_dax_capable(bdev))
+				if (!bdev_dax_capable(bdev))
 					bdev->bd_inode->i_flags &= ~S_DAX;
 			}
 
@@ -1332,7 +1367,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-			if (!blkdev_dax_capable(bdev))
+			if (!bdev_dax_capable(bdev))
 				bdev->bd_inode->i_flags &= ~S_DAX;
 		}
 	} else {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 71231a55debd..27cbefe8c985 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1689,6 +1689,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
 extern int bdev_dax_supported(struct super_block *, int);
+extern bool bdev_dax_capable(struct block_device *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9f2813090d1b..17f934fcf564 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2319,14 +2319,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-#ifdef CONFIG_FS_DAX
-extern bool blkdev_dax_capable(struct block_device *bdev);
-#else
-static inline bool blkdev_dax_capable(struct block_device *bdev)
-{
-	return false;
-}
-#endif
 
 extern struct super_block *blockdev_superblock;
 
-- 
cgit v1.2.3


From 459a25afe97cb3d7f978b90c881f4d7aac8fb755 Mon Sep 17 00:00:00 2001
From: Boris BREZILLON <boris.brezillon@free-electrons.com>
Date: Wed, 30 Mar 2016 22:03:27 +0200
Subject: pwm: Get rid of pwm->lock

PWM devices are not protected against concurrent accesses. The lock in
struct pwm_device might let PWM users think it is, but it's actually
only protecting the enabled state.

Removing this lock should be fine as long as all PWM users are aware
that accesses to the PWM device have to be serialized, which seems to be
the case for all of them except the sysfs interface. Patch the sysfs
code by adding a lock to the pwm_export struct and making sure it's
taken for all relevant accesses to the exported PWM device.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 19 ++++---------------
 drivers/pwm/sysfs.c | 24 ++++++++++++++++++++----
 include/linux/pwm.h |  2 --
 3 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 22cf3959041c..cb762cf51332 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -269,7 +269,6 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 		pwm->pwm = chip->base + i;
 		pwm->hwpwm = i;
 		pwm->polarity = polarity;
-		mutex_init(&pwm->lock);
 
 		radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
 	}
@@ -474,22 +473,16 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
 	if (!pwm->chip->ops->set_polarity)
 		return -ENOSYS;
 
-	mutex_lock(&pwm->lock);
-
-	if (pwm_is_enabled(pwm)) {
-		err = -EBUSY;
-		goto unlock;
-	}
+	if (pwm_is_enabled(pwm))
+		return -EBUSY;
 
 	err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
 	if (err)
-		goto unlock;
+		return err;
 
 	pwm->polarity = polarity;
 
-unlock:
-	mutex_unlock(&pwm->lock);
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pwm_set_polarity);
 
@@ -506,16 +499,12 @@ int pwm_enable(struct pwm_device *pwm)
 	if (!pwm)
 		return -EINVAL;
 
-	mutex_lock(&pwm->lock);
-
 	if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) {
 		err = pwm->chip->ops->enable(pwm->chip, pwm);
 		if (err)
 			clear_bit(PWMF_ENABLED, &pwm->flags);
 	}
 
-	mutex_unlock(&pwm->lock);
-
 	return err;
 }
 EXPORT_SYMBOL_GPL(pwm_enable);
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 9c90886f4123..187ca0875cf6 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -26,6 +26,7 @@
 struct pwm_export {
 	struct device child;
 	struct pwm_device *pwm;
+	struct mutex lock;
 };
 
 static struct pwm_export *child_to_pwm_export(struct device *child)
@@ -53,7 +54,8 @@ static ssize_t period_store(struct device *child,
 			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
-	struct pwm_device *pwm = child_to_pwm_device(child);
+	struct pwm_export *export = child_to_pwm_export(child);
+	struct pwm_device *pwm = export->pwm;
 	unsigned int val;
 	int ret;
 
@@ -61,7 +63,9 @@ static ssize_t period_store(struct device *child,
 	if (ret)
 		return ret;
 
+	mutex_lock(&export->lock);
 	ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val);
+	mutex_unlock(&export->lock);
 
 	return ret ? : size;
 }
@@ -79,7 +83,8 @@ static ssize_t duty_cycle_store(struct device *child,
 				struct device_attribute *attr,
 				const char *buf, size_t size)
 {
-	struct pwm_device *pwm = child_to_pwm_device(child);
+	struct pwm_export *export = child_to_pwm_export(child);
+	struct pwm_device *pwm = export->pwm;
 	unsigned int val;
 	int ret;
 
@@ -87,7 +92,9 @@ static ssize_t duty_cycle_store(struct device *child,
 	if (ret)
 		return ret;
 
+	mutex_lock(&export->lock);
 	ret = pwm_config(pwm, val, pwm_get_period(pwm));
+	mutex_unlock(&export->lock);
 
 	return ret ? : size;
 }
@@ -105,13 +112,16 @@ static ssize_t enable_store(struct device *child,
 			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
-	struct pwm_device *pwm = child_to_pwm_device(child);
+	struct pwm_export *export = child_to_pwm_export(child);
+	struct pwm_device *pwm = export->pwm;
 	int val, ret;
 
 	ret = kstrtoint(buf, 0, &val);
 	if (ret)
 		return ret;
 
+	mutex_lock(&export->lock);
+
 	switch (val) {
 	case 0:
 		pwm_disable(pwm);
@@ -124,6 +134,8 @@ static ssize_t enable_store(struct device *child,
 		break;
 	}
 
+	mutex_unlock(&export->lock);
+
 	return ret ? : size;
 }
 
@@ -151,7 +163,8 @@ static ssize_t polarity_store(struct device *child,
 			      struct device_attribute *attr,
 			      const char *buf, size_t size)
 {
-	struct pwm_device *pwm = child_to_pwm_device(child);
+	struct pwm_export *export = child_to_pwm_export(child);
+	struct pwm_device *pwm = export->pwm;
 	enum pwm_polarity polarity;
 	int ret;
 
@@ -162,7 +175,9 @@ static ssize_t polarity_store(struct device *child,
 	else
 		return -EINVAL;
 
+	mutex_lock(&export->lock);
 	ret = pwm_set_polarity(pwm, polarity);
+	mutex_unlock(&export->lock);
 
 	return ret ? : size;
 }
@@ -203,6 +218,7 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm)
 	}
 
 	export->pwm = pwm;
+	mutex_init(&export->lock);
 
 	export->child.release = pwm_export_release;
 	export->child.parent = parent;
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index b78d27c42629..d2e7430ccedb 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -106,7 +106,6 @@ enum {
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @chip_data: chip-private data associated with the PWM device
- * @lock: used to serialize accesses to the PWM device where necessary
  * @period: period of the PWM signal (in nanoseconds)
  * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
  * @polarity: polarity of the PWM signal
@@ -119,7 +118,6 @@ struct pwm_device {
 	unsigned int pwm;
 	struct pwm_chip *chip;
 	void *chip_data;
-	struct mutex lock;
 
 	unsigned int period;
 	unsigned int duty_cycle;
-- 
cgit v1.2.3


From a8c3862551e063344f80c3e05d595f9d8836f355 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 14 Apr 2016 21:17:37 +0200
Subject: pwm: Keep PWM state in sync with hardware state

Before the introduction of pwm_args, the core was resetting the PWM
period and polarity states to the reference values (those provided
through the DT, a PWM lookup table or hardcoded in the driver).

Now that all PWM users are correctly using pwm_args to configure their
PWM device, we can safely remove the pwm_apply_args() call in pwm_get()
and of_pwm_get().

We can also get rid of the pwm_set_period() call in pwm_apply_args(),
because PWM users are now directly using pargs->period instead of
pwm_get_period(). By doing that we avoid messing with the current PWM
period.

The only remaining bit in pwm_apply_args() is the initial polarity
setting, and it should go away when all PWM users have been patched to
use the atomic API (with this API the polarity will be set along with
other PWM arguments when configuring the PWM).

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 14 --------------
 include/linux/pwm.h |  1 -
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index cb762cf51332..64330595e17b 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -609,13 +609,6 @@ struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id)
 
 	pwm->label = con_id;
 
-	/*
-	 * FIXME: This should be removed once all PWM users properly make use
-	 * of struct pwm_args to initialize the PWM device. As long as this is
-	 * here, the PWM state and hardware state can get out of sync.
-	 */
-	pwm_apply_args(pwm);
-
 put:
 	of_node_put(args.np);
 
@@ -750,13 +743,6 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 	pwm->args.period = chosen->period;
 	pwm->args.polarity = chosen->polarity;
 
-	/*
-	 * FIXME: This should be removed once all PWM users properly make use
-	 * of struct pwm_args to initialize the PWM device. As long as this is
-	 * here, the PWM state and hardware state can get out of sync.
-	 */
-	pwm_apply_args(pwm);
-
 out:
 	mutex_unlock(&pwm_lookup_lock);
 	return pwm;
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d2e7430ccedb..7caf549f720e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -171,7 +171,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
 
 static inline void pwm_apply_args(struct pwm_device *pwm)
 {
-	pwm_set_period(pwm, pwm->args.period);
 	pwm_set_polarity(pwm, pwm->args.polarity);
 }
 
-- 
cgit v1.2.3


From 43a276b003ed2e03de9d94b02a1ba49c1849b931 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 14 Apr 2016 21:17:38 +0200
Subject: pwm: Introduce the pwm_state concept

The PWM state, represented by its period, duty_cycle and polarity is
currently directly stored in the PWM device. Declare a pwm_state
structure embedding those field so that we can later use this struct
to atomically update all the PWM parameters at once.

All pwm_get_xxx() helpers are now implemented as wrappers around
pwm_get_state().

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  |  8 ++++----
 include/linux/pwm.h | 54 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 46 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 64330595e17b..f3f91e716a42 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -268,7 +268,7 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 		pwm->chip = chip;
 		pwm->pwm = chip->base + i;
 		pwm->hwpwm = i;
-		pwm->polarity = polarity;
+		pwm->state.polarity = polarity;
 
 		radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
 	}
@@ -446,8 +446,8 @@ int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
 	if (err)
 		return err;
 
-	pwm->duty_cycle = duty_ns;
-	pwm->period = period_ns;
+	pwm->state.duty_cycle = duty_ns;
+	pwm->state.period = period_ns;
 
 	return 0;
 }
@@ -480,7 +480,7 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
 	if (err)
 		return err;
 
-	pwm->polarity = polarity;
+	pwm->state.polarity = polarity;
 
 	return 0;
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 7caf549f720e..51d4005418f9 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -98,6 +98,18 @@ enum {
 	PWMF_EXPORTED = 1 << 2,
 };
 
+/*
+ * struct pwm_state - state of a PWM channel
+ * @period: PWM period (in nanoseconds)
+ * @duty_cycle: PWM duty cycle (in nanoseconds)
+ * @polarity: PWM polarity
+ */
+struct pwm_state {
+	unsigned int period;
+	unsigned int duty_cycle;
+	enum pwm_polarity polarity;
+};
+
 /**
  * struct pwm_device - PWM channel object
  * @label: name of the PWM device
@@ -106,10 +118,8 @@ enum {
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @chip_data: chip-private data associated with the PWM device
- * @period: period of the PWM signal (in nanoseconds)
- * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
- * @polarity: polarity of the PWM signal
  * @args: PWM arguments
+ * @state: curent PWM channel state
  */
 struct pwm_device {
 	const char *label;
@@ -119,13 +129,21 @@ struct pwm_device {
 	struct pwm_chip *chip;
 	void *chip_data;
 
-	unsigned int period;
-	unsigned int duty_cycle;
-	enum pwm_polarity polarity;
-
 	struct pwm_args args;
+	struct pwm_state state;
 };
 
+/**
+ * pwm_get_state() - retrieve the current PWM state
+ * @pwm: PWM device
+ * @state: state to fill with the current PWM state
+ */
+static inline void pwm_get_state(const struct pwm_device *pwm,
+				 struct pwm_state *state)
+{
+	*state = pwm->state;
+}
+
 static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 {
 	return test_bit(PWMF_ENABLED, &pwm->flags);
@@ -134,23 +152,31 @@ static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 {
 	if (pwm)
-		pwm->period = period;
+		pwm->state.period = period;
 }
 
 static inline unsigned int pwm_get_period(const struct pwm_device *pwm)
 {
-	return pwm ? pwm->period : 0;
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return state.period;
 }
 
 static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
 {
 	if (pwm)
-		pwm->duty_cycle = duty;
+		pwm->state.duty_cycle = duty;
 }
 
 static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
-	return pwm ? pwm->duty_cycle : 0;
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return state.duty_cycle;
 }
 
 /*
@@ -160,7 +186,11 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
 
 static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
 {
-	return pwm ? pwm->polarity : PWM_POLARITY_NORMAL;
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return state.polarity;
 }
 
 static inline void pwm_get_args(const struct pwm_device *pwm,
-- 
cgit v1.2.3


From 09a7e4a3d9fcb95ade2cb02167e85fbeb8315ce0 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 14 Apr 2016 21:17:39 +0200
Subject: pwm: Move the enabled/disabled info into pwm_state

Prepare the transition to PWM atomic update by moving the enabled and
disabled state into the pwm_state struct. This way we can easily update
the whole PWM state by copying the new state in the ->state field.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 13 +++++++++----
 include/linux/pwm.h | 11 ++++++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index f3f91e716a42..c240b5437145 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -499,10 +499,10 @@ int pwm_enable(struct pwm_device *pwm)
 	if (!pwm)
 		return -EINVAL;
 
-	if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) {
+	if (!pwm_is_enabled(pwm)) {
 		err = pwm->chip->ops->enable(pwm->chip, pwm);
-		if (err)
-			clear_bit(PWMF_ENABLED, &pwm->flags);
+		if (!err)
+			pwm->state.enabled = true;
 	}
 
 	return err;
@@ -515,8 +515,13 @@ EXPORT_SYMBOL_GPL(pwm_enable);
  */
 void pwm_disable(struct pwm_device *pwm)
 {
-	if (pwm && test_and_clear_bit(PWMF_ENABLED, &pwm->flags))
+	if (!pwm)
+		return;
+
+	if (pwm_is_enabled(pwm)) {
 		pwm->chip->ops->disable(pwm->chip, pwm);
+		pwm->state.enabled = false;
+	}
 }
 EXPORT_SYMBOL_GPL(pwm_disable);
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 51d4005418f9..150563a806d6 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -94,8 +94,7 @@ struct pwm_args {
 
 enum {
 	PWMF_REQUESTED = 1 << 0,
-	PWMF_ENABLED = 1 << 1,
-	PWMF_EXPORTED = 1 << 2,
+	PWMF_EXPORTED = 1 << 1,
 };
 
 /*
@@ -103,11 +102,13 @@ enum {
  * @period: PWM period (in nanoseconds)
  * @duty_cycle: PWM duty cycle (in nanoseconds)
  * @polarity: PWM polarity
+ * @enabled: PWM enabled status
  */
 struct pwm_state {
 	unsigned int period;
 	unsigned int duty_cycle;
 	enum pwm_polarity polarity;
+	bool enabled;
 };
 
 /**
@@ -146,7 +147,11 @@ static inline void pwm_get_state(const struct pwm_device *pwm,
 
 static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 {
-	return test_bit(PWMF_ENABLED, &pwm->flags);
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return state.enabled;
 }
 
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
-- 
cgit v1.2.3


From 15fa8a43c147213a9563903c87b29671035eb6e8 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 14 Apr 2016 21:17:40 +0200
Subject: pwm: Add hardware readout infrastructure

Add a ->get_state() function to the pwm_ops struct to let PWM drivers
initialize the PWM state attached to a PWM device.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  |  3 +++
 include/linux/pwm.h | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index c240b5437145..a909c64ee863 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -270,6 +270,9 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 		pwm->hwpwm = i;
 		pwm->state.polarity = polarity;
 
+		if (chip->ops->get_state)
+			chip->ops->get_state(chip, pwm, &pwm->state);
+
 		radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
 	}
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 150563a806d6..33f8decd9f38 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -206,6 +206,29 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
 
 static inline void pwm_apply_args(struct pwm_device *pwm)
 {
+	/*
+	 * PWM users calling pwm_apply_args() expect to have a fresh config
+	 * where the polarity and period are set according to pwm_args info.
+	 * The problem is, polarity can only be changed when the PWM is
+	 * disabled.
+	 *
+	 * PWM drivers supporting hardware readout may declare the PWM device
+	 * as enabled, and prevent polarity setting, which changes from the
+	 * existing behavior, where all PWM devices are declared as disabled
+	 * at startup (even if they are actually enabled), thus authorizing
+	 * polarity setting.
+	 *
+	 * Instead of setting ->enabled to false, we call pwm_disable()
+	 * before pwm_set_polarity() to ensure that everything is configured
+	 * as expected, and the PWM is really disabled when the user request
+	 * it.
+	 *
+	 * Note that PWM users requiring a smooth handover between the
+	 * bootloader and the kernel (like critical regulators controlled by
+	 * PWM devices) will have to switch to the atomic API and avoid calling
+	 * pwm_apply_args().
+	 */
+	pwm_disable(pwm);
 	pwm_set_polarity(pwm, pwm->args.polarity);
 }
 
@@ -217,6 +240,9 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
  * @set_polarity: configure the polarity of this PWM
  * @enable: enable PWM output toggling
  * @disable: disable PWM output toggling
+ * @get_state: get the current PWM state. This function is only
+ *	       called once per PWM device when the PWM chip is
+ *	       registered.
  * @dbg_show: optional routine to show contents in debugfs
  * @owner: helps prevent removal of modules exporting active PWMs
  */
@@ -229,6 +255,8 @@ struct pwm_ops {
 			    enum pwm_polarity polarity);
 	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
 	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
+	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
+			  struct pwm_state *state);
 #ifdef CONFIG_DEBUG_FS
 	void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
 #endif
-- 
cgit v1.2.3


From 5ec803edcb703fe379836f13560b79dfac79b01d Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 14 Apr 2016 21:17:41 +0200
Subject: pwm: Add core infrastructure to allow atomic updates

Add an ->apply() method to the pwm_ops struct to allow PWM drivers to
implement atomic updates. This method is preferred over the ->enable(),
->disable() and ->config() methods if available.

Add the pwm_apply_state() function to the PWM user API.

Note that the pwm_apply_state() does not guarantee the atomicity of the
update operation, it all depends on the availability and implementation
of the ->apply() method.

pwm_enable/disable/set_polarity/config() are now implemented as wrappers
around the pwm_apply_state() function.

pwm_adjust_config() is allowing smooth handover between the bootloader
and the kernel. This function tries to adapt the current PWM state to
the PWM arguments coming from a PWM lookup table or a DT definition
without changing the duty_cycle/period proportion.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
[thierry.reding@gmail.com: fix a couple of typos]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 187 +++++++++++++++++++++++-------------
 include/linux/pwm.h | 269 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 303 insertions(+), 153 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index a909c64ee863..729d457861fd 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -226,6 +226,19 @@ void *pwm_get_chip_data(struct pwm_device *pwm)
 }
 EXPORT_SYMBOL_GPL(pwm_get_chip_data);
 
+static bool pwm_ops_check(const struct pwm_ops *ops)
+{
+	/* driver supports legacy, non-atomic operation */
+	if (ops->config && ops->enable && ops->disable)
+		return true;
+
+	/* driver supports atomic operation */
+	if (ops->apply)
+		return true;
+
+	return false;
+}
+
 /**
  * pwmchip_add_with_polarity() - register a new PWM chip
  * @chip: the PWM chip to add
@@ -244,8 +257,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 	unsigned int i;
 	int ret;
 
-	if (!chip || !chip->dev || !chip->ops || !chip->ops->config ||
-	    !chip->ops->enable || !chip->ops->disable || !chip->npwm)
+	if (!chip || !chip->dev || !chip->ops || !chip->npwm)
+		return -EINVAL;
+
+	if (!pwm_ops_check(chip->ops))
 		return -EINVAL;
 
 	mutex_lock(&pwm_lock);
@@ -431,102 +446,138 @@ void pwm_free(struct pwm_device *pwm)
 EXPORT_SYMBOL_GPL(pwm_free);
 
 /**
- * pwm_config() - change a PWM device configuration
+ * pwm_apply_state() - atomically apply a new state to a PWM device
  * @pwm: PWM device
- * @duty_ns: "on" time (in nanoseconds)
- * @period_ns: duration (in nanoseconds) of one cycle
- *
- * Returns: 0 on success or a negative error code on failure.
+ * @state: new state to apply. This can be adjusted by the PWM driver
+ *	   if the requested config is not achievable, for example,
+ *	   ->duty_cycle and ->period might be approximated.
  */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state)
 {
 	int err;
 
-	if (!pwm || duty_ns < 0 || period_ns <= 0 || duty_ns > period_ns)
+	if (!pwm)
 		return -EINVAL;
 
-	err = pwm->chip->ops->config(pwm->chip, pwm, duty_ns, period_ns);
-	if (err)
-		return err;
-
-	pwm->state.duty_cycle = duty_ns;
-	pwm->state.period = period_ns;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pwm_config);
+	if (!memcmp(state, &pwm->state, sizeof(*state)))
+		return 0;
 
-/**
- * pwm_set_polarity() - configure the polarity of a PWM signal
- * @pwm: PWM device
- * @polarity: new polarity of the PWM signal
- *
- * Note that the polarity cannot be configured while the PWM device is
- * enabled.
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
-{
-	int err;
+	if (pwm->chip->ops->apply) {
+		err = pwm->chip->ops->apply(pwm->chip, pwm, state);
+		if (err)
+			return err;
 
-	if (!pwm || !pwm->chip->ops)
-		return -EINVAL;
+		pwm->state = *state;
+	} else {
+		/*
+		 * FIXME: restore the initial state in case of error.
+		 */
+		if (state->polarity != pwm->state.polarity) {
+			if (!pwm->chip->ops->set_polarity)
+				return -ENOTSUPP;
+
+			/*
+			 * Changing the polarity of a running PWM is
+			 * only allowed when the PWM driver implements
+			 * ->apply().
+			 */
+			if (pwm->state.enabled) {
+				pwm->chip->ops->disable(pwm->chip, pwm);
+				pwm->state.enabled = false;
+			}
+
+			err = pwm->chip->ops->set_polarity(pwm->chip, pwm,
+							   state->polarity);
+			if (err)
+				return err;
+
+			pwm->state.polarity = state->polarity;
+		}
 
-	if (!pwm->chip->ops->set_polarity)
-		return -ENOSYS;
+		if (state->period != pwm->state.period ||
+		    state->duty_cycle != pwm->state.duty_cycle) {
+			err = pwm->chip->ops->config(pwm->chip, pwm,
+						     state->duty_cycle,
+						     state->period);
+			if (err)
+				return err;
 
-	if (pwm_is_enabled(pwm))
-		return -EBUSY;
+			pwm->state.duty_cycle = state->duty_cycle;
+			pwm->state.period = state->period;
+		}
 
-	err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
-	if (err)
-		return err;
+		if (state->enabled != pwm->state.enabled) {
+			if (state->enabled) {
+				err = pwm->chip->ops->enable(pwm->chip, pwm);
+				if (err)
+					return err;
+			} else {
+				pwm->chip->ops->disable(pwm->chip, pwm);
+			}
 
-	pwm->state.polarity = polarity;
+			pwm->state.enabled = state->enabled;
+		}
+	}
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pwm_set_polarity);
+EXPORT_SYMBOL_GPL(pwm_apply_state);
 
 /**
- * pwm_enable() - start a PWM output toggling
+ * pwm_adjust_config() - adjust the current PWM config to the PWM arguments
  * @pwm: PWM device
  *
- * Returns: 0 on success or a negative error code on failure.
+ * This function will adjust the PWM config to the PWM arguments provided
+ * by the DT or PWM lookup table. This is particularly useful to adapt
+ * the bootloader config to the Linux one.
  */
-int pwm_enable(struct pwm_device *pwm)
+int pwm_adjust_config(struct pwm_device *pwm)
 {
-	int err = 0;
+	struct pwm_state state;
+	struct pwm_args pargs;
 
-	if (!pwm)
-		return -EINVAL;
+	pwm_get_args(pwm, &pargs);
+	pwm_get_state(pwm, &state);
+
+	/*
+	 * If the current period is zero it means that either the PWM driver
+	 * does not support initial state retrieval or the PWM has not yet
+	 * been configured.
+	 *
+	 * In either case, we setup the new period and polarity, and assign a
+	 * duty cycle of 0.
+	 */
+	if (!state.period) {
+		state.duty_cycle = 0;
+		state.period = pargs.period;
+		state.polarity = pargs.polarity;
 
-	if (!pwm_is_enabled(pwm)) {
-		err = pwm->chip->ops->enable(pwm->chip, pwm);
-		if (!err)
-			pwm->state.enabled = true;
+		return pwm_apply_state(pwm, &state);
 	}
 
-	return err;
-}
-EXPORT_SYMBOL_GPL(pwm_enable);
+	/*
+	 * Adjust the PWM duty cycle/period based on the period value provided
+	 * in PWM args.
+	 */
+	if (pargs.period != state.period) {
+		u64 dutycycle = (u64)state.duty_cycle * pargs.period;
 
-/**
- * pwm_disable() - stop a PWM output toggling
- * @pwm: PWM device
- */
-void pwm_disable(struct pwm_device *pwm)
-{
-	if (!pwm)
-		return;
+		do_div(dutycycle, state.period);
+		state.duty_cycle = dutycycle;
+		state.period = pargs.period;
+	}
 
-	if (pwm_is_enabled(pwm)) {
-		pwm->chip->ops->disable(pwm->chip, pwm);
-		pwm->state.enabled = false;
+	/*
+	 * If the polarity changed, we should also change the duty cycle.
+	 */
+	if (pargs.polarity != state.polarity) {
+		state.polarity = pargs.polarity;
+		state.duty_cycle = state.period - state.duty_cycle;
 	}
+
+	return pwm_apply_state(pwm, &state);
 }
-EXPORT_SYMBOL_GPL(pwm_disable);
+EXPORT_SYMBOL_GPL(pwm_adjust_config);
 
 static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 33f8decd9f38..17018f3c066e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -5,59 +5,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 
-struct pwm_device;
 struct seq_file;
-
-#if IS_ENABLED(CONFIG_PWM)
-/*
- * pwm_request - request a PWM device
- */
-struct pwm_device *pwm_request(int pwm_id, const char *label);
-
-/*
- * pwm_free - free a PWM device
- */
-void pwm_free(struct pwm_device *pwm);
-
-/*
- * pwm_config - change a PWM device configuration
- */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
-
-/*
- * pwm_enable - start a PWM output toggling
- */
-int pwm_enable(struct pwm_device *pwm);
-
-/*
- * pwm_disable - stop a PWM output toggling
- */
-void pwm_disable(struct pwm_device *pwm);
-#else
-static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
-{
-	return ERR_PTR(-ENODEV);
-}
-
-static inline void pwm_free(struct pwm_device *pwm)
-{
-}
-
-static inline int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
-{
-	return -EINVAL;
-}
-
-static inline int pwm_enable(struct pwm_device *pwm)
-{
-	return -EINVAL;
-}
-
-static inline void pwm_disable(struct pwm_device *pwm)
-{
-}
-#endif
-
 struct pwm_chip;
 
 /**
@@ -184,11 +132,6 @@ static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 	return state.duty_cycle;
 }
 
-/*
- * pwm_set_polarity - configure the polarity of a PWM signal
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
-
 static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
 {
 	struct pwm_state state;
@@ -204,34 +147,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
 	*args = pwm->args;
 }
 
-static inline void pwm_apply_args(struct pwm_device *pwm)
-{
-	/*
-	 * PWM users calling pwm_apply_args() expect to have a fresh config
-	 * where the polarity and period are set according to pwm_args info.
-	 * The problem is, polarity can only be changed when the PWM is
-	 * disabled.
-	 *
-	 * PWM drivers supporting hardware readout may declare the PWM device
-	 * as enabled, and prevent polarity setting, which changes from the
-	 * existing behavior, where all PWM devices are declared as disabled
-	 * at startup (even if they are actually enabled), thus authorizing
-	 * polarity setting.
-	 *
-	 * Instead of setting ->enabled to false, we call pwm_disable()
-	 * before pwm_set_polarity() to ensure that everything is configured
-	 * as expected, and the PWM is really disabled when the user request
-	 * it.
-	 *
-	 * Note that PWM users requiring a smooth handover between the
-	 * bootloader and the kernel (like critical regulators controlled by
-	 * PWM devices) will have to switch to the atomic API and avoid calling
-	 * pwm_apply_args().
-	 */
-	pwm_disable(pwm);
-	pwm_set_polarity(pwm, pwm->args.polarity);
-}
-
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
@@ -240,6 +155,10 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
  * @set_polarity: configure the polarity of this PWM
  * @enable: enable PWM output toggling
  * @disable: disable PWM output toggling
+ * @apply: atomically apply a new PWM config. The state argument
+ *	   should be adjusted with the real hardware config (if the
+ *	   approximate the period or duty_cycle value, state should
+ *	   reflect it)
  * @get_state: get the current PWM state. This function is only
  *	       called once per PWM device when the PWM chip is
  *	       registered.
@@ -255,6 +174,8 @@ struct pwm_ops {
 			    enum pwm_polarity polarity);
 	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
 	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
+	int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
+		     struct pwm_state *state);
 	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			  struct pwm_state *state);
 #ifdef CONFIG_DEBUG_FS
@@ -292,6 +213,115 @@ struct pwm_chip {
 };
 
 #if IS_ENABLED(CONFIG_PWM)
+/* PWM user APIs */
+struct pwm_device *pwm_request(int pwm_id, const char *label);
+void pwm_free(struct pwm_device *pwm);
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+int pwm_adjust_config(struct pwm_device *pwm);
+
+/**
+ * pwm_config() - change a PWM device configuration
+ * @pwm: PWM device
+ * @duty_ns: "on" time (in nanoseconds)
+ * @period_ns: duration (in nanoseconds) of one cycle
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+			     int period_ns)
+{
+	struct pwm_state state;
+
+	if (!pwm)
+		return -EINVAL;
+
+	pwm_get_state(pwm, &state);
+	if (state.duty_cycle == duty_ns && state.period == period_ns)
+		return 0;
+
+	state.duty_cycle = duty_ns;
+	state.period = period_ns;
+	return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_set_polarity() - configure the polarity of a PWM signal
+ * @pwm: PWM device
+ * @polarity: new polarity of the PWM signal
+ *
+ * Note that the polarity cannot be configured while the PWM device is
+ * enabled.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+				   enum pwm_polarity polarity)
+{
+	struct pwm_state state;
+
+	if (!pwm)
+		return -EINVAL;
+
+	pwm_get_state(pwm, &state);
+	if (state.polarity == polarity)
+		return 0;
+
+	/*
+	 * Changing the polarity of a running PWM without adjusting the
+	 * dutycycle/period value is a bit risky (can introduce glitches).
+	 * Return -EBUSY in this case.
+	 * Note that this is allowed when using pwm_apply_state() because
+	 * the user specifies all the parameters.
+	 */
+	if (state.enabled)
+		return -EBUSY;
+
+	state.polarity = polarity;
+	return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_enable() - start a PWM output toggling
+ * @pwm: PWM device
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+	struct pwm_state state;
+
+	if (!pwm)
+		return -EINVAL;
+
+	pwm_get_state(pwm, &state);
+	if (state.enabled)
+		return 0;
+
+	state.enabled = true;
+	return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_disable() - stop a PWM output toggling
+ * @pwm: PWM device
+ */
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+	struct pwm_state state;
+
+	if (!pwm)
+		return;
+
+	pwm_get_state(pwm, &state);
+	if (!state.enabled)
+		return;
+
+	state.enabled = false;
+	pwm_apply_state(pwm, &state);
+}
+
+
+/* PWM provider APIs */
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
@@ -317,6 +347,47 @@ void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
 
 bool pwm_can_sleep(struct pwm_device *pwm);
 #else
+static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline void pwm_free(struct pwm_device *pwm)
+{
+}
+
+static inline int pwm_apply_state(struct pwm_device *pwm,
+				  const struct pwm_state *state)
+{
+	return -ENOTSUPP;
+}
+
+static inline int pwm_adjust_config(struct pwm_device *pwm)
+{
+	return -ENOTSUPP;
+}
+
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+			     int period_ns)
+{
+	return -EINVAL;
+}
+
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+				   enum pwm_polarity polarity)
+{
+	return -ENOTSUPP;
+}
+
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+	return -EINVAL;
+}
+
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+}
+
 static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
 {
 	return -EINVAL;
@@ -388,6 +459,34 @@ static inline bool pwm_can_sleep(struct pwm_device *pwm)
 }
 #endif
 
+static inline void pwm_apply_args(struct pwm_device *pwm)
+{
+	/*
+	 * PWM users calling pwm_apply_args() expect to have a fresh config
+	 * where the polarity and period are set according to pwm_args info.
+	 * The problem is, polarity can only be changed when the PWM is
+	 * disabled.
+	 *
+	 * PWM drivers supporting hardware readout may declare the PWM device
+	 * as enabled, and prevent polarity setting, which changes from the
+	 * existing behavior, where all PWM devices are declared as disabled
+	 * at startup (even if they are actually enabled), thus authorizing
+	 * polarity setting.
+	 *
+	 * Instead of setting ->enabled to false, we call pwm_disable()
+	 * before pwm_set_polarity() to ensure that everything is configured
+	 * as expected, and the PWM is really disabled when the user request
+	 * it.
+	 *
+	 * Note that PWM users requiring a smooth handover between the
+	 * bootloader and the kernel (like critical regulators controlled by
+	 * PWM devices) will have to switch to the atomic API and avoid calling
+	 * pwm_apply_args().
+	 */
+	pwm_disable(pwm);
+	pwm_set_polarity(pwm, pwm->args.polarity);
+}
+
 struct pwm_lookup {
 	struct list_head list;
 	const char *provider;
-- 
cgit v1.2.3


From c35095215a80b13af2f521e1ab1c727a5224e53b Mon Sep 17 00:00:00 2001
From: Wei Ni <wni@nvidia.com>
Date: Tue, 29 Mar 2016 18:29:17 +0800
Subject: thermal: of-thermal: allow setting trip_temp on hardware

In current of-thermal, the .set_trip_temp only support to
set trip_temp for SW. But some sensors support to set
trip_temp on hardware, so that can trigger interrupt,
shutdown or any other events.
This patch adds .set_trip_temp() callback in
thermal_zone_of_device_ops{}, so that the sensor device can
use it to set trip_temp on hardware.

Signed-off-by: Wei Ni <wni@nvidia.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 drivers/thermal/of-thermal.c | 8 ++++++++
 include/linux/thermal.h      | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index 82dd82afa555..b8e509c60848 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -331,6 +331,14 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
 	if (trip >= data->ntrips || trip < 0)
 		return -EDOM;
 
+	if (data->ops->set_trip_temp) {
+		int ret;
+
+		ret = data->ops->set_trip_temp(data->sensor_data, trip, temp);
+		if (ret)
+			return ret;
+	}
+
 	/* thermal framework should take care of data->mask & (1 << trip) */
 	data->trips[trip].temperature = temp;
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1b8a5a7876ce..e45abe7db9a6 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -340,6 +340,7 @@ struct thermal_zone_of_device_ops {
 	int (*get_temp)(void *, int *);
 	int (*get_trend)(void *, long *);
 	int (*set_emul_temp)(void *, int);
+	int (*set_trip_temp)(void *, int, int);
 };
 
 /**
-- 
cgit v1.2.3


From 7ff2760999a86e4d2b1af93dcf0f0d336c309571 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 16 May 2016 15:35:24 +0300
Subject: mmc: core: Add a facility to "pause" re-tuning

Re-tuning is not possible when switched to the RPMB
partition.  However re-tuning should not be needed
if re-tuning is done immediately before switching,
a small set of operations is done, and then we
immediately switch back to the main partition.

To ensure that re-tuning can't be done for a short
while, add a facility to "pause" re-tuning.

The existing facility to hold / release re-tuning
is used but it also flags re-tuning as needed to cause
re-tuning before the next command (which will be the
switch to RPMB).

We also need to "unpause" in the recovery path, which
is catered for by adding it to mmc_retune_disable().

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/host.c  | 24 ++++++++++++++++++++++++
 include/linux/mmc/host.h |  4 ++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index e0a3ee16c0d3..1be42fab1a30 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -68,8 +68,32 @@ void mmc_retune_enable(struct mmc_host *host)
 			  jiffies + host->retune_period * HZ);
 }
 
+/*
+ * Pause re-tuning for a small set of operations.  The pause begins after the
+ * next command and after first doing re-tuning.
+ */
+void mmc_retune_pause(struct mmc_host *host)
+{
+	if (!host->retune_paused) {
+		host->retune_paused = 1;
+		mmc_retune_needed(host);
+		mmc_retune_hold(host);
+	}
+}
+EXPORT_SYMBOL(mmc_retune_pause);
+
+void mmc_retune_unpause(struct mmc_host *host)
+{
+	if (host->retune_paused) {
+		host->retune_paused = 0;
+		mmc_retune_release(host);
+	}
+}
+EXPORT_SYMBOL(mmc_retune_unpause);
+
 void mmc_retune_disable(struct mmc_host *host)
 {
+	mmc_retune_unpause(host);
 	host->can_retune = 0;
 	del_timer_sync(&host->retune_timer);
 	host->retune_now = 0;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 85800b48241f..45cde8cd39f2 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -329,6 +329,7 @@ struct mmc_host {
 	unsigned int		can_retune:1;	/* re-tuning can be used */
 	unsigned int		doing_retune:1;	/* re-tuning in progress */
 	unsigned int		retune_now:1;	/* do re-tuning at next req */
+	unsigned int		retune_paused:1; /* re-tuning is temporarily disabled */
 
 	int			rescan_disable;	/* disable card detection */
 	int			rescan_entered;	/* used with nonremovable devices */
@@ -526,4 +527,7 @@ static inline void mmc_retune_recheck(struct mmc_host *host)
 		host->retune_now = 1;
 }
 
+void mmc_retune_pause(struct mmc_host *host);
+void mmc_retune_unpause(struct mmc_host *host);
+
 #endif /* LINUX_MMC_HOST_H */
-- 
cgit v1.2.3


From 39651abd28146fff2bfac63d68a7a56250a4aead Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Date: Tue, 17 May 2016 06:44:26 -0400
Subject: qed: add support for dcbx.

This patch adds the necessary driver support for Management Firmware to
configure the device/firmware with the dcbx results. Management Firmware
is responsible for communicating the DCBX and driving the negotiation,
but the driver has responsibility of receiving async notification and
configuring the results in hw/fw. This patch also adds the dcbx support for
future protocols (e.g., FCoE) as preparation to their imminent submission.

Signed-off-by: Sudarsana Reddy Kalluru <sudarsana.kalluru@qlogic.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile          |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h             |   2 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |  10 +
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c        | 562 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_dcbx.h        |  80 +++
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 105 +++-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  21 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |  13 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  13 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  25 +
 include/linux/qed/qed_if.h                        |   9 +
 11 files changed, 834 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index a44874562cfd..d1f157e439cf 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -2,5 +2,5 @@ obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
-	 qed_selftest.o
+	 qed_selftest.o qed_dcbx.o
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 77323fc70927..1042f2af854a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -367,6 +367,8 @@ struct qed_hwfn {
 	struct qed_pf_iov		*pf_iov_info;
 	struct qed_mcp_info		*mcp_info;
 
+	struct qed_dcbx_info		*p_dcbx_info;
+
 	struct qed_hw_cid_data		*p_tx_cids;
 	struct qed_hw_cid_data		*p_rx_cids;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index 078ff3fd7920..234c0fa8db2a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -130,6 +130,16 @@ void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn);
 
 void qed_qm_init_pf(struct qed_hwfn *p_hwfn);
 
+/**
+ * @brief Reconfigures QM pf on the fly
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ *
+ * @return int
+ */
+int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
 /**
  * @brief qed_cxt_release - Release a cid
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
new file mode 100644
index 000000000000..cbf58e1f9333
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -0,0 +1,562 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "qed.h"
+#include "qed_cxt.h"
+#include "qed_dcbx.h"
+#include "qed_hsi.h"
+#include "qed_sp.h"
+
+#define QED_DCBX_MAX_MIB_READ_TRY       (100)
+#define QED_ETH_TYPE_DEFAULT            (0)
+#define QED_ETH_TYPE_ROCE               (0x8915)
+#define QED_UDP_PORT_TYPE_ROCE_V2       (0x12B7)
+#define QED_ETH_TYPE_FCOE               (0x8906)
+#define QED_TCP_PORT_ISCSI              (0xCBC)
+
+#define QED_DCBX_INVALID_PRIORITY       0xFF
+
+/* Get Traffic Class from priority traffic class table, 4 bits represent
+ * the traffic class corresponding to the priority.
+ */
+#define QED_DCBX_PRIO2TC(prio_tc_tbl, prio) \
+	((u32)(prio_tc_tbl >> ((7 - prio) * 4)) & 0x7)
+
+static const struct qed_dcbx_app_metadata qed_dcbx_app_update[] = {
+	{DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_DEFAULT},
+	{DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH}
+};
+
+static bool qed_dcbx_app_ethtype(u32 app_info_bitmap)
+{
+	return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) ==
+		  DCBX_APP_SF_ETHTYPE);
+}
+
+static bool qed_dcbx_app_port(u32 app_info_bitmap)
+{
+	return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) ==
+		  DCBX_APP_SF_PORT);
+}
+
+static bool qed_dcbx_default_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_DEFAULT);
+}
+
+static bool qed_dcbx_iscsi_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_port(app_info_bitmap) &&
+		  proto_id == QED_TCP_PORT_ISCSI);
+}
+
+static bool qed_dcbx_fcoe_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_FCOE);
+}
+
+static bool qed_dcbx_roce_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_ethtype(app_info_bitmap) &&
+		  proto_id == QED_ETH_TYPE_ROCE);
+}
+
+static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id)
+{
+	return !!(qed_dcbx_app_port(app_info_bitmap) &&
+		  proto_id == QED_UDP_PORT_TYPE_ROCE_V2);
+}
+
+static void
+qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data)
+{
+	enum dcbx_protocol_type id;
+	int i;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "DCBX negotiated: %d\n",
+		   p_data->dcbx_enabled);
+
+	for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) {
+		id = qed_dcbx_app_update[i].id;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_DCB,
+			   "%s info: update %d, enable %d, prio %d, tc %d, num_tc %d\n",
+			   qed_dcbx_app_update[i].name, p_data->arr[id].update,
+			   p_data->arr[id].enable, p_data->arr[id].priority,
+			   p_data->arr[id].tc, p_hwfn->hw_info.num_tc);
+	}
+}
+
+static void
+qed_dcbx_set_params(struct qed_dcbx_results *p_data,
+		    struct qed_hw_info *p_info,
+		    bool enable,
+		    bool update,
+		    u8 prio,
+		    u8 tc,
+		    enum dcbx_protocol_type type,
+		    enum qed_pci_personality personality)
+{
+	/* PF update ramrod data */
+	p_data->arr[type].update = update;
+	p_data->arr[type].enable = enable;
+	p_data->arr[type].priority = prio;
+	p_data->arr[type].tc = tc;
+
+	/* QM reconf data */
+	if (p_info->personality == personality) {
+		if (personality == QED_PCI_ETH)
+			p_info->non_offload_tc = tc;
+		else
+			p_info->offload_tc = tc;
+	}
+}
+
+/* Update app protocol data and hw_info fields with the TLV info */
+static void
+qed_dcbx_update_app_info(struct qed_dcbx_results *p_data,
+			 struct qed_hwfn *p_hwfn,
+			 bool enable,
+			 bool update,
+			 u8 prio, u8 tc, enum dcbx_protocol_type type)
+{
+	struct qed_hw_info *p_info = &p_hwfn->hw_info;
+	enum qed_pci_personality personality;
+	enum dcbx_protocol_type id;
+	char *name;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) {
+		id = qed_dcbx_app_update[i].id;
+
+		if (type != id)
+			continue;
+
+		personality = qed_dcbx_app_update[i].personality;
+		name = qed_dcbx_app_update[i].name;
+
+		qed_dcbx_set_params(p_data, p_info, enable, update,
+				    prio, tc, type, personality);
+	}
+}
+
+static bool
+qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn,
+			       u32 app_prio_bitmap,
+			       u16 id, enum dcbx_protocol_type *type)
+{
+	if (qed_dcbx_fcoe_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_FCOE;
+	} else if (qed_dcbx_roce_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ROCE;
+	} else if (qed_dcbx_iscsi_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ISCSI;
+	} else if (qed_dcbx_default_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ETH;
+	} else if (qed_dcbx_roce_v2_tlv(app_prio_bitmap, id)) {
+		*type = DCBX_PROTOCOL_ROCE_V2;
+	} else {
+		*type = DCBX_MAX_PROTOCOL_TYPE;
+		DP_ERR(p_hwfn,
+		       "No action required, App TLV id = 0x%x app_prio_bitmap = 0x%x\n",
+		       id, app_prio_bitmap);
+		return false;
+	}
+
+	return true;
+}
+
+/* Parse app TLV's to update TC information in hw_info structure for
+ * reconfiguring QM. Get protocol specific data for PF update ramrod command.
+ */
+static int
+qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
+		     struct qed_dcbx_results *p_data,
+		     struct dcbx_app_priority_entry *p_tbl,
+		     u32 pri_tc_tbl, int count, bool dcbx_enabled)
+{
+	u8 tc, priority, priority_map;
+	enum dcbx_protocol_type type;
+	u16 protocol_id;
+	bool enable;
+	int i;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Num APP entries = %d\n", count);
+
+	/* Parse APP TLV */
+	for (i = 0; i < count; i++) {
+		protocol_id = QED_MFW_GET_FIELD(p_tbl[i].entry,
+						DCBX_APP_PROTOCOL_ID);
+		priority_map = QED_MFW_GET_FIELD(p_tbl[i].entry,
+						 DCBX_APP_PRI_MAP);
+		priority = ffs(priority_map) - 1;
+		if (priority < 0) {
+			DP_ERR(p_hwfn, "Invalid priority\n");
+			return -EINVAL;
+		}
+
+		tc = QED_DCBX_PRIO2TC(pri_tc_tbl, priority);
+		if (qed_dcbx_get_app_protocol_type(p_hwfn, p_tbl[i].entry,
+						   protocol_id, &type)) {
+			/* ETH always have the enable bit reset, as it gets
+			 * vlan information per packet. For other protocols,
+			 * should be set according to the dcbx_enabled
+			 * indication, but we only got here if there was an
+			 * app tlv for the protocol, so dcbx must be enabled.
+			 */
+			enable = !!(type == DCBX_PROTOCOL_ETH);
+
+			qed_dcbx_update_app_info(p_data, p_hwfn, enable, true,
+						 priority, tc, type);
+		}
+	}
+
+	/* If RoCE-V2 TLV is not detected, driver need to use RoCE app
+	 * data for RoCE-v2 not the default app data.
+	 */
+	if (!p_data->arr[DCBX_PROTOCOL_ROCE_V2].update &&
+	    p_data->arr[DCBX_PROTOCOL_ROCE].update) {
+		tc = p_data->arr[DCBX_PROTOCOL_ROCE].tc;
+		priority = p_data->arr[DCBX_PROTOCOL_ROCE].priority;
+		qed_dcbx_update_app_info(p_data, p_hwfn, true, true,
+					 priority, tc, DCBX_PROTOCOL_ROCE_V2);
+	}
+
+	/* Update ramrod protocol data and hw_info fields
+	 * with default info when corresponding APP TLV's are not detected.
+	 * The enabled field has a different logic for ethernet as only for
+	 * ethernet dcb should disabled by default, as the information arrives
+	 * from the OS (unless an explicit app tlv was present).
+	 */
+	tc = p_data->arr[DCBX_PROTOCOL_ETH].tc;
+	priority = p_data->arr[DCBX_PROTOCOL_ETH].priority;
+	for (type = 0; type < DCBX_MAX_PROTOCOL_TYPE; type++) {
+		if (p_data->arr[type].update)
+			continue;
+
+		enable = (type == DCBX_PROTOCOL_ETH) ? false : dcbx_enabled;
+		qed_dcbx_update_app_info(p_data, p_hwfn, enable, true,
+					 priority, tc, type);
+	}
+
+	return 0;
+}
+
+/* Parse app TLV's to update TC information in hw_info structure for
+ * reconfiguring QM. Get protocol specific data for PF update ramrod command.
+ */
+static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn)
+{
+	struct dcbx_app_priority_feature *p_app;
+	struct dcbx_app_priority_entry *p_tbl;
+	struct qed_dcbx_results data = { 0 };
+	struct dcbx_ets_feature *p_ets;
+	struct qed_hw_info *p_info;
+	u32 pri_tc_tbl, flags;
+	bool dcbx_enabled;
+	int num_entries;
+	int rc = 0;
+
+	/* If DCBx version is non zero, then negotiation was
+	 * successfuly performed
+	 */
+	flags = p_hwfn->p_dcbx_info->operational.flags;
+	dcbx_enabled = !!QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION);
+
+	p_app = &p_hwfn->p_dcbx_info->operational.features.app;
+	p_tbl = p_app->app_pri_tbl;
+
+	p_ets = &p_hwfn->p_dcbx_info->operational.features.ets;
+	pri_tc_tbl = p_ets->pri_tc_tbl[0];
+
+	p_info = &p_hwfn->hw_info;
+	num_entries = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_NUM_ENTRIES);
+
+	rc = qed_dcbx_process_tlv(p_hwfn, &data, p_tbl, pri_tc_tbl,
+				  num_entries, dcbx_enabled);
+	if (rc)
+		return rc;
+
+	p_info->num_tc = QED_MFW_GET_FIELD(p_ets->flags, DCBX_ETS_MAX_TCS);
+	data.pf_id = p_hwfn->rel_pf_id;
+	data.dcbx_enabled = dcbx_enabled;
+
+	qed_dcbx_dp_protocol(p_hwfn, &data);
+
+	memcpy(&p_hwfn->p_dcbx_info->results, &data,
+	       sizeof(struct qed_dcbx_results));
+
+	return 0;
+}
+
+static int
+qed_dcbx_copy_mib(struct qed_hwfn *p_hwfn,
+		  struct qed_ptt *p_ptt,
+		  struct qed_dcbx_mib_meta_data *p_data,
+		  enum qed_mib_read_type type)
+{
+	u32 prefix_seq_num, suffix_seq_num;
+	int read_count = 0;
+	int rc = 0;
+
+	/* The data is considered to be valid only if both sequence numbers are
+	 * the same.
+	 */
+	do {
+		if (type == QED_DCBX_REMOTE_LLDP_MIB) {
+			qed_memcpy_from(p_hwfn, p_ptt, p_data->lldp_remote,
+					p_data->addr, p_data->size);
+			prefix_seq_num = p_data->lldp_remote->prefix_seq_num;
+			suffix_seq_num = p_data->lldp_remote->suffix_seq_num;
+		} else {
+			qed_memcpy_from(p_hwfn, p_ptt, p_data->mib,
+					p_data->addr, p_data->size);
+			prefix_seq_num = p_data->mib->prefix_seq_num;
+			suffix_seq_num = p_data->mib->suffix_seq_num;
+		}
+		read_count++;
+
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_DCB,
+			   "mib type = %d, try count = %d prefix seq num  = %d suffix seq num = %d\n",
+			   type, read_count, prefix_seq_num, suffix_seq_num);
+	} while ((prefix_seq_num != suffix_seq_num) &&
+		 (read_count < QED_DCBX_MAX_MIB_READ_TRY));
+
+	if (read_count >= QED_DCBX_MAX_MIB_READ_TRY) {
+		DP_ERR(p_hwfn,
+		       "MIB read err, mib type = %d, try count = %d prefix seq num = %d suffix seq num = %d\n",
+		       type, read_count, prefix_seq_num, suffix_seq_num);
+		rc = -EIO;
+	}
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_local_lldp_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port,
+							   lldp_config_params);
+	data.lldp_local = p_hwfn->p_dcbx_info->lldp_local;
+	data.size = sizeof(struct lldp_config_params_s);
+	qed_memcpy_from(p_hwfn, p_ptt, data.lldp_local, data.addr, data.size);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_remote_lldp_mib(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port,
+							   lldp_status_params);
+	data.lldp_remote = p_hwfn->p_dcbx_info->lldp_remote;
+	data.size = sizeof(struct lldp_status_params_s);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_operational_mib(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, operational_dcbx_mib);
+	data.mib = &p_hwfn->p_dcbx_info->operational;
+	data.size = sizeof(struct dcbx_mib);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_remote_mib(struct qed_hwfn *p_hwfn,
+			 struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, remote_dcbx_mib);
+	data.mib = &p_hwfn->p_dcbx_info->remote;
+	data.size = sizeof(struct dcbx_mib);
+	rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type);
+
+	return rc;
+}
+
+static int
+qed_dcbx_read_local_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_dcbx_mib_meta_data data;
+	int rc = 0;
+
+	memset(&data, 0, sizeof(data));
+	data.addr = p_hwfn->mcp_info->port_addr +
+		    offsetof(struct public_port, local_admin_dcbx_mib);
+	data.local_admin = &p_hwfn->p_dcbx_info->local_admin;
+	data.size = sizeof(struct dcbx_local_params);
+	qed_memcpy_from(p_hwfn, p_ptt, data.local_admin, data.addr, data.size);
+
+	return rc;
+}
+
+static int qed_dcbx_read_mib(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	int rc = -EINVAL;
+
+	switch (type) {
+	case QED_DCBX_OPERATIONAL_MIB:
+		rc = qed_dcbx_read_operational_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_REMOTE_MIB:
+		rc = qed_dcbx_read_remote_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_LOCAL_MIB:
+		rc = qed_dcbx_read_local_mib(p_hwfn, p_ptt);
+		break;
+	case QED_DCBX_REMOTE_LLDP_MIB:
+		rc = qed_dcbx_read_remote_lldp_mib(p_hwfn, p_ptt, type);
+		break;
+	case QED_DCBX_LOCAL_LLDP_MIB:
+		rc = qed_dcbx_read_local_lldp_mib(p_hwfn, p_ptt);
+		break;
+	default:
+		DP_ERR(p_hwfn, "MIB read err, unknown mib type %d\n", type);
+	}
+
+	return rc;
+}
+
+/* Read updated MIB.
+ * Reconfigure QM and invoke PF update ramrod command if operational MIB
+ * change is detected.
+ */
+int
+qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, enum qed_mib_read_type type)
+{
+	int rc = 0;
+
+	rc = qed_dcbx_read_mib(p_hwfn, p_ptt, type);
+	if (rc)
+		return rc;
+
+	if (type == QED_DCBX_OPERATIONAL_MIB) {
+		rc = qed_dcbx_process_mib_info(p_hwfn);
+		if (!rc) {
+			/* reconfigure tcs of QM queues according
+			 * to negotiation results
+			 */
+			qed_qm_reconf(p_hwfn, p_ptt);
+
+			/* update storm FW with negotiation results */
+			qed_sp_pf_update(p_hwfn);
+		}
+	}
+
+	return rc;
+}
+
+int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn)
+{
+	int rc = 0;
+
+	p_hwfn->p_dcbx_info = kzalloc(sizeof(*p_hwfn->p_dcbx_info), GFP_KERNEL);
+	if (!p_hwfn->p_dcbx_info) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to allocate 'struct qed_dcbx_info'\n");
+		rc = -ENOMEM;
+	}
+
+	return rc;
+}
+
+void qed_dcbx_info_free(struct qed_hwfn *p_hwfn,
+			struct qed_dcbx_info *p_dcbx_info)
+{
+	kfree(p_hwfn->p_dcbx_info);
+}
+
+static void qed_dcbx_update_protocol_data(struct protocol_dcb_data *p_data,
+					  struct qed_dcbx_results *p_src,
+					  enum dcbx_protocol_type type)
+{
+	p_data->dcb_enable_flag = p_src->arr[type].enable;
+	p_data->dcb_priority = p_src->arr[type].priority;
+	p_data->dcb_tc = p_src->arr[type].tc;
+}
+
+/* Set pf update ramrod command params */
+void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src,
+				   struct pf_update_ramrod_data *p_dest)
+{
+	struct protocol_dcb_data *p_dcb_data;
+	bool update_flag = false;
+
+	p_dest->pf_id = p_src->pf_id;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_FCOE].update;
+	p_dest->update_fcoe_dcb_data_flag = update_flag;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_ROCE].update;
+	p_dest->update_roce_dcb_data_flag = update_flag;
+	update_flag = p_src->arr[DCBX_PROTOCOL_ROCE_V2].update;
+	p_dest->update_roce_dcb_data_flag = update_flag;
+
+	update_flag = p_src->arr[DCBX_PROTOCOL_ISCSI].update;
+	p_dest->update_iscsi_dcb_data_flag = update_flag;
+	update_flag = p_src->arr[DCBX_PROTOCOL_ETH].update;
+	p_dest->update_eth_dcb_data_flag = update_flag;
+
+	p_dcb_data = &p_dest->fcoe_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_FCOE);
+	p_dcb_data = &p_dest->roce_dcb_data;
+
+	if (p_src->arr[DCBX_PROTOCOL_ROCE].update)
+		qed_dcbx_update_protocol_data(p_dcb_data, p_src,
+					      DCBX_PROTOCOL_ROCE);
+	if (p_src->arr[DCBX_PROTOCOL_ROCE_V2].update)
+		qed_dcbx_update_protocol_data(p_dcb_data, p_src,
+					      DCBX_PROTOCOL_ROCE_V2);
+
+	p_dcb_data = &p_dest->iscsi_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ISCSI);
+	p_dcb_data = &p_dest->eth_dcb_data;
+	qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ETH);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
new file mode 100644
index 000000000000..e7f834dbda2d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
@@ -0,0 +1,80 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_DCBX_H
+#define _QED_DCBX_H
+#include <linux/types.h>
+#include <linux/slab.h>
+#include "qed.h"
+#include "qed_hsi.h"
+#include "qed_hw.h"
+#include "qed_mcp.h"
+#include "qed_reg_addr.h"
+
+#define DCBX_CONFIG_MAX_APP_PROTOCOL    4
+
+enum qed_mib_read_type {
+	QED_DCBX_OPERATIONAL_MIB,
+	QED_DCBX_REMOTE_MIB,
+	QED_DCBX_LOCAL_MIB,
+	QED_DCBX_REMOTE_LLDP_MIB,
+	QED_DCBX_LOCAL_LLDP_MIB
+};
+
+struct qed_dcbx_app_data {
+	bool enable;		/* DCB enabled */
+	bool update;		/* Update indication */
+	u8 priority;		/* Priority */
+	u8 tc;			/* Traffic Class */
+};
+
+struct qed_dcbx_results {
+	bool dcbx_enabled;
+	u8 pf_id;
+	struct qed_dcbx_app_data arr[DCBX_MAX_PROTOCOL_TYPE];
+};
+
+struct qed_dcbx_app_metadata {
+	enum dcbx_protocol_type id;
+	char *name;
+	enum qed_pci_personality personality;
+};
+
+#define QED_MFW_GET_FIELD(name, field) \
+	(((name) & (field ## _MASK)) >> (field ## _SHIFT))
+
+struct qed_dcbx_info {
+	struct lldp_status_params_s lldp_remote[LLDP_MAX_LLDP_AGENTS];
+	struct lldp_config_params_s lldp_local[LLDP_MAX_LLDP_AGENTS];
+	struct dcbx_local_params local_admin;
+	struct qed_dcbx_results results;
+	struct dcbx_mib operational;
+	struct dcbx_mib remote;
+	u8 dcbx_cap;
+};
+
+struct qed_dcbx_mib_meta_data {
+	struct lldp_config_params_s *lldp_local;
+	struct lldp_status_params_s *lldp_remote;
+	struct dcbx_local_params *local_admin;
+	struct dcbx_mib *mib;
+	size_t size;
+	u32 addr;
+};
+
+/* QED local interface routines */
+int
+qed_dcbx_mib_update_event(struct qed_hwfn *,
+			  struct qed_ptt *, enum qed_mib_read_type);
+
+int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn);
+void qed_dcbx_info_free(struct qed_hwfn *, struct qed_dcbx_info *);
+void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src,
+				   struct pf_update_ramrod_data *p_dest);
+
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 6fb6016409c6..089016f46f26 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -22,6 +22,7 @@
 #include <linux/qed/qed_if.h>
 #include "qed.h"
 #include "qed_cxt.h"
+#include "qed_dcbx.h"
 #include "qed_dev_api.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -33,6 +34,9 @@
 #include "qed_sriov.h"
 #include "qed_vf.h"
 
+static spinlock_t qm_lock;
+static bool qm_lock_init = false;
+
 /* API common to all protocols */
 enum BAR_ID {
 	BAR_ID_0,       /* used for GRC */
@@ -147,6 +151,7 @@ void qed_resc_free(struct qed_dev *cdev)
 		qed_int_free(p_hwfn);
 		qed_iov_free(p_hwfn);
 		qed_dmae_info_free(p_hwfn);
+		qed_dcbx_info_free(p_hwfn, p_hwfn->p_dcbx_info);
 	}
 }
 
@@ -200,13 +205,19 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 	vport_id = (u8)RESC_START(p_hwfn, QED_VPORT);
 
 	/* First init per-TC PQs */
-	for (i = 0; i < multi_cos_tcs; i++, curr_queue++) {
+	for (i = 0; i < multi_cos_tcs; i++) {
 		struct init_qm_pq_params *params =
-		    &qm_info->qm_pq_params[curr_queue];
+		    &qm_info->qm_pq_params[curr_queue++];
 
-		params->vport_id = vport_id;
-		params->tc_id = p_hwfn->hw_info.non_offload_tc;
-		params->wrr_group = 1;
+		if (p_hwfn->hw_info.personality == QED_PCI_ETH) {
+			params->vport_id = vport_id;
+			params->tc_id = p_hwfn->hw_info.non_offload_tc;
+			params->wrr_group = 1;
+		} else {
+			params->vport_id = vport_id;
+			params->tc_id = p_hwfn->hw_info.offload_tc;
+			params->wrr_group = 1;
+		}
 	}
 
 	/* Then init pure-LB PQ */
@@ -266,6 +277,63 @@ alloc_err:
 	return -ENOMEM;
 }
 
+/* This function reconfigures the QM pf on the fly.
+ * For this purpose we:
+ * 1. reconfigure the QM database
+ * 2. set new values to runtime arrat
+ * 3. send an sdm_qm_cmd through the rbc interface to stop the QM
+ * 4. activate init tool in QM_PF stage
+ * 5. send an sdm_qm_cmd through rbc interface to release the QM
+ */
+int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct qed_qm_info *qm_info = &p_hwfn->qm_info;
+	bool b_rc;
+	int rc;
+
+	/* qm_info is allocated in qed_init_qm_info() which is already called
+	 * from qed_resc_alloc() or previous call of qed_qm_reconf().
+	 * The allocated size may change each init, so we free it before next
+	 * allocation.
+	 */
+	qed_qm_info_free(p_hwfn);
+
+	/* initialize qed's qm data structure */
+	rc = qed_init_qm_info(p_hwfn);
+	if (rc)
+		return rc;
+
+	/* stop PF's qm queues */
+	spin_lock_bh(&qm_lock);
+	b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, false, true,
+				    qm_info->start_pq, qm_info->num_pqs);
+	spin_unlock_bh(&qm_lock);
+	if (!b_rc)
+		return -EINVAL;
+
+	/* clear the QM_PF runtime phase leftovers from previous init */
+	qed_init_clear_rt_data(p_hwfn);
+
+	/* prepare QM portion of runtime array */
+	qed_qm_init_pf(p_hwfn);
+
+	/* activate init tool on runtime array */
+	rc = qed_init_run(p_hwfn, p_ptt, PHASE_QM_PF, p_hwfn->rel_pf_id,
+			  p_hwfn->hw_info.hw_mode);
+	if (rc)
+		return rc;
+
+	/* start PF's qm queues */
+	spin_lock_bh(&qm_lock);
+	b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, true, true,
+				    qm_info->start_pq, qm_info->num_pqs);
+	spin_unlock_bh(&qm_lock);
+	if (!b_rc)
+		return -EINVAL;
+
+	return 0;
+}
+
 int qed_resc_alloc(struct qed_dev *cdev)
 {
 	struct qed_consq *p_consq;
@@ -375,6 +443,14 @@ int qed_resc_alloc(struct qed_dev *cdev)
 				  "Failed to allocate memory for dmae_info structure\n");
 			goto alloc_err;
 		}
+
+		/* DCBX initialization */
+		rc = qed_dcbx_info_alloc(p_hwfn);
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to allocate memory for dcbx structure\n");
+			goto alloc_err;
+		}
 	}
 
 	cdev->reset_stats = kzalloc(sizeof(*cdev->reset_stats), GFP_KERNEL);
@@ -780,6 +856,11 @@ int qed_hw_init(struct qed_dev *cdev,
 		p_hwfn->first_on_engine = (load_code ==
 					   FW_MSG_CODE_DRV_LOAD_ENGINE);
 
+		if (!qm_lock_init) {
+			spin_lock_init(&qm_lock);
+			qm_lock_init = true;
+		}
+
 		switch (load_code) {
 		case FW_MSG_CODE_DRV_LOAD_ENGINE:
 			rc = qed_hw_init_common(p_hwfn, p_hwfn->p_main_ptt,
@@ -821,6 +902,20 @@ int qed_hw_init(struct qed_dev *cdev,
 			return mfw_rc;
 		}
 
+		/* send DCBX attention request command */
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_DCB,
+			   "sending phony dcbx set command to trigger DCBx attention handling\n");
+		mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
+				     DRV_MSG_CODE_SET_DCBX,
+				     1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT,
+				     &load_code, &param);
+		if (mfw_rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to send DCBX attention request\n");
+			return mfw_rc;
+		}
+
 		p_hwfn->hw_init_done = true;
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 82b7727d090b..9afc15fdbb02 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -634,6 +634,14 @@ struct pf_start_ramrod_data {
 	u8				reserved0[4];
 };
 
+/* Data for port update ramrod */
+struct protocol_dcb_data {
+	u8 dcb_enable_flag;
+	u8 dcb_priority;
+	u8 dcb_tc;
+	u8 reserved;
+};
+
 /* tunnel configuration */
 struct pf_update_tunnel_config {
 	u8	update_rx_pf_clss;
@@ -656,8 +664,17 @@ struct pf_update_tunnel_config {
 };
 
 struct pf_update_ramrod_data {
-	u32				reserved[2];
-	u32				reserved_1[6];
+	u8 pf_id;
+	u8 update_eth_dcb_data_flag;
+	u8 update_fcoe_dcb_data_flag;
+	u8 update_iscsi_dcb_data_flag;
+	u8 update_roce_dcb_data_flag;
+	u8 update_mf_vlan_flag;
+	__le16 mf_vlan;
+	struct protocol_dcb_data eth_dcb_data;
+	struct protocol_dcb_data fcoe_dcb_data;
+	struct protocol_dcb_data iscsi_dcb_data;
+	struct protocol_dcb_data roce_dcb_data;
 	struct pf_update_tunnel_config	tunnel_config;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 2be943b91916..1182361798b5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include "qed.h"
+#include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_mcp.h"
@@ -825,6 +826,18 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_VF_DISABLED:
 			qed_mcp_handle_vf_flr(p_hwfn, p_ptt);
 			break;
+		case MFW_DRV_MSG_LLDP_DATA_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_REMOTE_LLDP_MIB);
+			break;
+		case MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_REMOTE_MIB);
+			break;
+		case MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED:
+			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+						  QED_DCBX_OPERATIONAL_MIB);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index ab5549f4e5ea..ea4e9ce53e0a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -353,6 +353,19 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_tunn_start_params *p_tunn,
 		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
 
+/**
+ * @brief qed_sp_pf_update - PF Function Update Ramrod
+ *
+ * This ramrod updates function-related parameters. Every parameter can be
+ * updated independently, according to configuration flags.
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+
+int qed_sp_pf_update(struct qed_hwfn *p_hwfn);
+
 /**
  * @brief qed_sp_pf_stop - PF Function Stop Ramrod
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 8c555ed1f949..67f6ce3c84c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -15,6 +15,7 @@
 #include "qed.h"
 #include <linux/qed/qed_chain.h>
 #include "qed_cxt.h"
+#include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_int.h"
@@ -384,6 +385,30 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+int qed_sp_pf_update(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_CB;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	qed_dcbx_set_pf_update_params(&p_hwfn->p_dcbx_info->results,
+				      &p_ent->ramrod.pf_update);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 /* Set pf update ramrod command params */
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_tunn_update_params *p_tunn,
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 0fd8f247e65f..4c29439f54bf 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -25,6 +25,15 @@
 #include <linux/qed/common_hsi.h>
 #include <linux/qed/qed_chain.h>
 
+enum dcbx_protocol_type {
+	DCBX_PROTOCOL_ISCSI,
+	DCBX_PROTOCOL_FCOE,
+	DCBX_PROTOCOL_ROCE,
+	DCBX_PROTOCOL_ROCE_V2,
+	DCBX_PROTOCOL_ETH,
+	DCBX_MAX_PROTOCOL_TYPE
+};
+
 enum qed_led_mode {
 	QED_LED_MODE_OFF,
 	QED_LED_MODE_ON,
-- 
cgit v1.2.3


From ec4c436652cfdf5dcad60f553020ee99596b937b Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 17 May 2016 18:06:18 +0100
Subject: regulator: Silence build warnings from regulator_can_change_voltage()

Cut down on noise for mainstream users of the API and people doing build
testing by dropping the deprecated flag from regulator_can_change_voltage()
as it triggers even on the EXPORT_SYMBOL_GPL() which affects all builds
rather than just the remaining drivers with calls to it (for which fixes
are currently pending).

The function remains deprecated and is expected to be removed entirely
in v4.8.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 80dc4e51d14a..48603506f8de 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -224,7 +224,7 @@ int regulator_bulk_force_disable(int num_consumers,
 void regulator_bulk_free(int num_consumers,
 			 struct regulator_bulk_data *consumers);
 
-int __deprecated regulator_can_change_voltage(struct regulator *regulator);
+int regulator_can_change_voltage(struct regulator *regulator);
 int regulator_count_voltages(struct regulator *regulator);
 int regulator_list_voltage(struct regulator *regulator, unsigned selector);
 int regulator_is_supported_voltage(struct regulator *regulator,
@@ -436,7 +436,7 @@ static inline void regulator_bulk_free(int num_consumers,
 {
 }
 
-static inline int __deprecated regulator_can_change_voltage(struct regulator *regulator)
+static inline int regulator_can_change_voltage(struct regulator *regulator)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 2e72448b07dc3ff1b7593e9bfff91db182262857 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Tue, 21 May 2013 16:53:03 -0400
Subject: NFS: Add COPY nfs operation

This adds the copy_range file_ops function pointer used by the
sys_copy_range() function call.  This patch only implements sync copies,
so if an async copy happens we decode the stateid and ignore it.

Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
---
 fs/nfs/nfs42.h            |   1 +
 fs/nfs/nfs42proc.c        | 105 +++++++++++++++++++++++++++++++++
 fs/nfs/nfs42xdr.c         | 146 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4file.c         |  23 ++++++++
 fs/nfs/nfs4proc.c         |   1 +
 fs/nfs/nfs4xdr.c          |   1 +
 include/linux/nfs4.h      |   1 +
 include/linux/nfs_fs_sb.h |   1 +
 include/linux/nfs_xdr.h   |  26 +++++++++
 9 files changed, 305 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b587ccd31083..b6cd15314bab 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@
 
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dff83460e5a6..579ee20e4120 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	return err;
 }
 
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+				struct nfs_lock_context *src_lock,
+				struct file *dst, loff_t pos_dst,
+				struct nfs_lock_context *dst_lock,
+				size_t count)
+{
+	struct nfs42_copy_args args = {
+		.src_fh		= NFS_FH(file_inode(src)),
+		.src_pos	= pos_src,
+		.dst_fh		= NFS_FH(file_inode(dst)),
+		.dst_pos	= pos_dst,
+		.count		= count,
+	};
+	struct nfs42_copy_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct inode *dst_inode = file_inode(dst);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
+	int status;
+
+	status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+				     src_lock, FMODE_READ);
+	if (status)
+		return status;
+
+	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+				     dst_lock, FMODE_WRITE);
+	if (status)
+		return status;
+
+	status = nfs4_call_sync(server->client, server, &msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == -ENOTSUPP)
+		server->caps &= ~NFS_CAP_COPY;
+	if (status)
+		return status;
+
+	if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+		status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+		if (status)
+			return status;
+	}
+
+	truncate_pagecache_range(dst_inode, pos_dst,
+				 pos_dst + res.write_res.count);
+
+	return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+			struct file *dst, loff_t pos_dst,
+			size_t count)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(dst));
+	struct nfs_lock_context *src_lock;
+	struct nfs_lock_context *dst_lock;
+	struct nfs4_exception src_exception = { };
+	struct nfs4_exception dst_exception = { };
+	ssize_t err, err2;
+
+	if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+		return -EOPNOTSUPP;
+
+	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+	if (IS_ERR(src_lock))
+		return PTR_ERR(src_lock);
+
+	src_exception.inode = file_inode(src);
+	src_exception.state = src_lock->open_context->state;
+
+	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+	if (IS_ERR(dst_lock)) {
+		err = PTR_ERR(dst_lock);
+		goto out_put_src_lock;
+	}
+
+	dst_exception.inode = file_inode(dst);
+	dst_exception.state = dst_lock->open_context->state;
+
+	do {
+		mutex_lock(&file_inode(dst)->i_mutex);
+		err = _nfs42_proc_copy(src, pos_src, src_lock,
+				       dst, pos_dst, dst_lock, count);
+		mutex_unlock(&file_inode(dst)->i_mutex);
+
+		if (err == -ENOTSUPP) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		err2 = nfs4_handle_exception(server, err, &src_exception);
+		err  = nfs4_handle_exception(server, err, &dst_exception);
+		if (!err)
+			err = err2;
+	} while (src_exception.retry || dst_exception.retry);
+
+	nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+	nfs_put_lock_context(src_lock);
+	return err;
+}
+
 static loff_t _nfs42_proc_llseek(struct file *filep,
 		struct nfs_lock_context *lock, loff_t offset, int whence)
 {
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0ca482a51e53..6dc6f2aea0d6 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -9,9 +9,22 @@
 #define encode_fallocate_maxsz		(encode_stateid_maxsz + \
 					 2 /* offset */ + \
 					 2 /* length */)
+#define NFS42_WRITE_RES_SIZE		(1 /* wr_callback_id size */ +\
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 2 /* wr_count */ + \
+					 1 /* wr_committed */ + \
+					 XDR_QUADLEN(NFS4_VERIFIER_SIZE))
 #define encode_allocate_maxsz		(op_encode_hdr_maxsz + \
 					 encode_fallocate_maxsz)
 #define decode_allocate_maxsz		(op_decode_hdr_maxsz)
+#define encode_copy_maxsz		(op_encode_hdr_maxsz +          \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz		(op_decode_hdr_maxsz + \
+					 NFS42_WRITE_RES_SIZE + \
+					 1 /* cr_consecutive */ + \
+					 1 /* cr_synchronous */)
 #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
 					 encode_fallocate_maxsz)
 #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
@@ -49,6 +62,16 @@
 					 decode_putfh_maxsz + \
 					 decode_allocate_maxsz + \
 					 decode_getattr_maxsz)
+#define NFS4_enc_copy_sz		(compound_encode_hdr_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_savefh_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_copy_maxsz)
+#define NFS4_dec_copy_sz		(compound_decode_hdr_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_savefh_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_copy_maxsz)
 #define NFS4_enc_deallocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
 	encode_fallocate(xdr, args);
 }
 
+static void encode_copy(struct xdr_stream *xdr,
+			struct nfs42_copy_args *args,
+			struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->src_stateid);
+	encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+	encode_uint64(xdr, args->src_pos);
+	encode_uint64(xdr, args->dst_pos);
+	encode_uint64(xdr, args->count);
+
+	encode_uint32(xdr, 1); /* consecutive = true */
+	encode_uint32(xdr, 1); /* synchronous = true */
+	encode_uint32(xdr, 0); /* src server list */
+}
+
 static void encode_deallocate(struct xdr_stream *xdr,
 			      struct nfs42_falloc_args *args,
 			      struct compound_hdr *hdr)
@@ -181,6 +221,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+			      struct xdr_stream *xdr,
+			      struct nfs42_copy_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->src_fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dst_fh, &hdr);
+	encode_copy(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
 /*
  * Encode DEALLOCATE request
  */
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 	return decode_op_hdr(xdr, OP_ALLOCATE);
 }
 
+static int decode_write_response(struct xdr_stream *xdr,
+				 struct nfs42_write_res *res)
+{
+	__be32 *p;
+	int stateids;
+
+	p = xdr_inline_decode(xdr, 4 + 8 + 4);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	stateids = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &res->count);
+	res->verifier.committed = be32_to_cpup(p);
+	return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+				    struct nfs42_copy_res *res) {
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->consecutive = be32_to_cpup(p++);
+	res->synchronous = be32_to_cpup(p++);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+	int status;
+
+	status = decode_op_hdr(xdr, OP_COPY);
+	if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+		status = decode_copy_requirements(xdr, res);
+		if (status)
+			return status;
+		return NFS4ERR_OFFLOAD_NO_REQS;
+	} else if (status)
+		return status;
+
+	status = decode_write_response(xdr, &res->write_res);
+	if (status)
+		return status;
+
+	return decode_copy_requirements(xdr, res);
+}
+
 static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -330,6 +446,36 @@ out:
 	return status;
 }
 
+/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+			     struct xdr_stream *xdr,
+			     struct nfs42_copy_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_savefh(xdr);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_copy(xdr, res);
+out:
+	return status;
+}
+
 /*
  * Decode DEALLOCATE request
  */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d0390516467c..014b0e41ace5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
 }
 
 #ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t count, unsigned int flags)
+{
+	struct inode *in_inode = file_inode(file_in);
+	struct inode *out_inode = file_inode(file_out);
+	int ret;
+
+	if (in_inode == out_inode)
+		return -EINVAL;
+
+	/* flush any pending writes */
+	ret = nfs_sync_inode(in_inode);
+	if (ret)
+		return ret;
+	ret = nfs_sync_inode(out_inode);
+	if (ret)
+		return ret;
+
+	return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
 static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
 {
 	loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
 	.check_flags	= nfs_check_flags,
 	.setlease	= simple_nosetlease,
 #ifdef CONFIG_NFS_V4_2
+	.copy_file_range = nfs4_copy_file_range,
 	.llseek		= nfs4_file_llseek,
 	.fallocate	= nfs42_fallocate,
 	.clone_file_range = nfs42_clone_file_range,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bc2676c95e1b..4e83385ea6a9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8797,6 +8797,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
 		| NFS_CAP_ALLOCATE
+		| NFS_CAP_COPY
 		| NFS_CAP_DEALLOCATE
 		| NFS_CAP_SEEK
 		| NFS_CAP_LAYOUTSTATS
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 88474a4fc669..d1c96fc62c51 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7515,6 +7515,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
 	PROC(LAYOUTSTATS,	enc_layoutstats,	dec_layoutstats),
 	PROC(CLONE,		enc_clone,		dec_clone),
+	PROC(COPY,		enc_copy,		dec_copy),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 011433478a14..722509482e1a 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -504,6 +504,7 @@ enum {
 	NFSPROC4_CLNT_DEALLOCATE,
 	NFSPROC4_CLNT_LAYOUTSTATS,
 	NFSPROC4_CLNT_CLONE,
+	NFSPROC4_CLNT_COPY,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7fcc13c8cf1f..14a762d2734d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -246,5 +246,6 @@ struct nfs_server {
 #define NFS_CAP_DEALLOCATE	(1U << 21)
 #define NFS_CAP_LAYOUTSTATS	(1U << 22)
 #define NFS_CAP_CLONE		(1U << 23)
+#define NFS_CAP_COPY		(1U << 24)
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cb9982d8f38f..e70ed54dad94 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1343,6 +1343,32 @@ struct nfs42_falloc_res {
 	const struct nfs_server		*falloc_server;
 };
 
+struct nfs42_copy_args {
+	struct nfs4_sequence_args	seq_args;
+
+	struct nfs_fh			*src_fh;
+	nfs4_stateid			src_stateid;
+	u64				src_pos;
+
+	struct nfs_fh			*dst_fh;
+	nfs4_stateid			dst_stateid;
+	u64				dst_pos;
+
+	u64				count;
+};
+
+struct nfs42_write_res {
+	u64			count;
+	struct nfs_writeverf	verifier;
+};
+
+struct nfs42_copy_res {
+	struct nfs4_sequence_res	seq_res;
+	struct nfs42_write_res		write_res;
+	bool				consecutive;
+	bool				synchronous;
+};
+
 struct nfs42_seek_args {
 	struct nfs4_sequence_args	seq_args;
 
-- 
cgit v1.2.3


From 4b9c7f9db9a003f5c342184dc4401c1b7f2efb39 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 2 May 2016 14:40:31 -0400
Subject: sunrpc: Update RPCBIND_MAXNETIDLEN

Commit 176e21ee2ec8 ("SUNRPC: Support for RPC over AF_LOCAL
transports") added a 5-character netid, but did not bump
RPCBIND_MAXNETIDLEN from 4 to 5.

Fixes: 176e21ee2ec8 ("SUNRPC: Support for RPC over AF_LOCAL ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/msg_prot.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 807371357160..59cbf16eaeb5 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -158,9 +158,9 @@ typedef __be32	rpc_fraghdr;
 
 /*
  * Note that RFC 1833 does not put any size restrictions on the
- * netid string, but all currently defined netid's fit in 4 bytes.
+ * netid string, but all currently defined netid's fit in 5 bytes.
  */
-#define RPCBIND_MAXNETIDLEN	(4u)
+#define RPCBIND_MAXNETIDLEN	(5u)
 
 /*
  * Universal addresses are introduced in RFC 1833 and further spelled
-- 
cgit v1.2.3


From 6b26cc8c8ead3636a18bfd9489984983f4ddd6f4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 2 May 2016 14:40:40 -0400
Subject: sunrpc: Advertise maximum backchannel payload size

RPC-over-RDMA transports have a limit on how large a backward
direction (backchannel) RPC message can be. Ensure that the NFSv4.x
CREATE_SESSION operation advertises this limit to servers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4proc.c                 | 10 ++++++----
 include/linux/sunrpc/clnt.h       |  1 +
 include/linux/sunrpc/xprt.h       |  1 +
 net/sunrpc/clnt.c                 | 17 +++++++++++++++++
 net/sunrpc/xprtrdma/backchannel.c | 16 ++++++++++++++++
 net/sunrpc/xprtrdma/transport.c   |  1 +
 net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
 net/sunrpc/xprtsock.c             |  6 ++++++
 8 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9795725d708c..196e41e12621 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7371,9 +7371,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
  * always set csa_cachethis to FALSE because the current implementation
  * of the back channel DRC only supports caching the CB_SEQUENCE operation.
  */
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+				    struct rpc_clnt *clnt)
 {
 	unsigned int max_rqst_sz, max_resp_sz;
+	unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
 
 	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
 	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7391,8 +7393,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 		args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
 
 	/* Back channel attributes */
-	args->bc_attrs.max_rqst_sz = PAGE_SIZE;
-	args->bc_attrs.max_resp_sz = PAGE_SIZE;
+	args->bc_attrs.max_rqst_sz = max_bc_payload;
+	args->bc_attrs.max_resp_sz = max_bc_payload;
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
 	args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7496,7 +7498,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	};
 	int status;
 
-	nfs4_init_channel_attrs(&args);
+	nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
 	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
 	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 9a7ddbaf116e..19c659d1c0f8 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -176,6 +176,7 @@ void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 int		rpc_protocol(struct rpc_clnt *);
 struct net *	rpc_net_ns(struct rpc_clnt *);
 size_t		rpc_max_payload(struct rpc_clnt *);
+size_t		rpc_max_bc_payload(struct rpc_clnt *);
 unsigned long	rpc_get_timeout(struct rpc_clnt *clnt);
 void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fb0d212e0d3a..5aa3834619a8 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -142,6 +142,7 @@ struct rpc_xprt_ops {
 	int		(*bc_setup)(struct rpc_xprt *xprt,
 				    unsigned int min_reqs);
 	int		(*bc_up)(struct svc_serv *serv, struct net *net);
+	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
 	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
 	void		(*bc_destroy)(struct rpc_xprt *xprt,
 				      unsigned int max_reqs);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf22df8..06b4df9faaa1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1413,6 +1413,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_payload);
 
+/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+	size_t ret;
+
+	rcu_read_lock();
+	xprt = rcu_dereference(clnt->cl_xprt);
+	ret = xprt->ops->bc_maxpayload(xprt);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
 /**
  * rpc_get_timeout - Get timeout for transport in units of HZ
  * @clnt: RPC client to query
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640eeb5..87762d976b63 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -191,6 +191,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
 	return 0;
 }
 
+/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	size_t maxmsg;
+
+	maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+	return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
 /**
  * rpcrdma_bc_marshal_reply - Send backwards direction reply
  * @rqst: buffer containing RPC reply data
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10ea3..9954342924df 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -707,6 +707,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	.bc_setup		= xprt_rdma_bc_setup,
 	.bc_up			= xprt_rdma_bc_up,
+	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
 	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
 	.bc_destroy		= xprt_rdma_bc_destroy,
 #endif
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb96f..7723e5faff4d 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -534,6 +534,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 65e759569e48..f1faf6b9aaff 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1365,6 +1365,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
 		return ret;
 	return 0;
 }
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+	return PAGE_SIZE;
+}
 #else
 static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
 					struct xdr_skb_reader *desc)
@@ -2660,6 +2665,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 	.bc_setup		= xprt_setup_bc,
 	.bc_up			= xs_tcp_bc_up,
+	.bc_maxpayload		= xs_tcp_bc_maxpayload,
 	.bc_free_rqst		= xprt_free_bc_rqst,
 	.bc_destroy		= xprt_destroy_bc,
 #endif
-- 
cgit v1.2.3


From 29c554227aeec48cde5c22f911e51763f096e125 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 2 May 2016 14:40:48 -0400
Subject: xprtrdma: Bound the inline threshold values

Currently the sysctls that allow setting the inline threshold allow
any value to be set.

Small values only make the transport run slower. The default 1KB
setting is as low as is reasonable. And the logic that decides how
to divide a Send buffer between RPC-over-RDMA header and RPC message
assumes (but does not check) that the lower bound is not crazy (say,
57 bytes).

Send and receive buffers share a page with some control information.
Values larger than about 3KB can't be supported, currently.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 4 +++-
 net/sunrpc/xprtrdma/transport.c | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 767190b01363..39267dc3486a 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -52,7 +52,9 @@
 #define RPCRDMA_DEF_SLOT_TABLE	(128U)
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 
-#define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */
+#define RPCRDMA_MIN_INLINE  (1024)	/* min inline thresh */
+#define RPCRDMA_DEF_INLINE  (1024)	/* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (3068)	/* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9954342924df..16595ff91994 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
 static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
 	},
 	{
 		.procname	= "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
 	},
 	{
 		.procname	= "rdma_inline_write_padding",
-- 
cgit v1.2.3


From 9a8f6b5ea275ff01fc8ef3b8630a3d4ed6b0a362 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 16 May 2016 17:42:42 -0400
Subject: SUNRPC: Ensure get_rpccred() and put_rpccred() can take NULL
 arguments

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h | 3 ++-
 net/sunrpc/auth.c           | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 6f36b2bf3e05..899791573a40 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -202,7 +202,8 @@ char *			rpcauth_stringify_acceptor(struct rpc_cred *);
 static inline
 struct rpc_cred *	get_rpccred(struct rpc_cred *cred)
 {
-	atomic_inc(&cred->cr_count);
+	if (cred != NULL)
+		atomic_inc(&cred->cr_count);
 	return cred;
 }
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index e0bb30fd2ed3..040ff627c18a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 		new = rpcauth_bind_new_cred(task, lookupflags);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	if (req->rq_cred != NULL)
-		put_rpccred(req->rq_cred);
+	put_rpccred(req->rq_cred);
 	req->rq_cred = new;
 	return 0;
 }
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 void
 put_rpccred(struct rpc_cred *cred)
 {
+	if (cred == NULL)
+		return;
 	/* Fast path for unhashed credentials */
 	if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
 		if (atomic_dec_and_test(&cred->cr_count))
-- 
cgit v1.2.3


From 93b717fd81bf6b9a73c3702e9b079b4de8148b34 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 16 May 2016 17:42:43 -0400
Subject: NFSv4: Label stateids with the type

In order to more easily distinguish what kind of stateid we are dealing
with, introduce a type that can be used to label the stateid structure.

The label will be useful both for debugging, but also when dealing with
operations like SETATTR, READ and WRITE that can take several different
types of stateid as arguments.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/callback_xdr.c                     | 17 ++++++++++---
 fs/nfs/flexfilelayout/flexfilelayout.c    |  7 +++---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c |  3 ++-
 fs/nfs/nfs4_fs.h                          |  7 ++++--
 fs/nfs/nfs4proc.c                         |  3 +++
 fs/nfs/nfs4state.c                        |  5 +++-
 fs/nfs/nfs4xdr.c                          | 42 ++++++++++++++++++++++++-------
 include/linux/nfs4.h                      | 25 ++++++++++++++----
 8 files changed, 85 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 976c90608e56..d81f96aacd51 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 	p = read_buf(xdr, NFS4_STATEID_SIZE);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
-	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
 	return 0;
 }
 
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
 static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
 	__be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	__be32 *p;
 	__be32 status;
 
-	status = decode_stateid(xdr, &args->stateid);
+	status = decode_delegation_stateid(xdr, &args->stateid);
 	if (unlikely(status != 0))
 		goto out;
 	p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
 
 static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
 				       struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
 		}
 		p = xdr_decode_hyper(p, &args->cbl_range.offset);
 		p = xdr_decode_hyper(p, &args->cbl_range.length);
-		status = decode_stateid(xdr, &args->cbl_stateid);
+		status = decode_layout_stateid(xdr, &args->cbl_stateid);
 		if (unlikely(status != 0))
 			goto out;
 	} else if (args->cbl_recall_type == RETURN_FSID) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 60d690dbc947..53b6391e0eba 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -55,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 	kfree(FF_LAYOUT_FROM_HDR(lo));
 }
 
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
 	if (unlikely(p == NULL))
 		return -ENOBUFS;
-	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
 	dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
 		p[0], p[1], p[2], p[3]);
 	return 0;
@@ -465,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		fls->mirror_array[i]->efficiency = be32_to_cpup(p);
 
 		/* stateid */
-		rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+		rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
 		if (rc)
 			goto out_err_free;
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 56296f3df19c..eeef89359ad2 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
 		return e1->opnum < e2->opnum ? -1 : 1;
 	if (e1->status != e2->status)
 		return e1->status < e2->status ? -1 : 1;
-	ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+	ret = memcmp(e1->stateid.data, e2->stateid.data,
+			sizeof(e1->stateid.data));
 	if (ret != 0)
 		return ret;
 	ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4afdee420d25..b5d9f345c9f2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -496,12 +496,15 @@ extern struct svc_version nfs4_callback_version4;
 
 static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
 {
-	memcpy(dst, src, sizeof(*dst));
+	memcpy(dst->data, src->data, sizeof(dst->data));
+	dst->type = src->type;
 }
 
 static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
 {
-	return memcmp(dst, src, sizeof(*dst)) == 0;
+	if (dst->type != src->type)
+		return false;
+	return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
 }
 
 static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 196e41e12621..2516467ff17f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8675,6 +8675,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
 		const nfs4_stateid *s2)
 {
+	if (s1->type != s2->type)
+		return false;
+
 	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
 		return false;
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d854693a15b0..d630f9cca0f1 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -65,7 +65,10 @@
 
 #define OPENOWNER_POOL_SIZE	8
 
-const nfs4_stateid zero_stateid;
+const nfs4_stateid zero_stateid = {
+	.data = { 0 },
+	.type = NFS4_SPECIAL_STATEID_TYPE,
+};
 static DEFINE_MUTEX(nfs_clid_init_mutex);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1c96fc62c51..661e753fe1c9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 	return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
 }
 
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_OPEN_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_LOCK_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
 	int status;
@@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
 	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_open_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 	if (status == -EIO)
 		goto out;
 	if (status == 0) {
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_lock_stateid(xdr, &res->stateid);
 		if (unlikely(status))
 			goto out;
 	} else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 	if (status != -EIO)
 		nfs_increment_lock_seqid(status, res->seqid);
 	if (status == 0)
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_lock_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
 	__be32 *p;
 	int status;
 
-	status = decode_stateid(xdr, &res->delegation);
+	status = decode_delegation_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
 	p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	nfs_increment_open_seqid(status, res->seqid);
 	if (status)
 		return status;
-	status = decode_stateid(xdr, &res->stateid);
+	status = decode_open_stateid(xdr, &res->stateid);
 	if (unlikely(status))
 		return status;
 
@@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
 	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_open_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
 	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_open_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -5838,6 +5856,12 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
 				struct nfs4_getdeviceinfo_res *res)
 {
@@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 	if (unlikely(!p))
 		goto out_overflow;
 	res->return_on_close = be32_to_cpup(p);
-	decode_stateid(xdr, &res->stateid);
+	decode_layout_stateid(xdr, &res->stateid);
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
 		goto out_overflow;
 	res->lrs_present = be32_to_cpup(p);
 	if (res->lrs_present)
-		status = decode_stateid(xdr, &res->stateid);
+		status = decode_layout_stateid(xdr, &res->stateid);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 722509482e1a..e1692c96cbc8 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -50,12 +50,27 @@ struct nfs4_label {
 
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
 
-struct nfs_stateid4 {
-	__be32 seqid;
-	char other[NFS4_STATEID_OTHER_SIZE];
-} __attribute__ ((packed));
+struct nfs4_stateid_struct {
+	union {
+		char data[NFS4_STATEID_SIZE];
+		struct {
+			__be32 seqid;
+			char other[NFS4_STATEID_OTHER_SIZE];
+		} __attribute__ ((packed));
+	};
+
+	enum {
+		NFS4_INVALID_STATEID_TYPE = 0,
+		NFS4_SPECIAL_STATEID_TYPE,
+		NFS4_OPEN_STATEID_TYPE,
+		NFS4_LOCK_STATEID_TYPE,
+		NFS4_DELEGATION_STATEID_TYPE,
+		NFS4_LAYOUT_STATEID_TYPE,
+		NFS4_PNFS_DS_STATEID_TYPE,
+	} type;
+};
 
-typedef struct nfs_stateid4 nfs4_stateid;
+typedef struct nfs4_stateid_struct nfs4_stateid;
 
 enum nfs_opnum4 {
 	OP_ACCESS = 3,
-- 
cgit v1.2.3


From 183d9e7b112aaed0d19c16ffcf0f8c3a86dc71e0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 17 May 2016 12:28:47 -0400
Subject: pnfs: rework LAYOUTGET retry handling

There are several problems in the way a stateid is selected for a
LAYOUTGET operation:

We pick a stateid to use in the RPC prepare op, but that makes
it difficult to serialize LAYOUTGETs that use the open stateid. That
serialization is done in pnfs_update_layout, which occurs well before
the rpc_prepare operation.

Between those two events, the i_lock is dropped and reacquired.
pnfs_update_layout can find that the list has lsegs in it and not do any
serialization, but then later pnfs_choose_layoutget_stateid ends up
choosing the open stateid.

This patch changes the client to select the stateid to use in the
LAYOUTGET earlier, when we're searching for a usable layout segment.
This way we can do it all while holding the i_lock the first time, and
ensure that we serialize any LAYOUTGET call that uses a non-layout
stateid.

This also means a rework of how LAYOUTGET replies are handled, as we
must now get the latest stateid if we want to retransmit in response
to a retryable error.

Most of those errors boil down to the fact that the layout state has
changed in some fashion. Thus, what we really want to do is to re-search
for a layout when it fails with a retryable error, so that we can avoid
reissuing the RPC at all if possible.

While the LAYOUTGET RPC is async, the initiating thread always waits for
it to complete, so it's effectively synchronous anyway. Currently, when
we need to retry a LAYOUTGET because of an error, we drive that retry
via the rpc state machine.

This means that once the call has been submitted, it runs until it
completes. So, we must move the error handling for this RPC out of the
rpc_call_done operation and into the caller.

In order to handle errors like NFS4ERR_DELAY properly, we must also
pass a pointer to the sliding timeout, which is now moved to the stack
in pnfs_update_layout.

The complicating errors are -NFS4ERR_RECALLCONFLICT and
-NFS4ERR_LAYOUTTRYLATER, as those involve a timeout after which we give
up and return NULL back to the caller. So, there is some special
handling for those errors to ensure that the layers driving the retries
can handle that appropriately.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4proc.c       | 115 ++++++++++++++++----------------------
 fs/nfs/nfs4trace.h      |  10 +++-
 fs/nfs/pnfs.c           | 144 +++++++++++++++++++++++++-----------------------
 fs/nfs/pnfs.h           |   6 +-
 include/linux/errno.h   |   1 +
 include/linux/nfs4.h    |   2 +
 include/linux/nfs_xdr.h |   2 -
 7 files changed, 136 insertions(+), 144 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2a29f5d12aeb..62d67f040906 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -427,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
+		case -NFS4ERR_RECALLCONFLICT:
 			exception->delay = 1;
 			return 0;
 
@@ -7847,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_layoutget *lgp = calldata;
 	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
 	struct nfs4_session *session = nfs4_get_session(server);
-	int ret;
 
 	dprintk("--> %s\n", __func__);
-	/* Note the is a race here, where a CB_LAYOUTRECALL can come in
-	 * right now covering the LAYOUTGET we are about to send.
-	 * However, that is not so catastrophic, and there seems
-	 * to be no way to prevent it completely.
-	 */
-	if (nfs41_setup_sequence(session, &lgp->args.seq_args,
-				&lgp->res.seq_res, task))
-		return;
-	ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
-					  NFS_I(lgp->args.inode)->layout,
-					  &lgp->args.range,
-					  lgp->args.ctx->state);
-	if (ret < 0)
-		rpc_exit(task, ret);
+	nfs41_setup_sequence(session, &lgp->args.seq_args,
+				&lgp->res.seq_res, task);
+	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	nfs41_sequence_done(task, &lgp->res.seq_res);
+	dprintk("<-- %s\n", __func__);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+		struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
 	struct inode *inode = lgp->args.inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layout_hdr *lo;
-	struct nfs4_state *state = NULL;
-	unsigned long timeo, now, giveup;
+	int status = task->tk_status;
 
 	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
 
-	if (!nfs41_sequence_done(task, &lgp->res.seq_res))
-		goto out;
-
-	switch (task->tk_status) {
+	switch (status) {
 	case 0:
 		goto out;
 
@@ -7890,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	 * retry go inband.
 	 */
 	case -NFS4ERR_LAYOUTUNAVAILABLE:
-		task->tk_status = -ENODATA;
+		status = -ENODATA;
 		goto out;
 	/*
 	 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
 	 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
 	 */
 	case -NFS4ERR_BADLAYOUT:
-		goto out_overflow;
+		status = -EOVERFLOW;
+		goto out;
 	/*
 	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
 	 * (or clients) writing to the same RAID stripe except when
 	 * the minlength argument is 0 (see RFC5661 section 18.43.3).
+	 *
+	 * Treat it like we would RECALLCONFLICT -- we retry for a little
+	 * while, and then eventually give up.
 	 */
 	case -NFS4ERR_LAYOUTTRYLATER:
-		if (lgp->args.minlength == 0)
-			goto out_overflow;
-	/*
-	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
-	 * existing layout before getting a new one).
-	 */
-	case -NFS4ERR_RECALLCONFLICT:
-		timeo = rpc_get_timeout(task->tk_client);
-		giveup = lgp->args.timestamp + timeo;
-		now = jiffies;
-		if (time_after(giveup, now)) {
-			unsigned long delay;
-
-			/* Delay for:
-			 * - Not less then NFS4_POLL_RETRY_MIN.
-			 * - One last time a jiffie before we give up
-			 * - exponential backoff (time_now minus start_attempt)
-			 */
-			delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
-				    min((giveup - now - 1),
-					now - lgp->args.timestamp));
-
-			dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
-				__func__, delay);
-			rpc_delay(task, delay);
-			/* Do not call nfs4_async_handle_error() */
-			goto out_restart;
+		if (lgp->args.minlength == 0) {
+			status = -EOVERFLOW;
+			goto out;
 		}
-		break;
+		/* Fallthrough */
+	case -NFS4ERR_RECALLCONFLICT:
+		nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
+					exception);
+		status = -ERECALLCONFLICT;
+		goto out;
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
+		exception->timeout = 0;
 		spin_lock(&inode->i_lock);
 		if (nfs4_stateid_match(&lgp->args.stateid,
 					&lgp->args.ctx->state->stateid)) {
 			spin_unlock(&inode->i_lock);
 			/* If the open stateid was bad, then recover it. */
-			state = lgp->args.ctx->state;
+			exception->state = lgp->args.ctx->state;
 			break;
 		}
 		lo = NFS_I(inode)->layout;
@@ -7958,20 +7939,16 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 			pnfs_free_lseg_list(&head);
 		} else
 			spin_unlock(&inode->i_lock);
-		goto out_restart;
+		status = -EAGAIN;
+		goto out;
 	}
-	if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
-		goto out_restart;
+
+	status = nfs4_handle_exception(server, status, exception);
+	if (exception->retry)
+		status = -EAGAIN;
 out:
 	dprintk("<-- %s\n", __func__);
-	return;
-out_restart:
-	task->tk_status = 0;
-	rpc_restart_call_prepare(task);
-	return;
-out_overflow:
-	task->tk_status = -EOVERFLOW;
-	goto out;
+	return status;
 }
 
 static size_t max_response_pages(struct nfs_server *server)
@@ -8040,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 };
 
 struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
 {
 	struct inode *inode = lgp->args.inode;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -8060,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 		.flags = RPC_TASK_ASYNC,
 	};
 	struct pnfs_layout_segment *lseg = NULL;
+	struct nfs4_exception exception = { .timeout = *timeout };
 	int status = 0;
 
 	dprintk("--> %s\n", __func__);
@@ -8073,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 		return ERR_PTR(-ENOMEM);
 	}
 	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
-	lgp->args.timestamp = jiffies;
 
 	lgp->res.layoutp = &lgp->args.layout;
 	lgp->res.seq_res.sr_slot = NULL;
@@ -8083,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	if (IS_ERR(task))
 		return ERR_CAST(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status == 0)
-		status = task->tk_status;
+	if (status == 0) {
+		status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+		*timeout = exception.timeout;
+	}
+
 	trace_nfs4_layoutget(lgp->args.ctx,
 			&lgp->args.range,
 			&lgp->res.range,
 			&lgp->res.stateid,
 			status);
+
 	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
 	if (status == 0 && lgp->res.layoutp->len)
 		lseg = pnfs_layout_process(lgp);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2c8d05dae5b1..9c150b153782 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 		{ PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" },	\
 		{ PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" },		\
 		{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },	\
+		{ PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },	\
+		{ PNFS_UPDATE_LAYOUT_RETRY, "retrying" },	\
 		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
 
 TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
 			u64 count,
 			enum pnfs_iomode iomode,
 			struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_segment *lseg,
 			enum pnfs_update_layout_reason reason
 		),
-		TP_ARGS(inode, pos, count, iomode, lo, reason),
+		TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
 			__field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
 			__field(enum pnfs_iomode, iomode)
 			__field(int, layoutstateid_seq)
 			__field(u32, layoutstateid_hash)
+			__field(long, lseg)
 			__field(enum pnfs_update_layout_reason, reason)
 		),
 		TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
 				__entry->layoutstateid_seq = 0;
 				__entry->layoutstateid_hash = 0;
 			}
+			__entry->lseg = (long)lseg;
 		),
 		TP_printk(
 			"fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"iomode=%s pos=%llu count=%llu "
-			"layoutstateid=%d:0x%08x (%s)",
+			"layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
 			__entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
 			(unsigned long long)__entry->pos,
 			(unsigned long long)__entry->count,
 			__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+			__entry->lseg,
 			show_pnfs_update_layout_reason(__entry->reason)
 		)
 );
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2a609af845fe..46339a7fb191 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -796,45 +796,18 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 
-int
-pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-			      const struct pnfs_layout_range *range,
-			      struct nfs4_state *open_state)
-{
-	int status = 0;
-
-	dprintk("--> %s\n", __func__);
-	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo)) {
-		status = -EAGAIN;
-	} else if (!nfs4_valid_open_stateid(open_state)) {
-		status = -EBADF;
-	} else if (list_empty(&lo->plh_segs) ||
-		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
-		int seq;
-
-		do {
-			seq = read_seqbegin(&open_state->seqlock);
-			nfs4_stateid_copy(dst, &open_state->stateid);
-		} while (read_seqretry(&open_state->seqlock, seq));
-	} else
-		nfs4_stateid_copy(dst, &lo->plh_stateid);
-	spin_unlock(&lo->plh_inode->i_lock);
-	dprintk("<-- %s\n", __func__);
-	return status;
-}
-
 /*
-* Get layout from server.
-*    for now, assume that whole file layouts are requested.
-*    arg->offset: 0
-*    arg->length: all ones
-*/
+ * Get layout from server.
+ *    for now, assume that whole file layouts are requested.
+ *    arg->offset: 0
+ *    arg->length: all ones
+ */
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
+	   nfs4_stateid *stateid,
 	   const struct pnfs_layout_range *range,
-	   gfp_t gfp_flags)
+	   long *timeout, gfp_t gfp_flags)
 {
 	struct inode *ino = lo->plh_inode;
 	struct nfs_server *server = NFS_SERVER(ino);
@@ -868,10 +841,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
+	nfs4_stateid_copy(&lgp->args.stateid, stateid);
 	lgp->gfp_flags = gfp_flags;
 	lgp->cred = lo->plh_lc_cred;
 
-	return nfs4_proc_layoutget(lgp, gfp_flags);
+	return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
 }
 
 static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -1511,27 +1485,30 @@ pnfs_update_layout(struct inode *ino,
 		.offset = pos,
 		.length = count,
 	};
-	unsigned pg_offset;
+	unsigned pg_offset, seq;
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs_client *clp = server->nfs_client;
-	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_hdr *lo = NULL;
 	struct pnfs_layout_segment *lseg = NULL;
+	nfs4_stateid stateid;
+	long timeout = 0;
+	unsigned long giveup = jiffies + rpc_get_timeout(server->client);
 	bool first;
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_NO_PNFS);
 		goto out;
 	}
 
 	if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
 		goto out;
 	}
 
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
 		goto out;
 	}
@@ -1542,14 +1519,14 @@ lookup_again:
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
 		spin_unlock(&ino->i_lock);
-		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_NOMEM);
 		goto out;
 	}
 
 	/* Do we even need to bother with this? */
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
@@ -1557,14 +1534,34 @@ lookup_again:
 
 	/* if LAYOUTGET already failed once we don't try again */
 	if (pnfs_layout_io_test_failed(lo, iomode)) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
 		goto out_unlock;
 	}
 
-	first = list_empty(&lo->plh_segs);
-	if (first) {
-		/* The first layoutget for the file. Need to serialize per
+	lseg = pnfs_find_lseg(lo, &arg);
+	if (lseg) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+		goto out_unlock;
+	}
+
+	if (!nfs4_valid_open_stateid(ctx->state)) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+		goto out_unlock;
+	}
+
+	/*
+	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
+	 * stateid, or it has been invalidated, then we must use the open
+	 * stateid.
+	 */
+	if (lo->plh_stateid.seqid == 0 ||
+	    test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+
+		/*
+		 * The first layoutget for the file. Need to serialize per
 		 * RFC 5661 Errata 3208.
 		 */
 		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1573,18 +1570,17 @@ lookup_again:
 			wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
 				    TASK_UNINTERRUPTIBLE);
 			pnfs_put_layout_hdr(lo);
+			dprintk("%s retrying\n", __func__);
 			goto lookup_again;
 		}
+
+		first = true;
+		do {
+			seq = read_seqbegin(&ctx->state->seqlock);
+			nfs4_stateid_copy(&stateid, &ctx->state->stateid);
+		} while (read_seqretry(&ctx->state->seqlock, seq));
 	} else {
-		/* Check to see if the layout for the given range
-		 * already exists
-		 */
-		lseg = pnfs_find_lseg(lo, &arg);
-		if (lseg) {
-			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
-					PNFS_UPDATE_LAYOUT_FOUND_CACHED);
-			goto out_unlock;
-		}
+		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
 	}
 
 	/*
@@ -1599,15 +1595,17 @@ lookup_again:
 				pnfs_clear_first_layoutget(lo);
 			pnfs_put_layout_hdr(lo);
 			dprintk("%s retrying\n", __func__);
+			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+					lseg, PNFS_UPDATE_LAYOUT_RETRY);
 			goto lookup_again;
 		}
-		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				PNFS_UPDATE_LAYOUT_RETURN);
 		goto out_put_layout_hdr;
 	}
 
 	if (pnfs_layoutgets_blocked(lo)) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				PNFS_UPDATE_LAYOUT_BLOCKED);
 		goto out_unlock;
 	}
@@ -1632,26 +1630,36 @@ lookup_again:
 	if (arg.length != NFS4_MAX_UINT64)
 		arg.length = PAGE_ALIGN(arg.length);
 
-	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
+	lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
+	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 	if (IS_ERR(lseg)) {
-		if (lseg == ERR_PTR(-EAGAIN)) {
+		switch(PTR_ERR(lseg)) {
+		case -ERECALLCONFLICT:
+			if (time_after(jiffies, giveup))
+				lseg = NULL;
+			/* Fallthrough */
+		case -EAGAIN:
+			pnfs_put_layout_hdr(lo);
 			if (first)
 				pnfs_clear_first_layoutget(lo);
-			pnfs_put_layout_hdr(lo);
-			goto lookup_again;
-		}
-
-		if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
-			pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
-			lseg = NULL;
+			if (lseg) {
+				trace_pnfs_update_layout(ino, pos, count,
+					iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+				goto lookup_again;
+			}
+			/* Fallthrough */
+		default:
+			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+				lseg = NULL;
+			}
 		}
 	} else {
 		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
 	}
 
 	atomic_dec(&lo->plh_outstanding);
-	trace_pnfs_update_layout(ino, pos, count, iomode, lo,
-				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 971068b58647..f9f3331bef49 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -228,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev,
 				   struct rpc_cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
 
 /* pnfs.c */
@@ -260,10 +260,6 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 			     const nfs4_stateid *new,
 			     bool update_barrier);
-int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
-				  struct pnfs_layout_hdr *lo,
-				  const struct pnfs_layout_range *range,
-				  struct nfs4_state *open_state);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
 				const struct pnfs_layout_range *recall_range,
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 89627b9187f9..7ce9fb1b7d28 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,5 +28,6 @@
 #define EBADTYPE	527	/* Type not supported by server */
 #define EJUKEBOX	528	/* Request initiated, but will not complete before timeout */
 #define EIOCBQUEUED	529	/* iocb queued, will get completion event */
+#define ERECALLCONFLICT	530	/* conflict with recalled state */
 
 #endif
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e1692c96cbc8..bfed6b367350 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -637,7 +637,9 @@ enum pnfs_update_layout_reason {
 	PNFS_UPDATE_LAYOUT_IO_TEST_FAIL,
 	PNFS_UPDATE_LAYOUT_FOUND_CACHED,
 	PNFS_UPDATE_LAYOUT_RETURN,
+	PNFS_UPDATE_LAYOUT_RETRY,
 	PNFS_UPDATE_LAYOUT_BLOCKED,
+	PNFS_UPDATE_LAYOUT_INVALID_OPEN,
 	PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
 };
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e70ed54dad94..ccb2928a0e64 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -233,7 +233,6 @@ struct nfs4_layoutget_args {
 	struct inode *inode;
 	struct nfs_open_context *ctx;
 	nfs4_stateid stateid;
-	unsigned long timestamp;
 	struct nfs4_layoutdriver_data layout;
 };
 
@@ -251,7 +250,6 @@ struct nfs4_layoutget {
 	struct nfs4_layoutget_res res;
 	struct rpc_cred *cred;
 	gfp_t gfp_flags;
-	long timeout;
 };
 
 struct nfs4_getdeviceinfo_args {
-- 
cgit v1.2.3


From b52207ef4ea56f8c22288ec3387399aac72c26cf Mon Sep 17 00:00:00 2001
From: Chen Feng <puck.chen@hisilicon.com>
Date: Sun, 14 Feb 2016 14:29:21 +0800
Subject: mfd: hi655x: Add MFD driver for hi655x

Add PMIC MFD driver to support hisilicon hi665x.

Signed-off-by: Chen Feng <puck.chen@hisilicon.com>
Signed-off-by: Fei Wang <w.f@huawei.com>
Signed-off-by: Xinwei Kong <kong.kongxinwei@hisilicon.com>
Reviewed-by: Haojian Zhuang <haojian.zhuang@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig             |  10 +++
 drivers/mfd/Makefile            |   1 +
 drivers/mfd/hi655x-pmic.c       | 162 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/hi655x-pmic.h |  55 ++++++++++++++
 4 files changed, 228 insertions(+)
 create mode 100644 drivers/mfd/hi655x-pmic.c
 create mode 100644 include/linux/mfd/hi655x-pmic.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index a49565cc1a6b..1bcf601de5bc 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -319,6 +319,16 @@ config MFD_HI6421_PMIC
 	  menus in order to enable them.
 	  We communicate with the Hi6421 via memory-mapped I/O.
 
+config MFD_HI655X_PMIC
+	tristate "HiSilicon Hi655X series PMU/Codec IC"
+	depends on ARCH_HISI || COMPILE_TEST
+	depends on OF
+	select MFD_CORE
+	select REGMAP_MMIO
+	select REGMAP_IRQ
+	help
+	  Select this option to enable Hisilicon hi655x series pmic driver.
+
 config HTC_EGPIO
 	bool "HTC EGPIO support"
 	depends on GPIOLIB && ARM
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 921a08dad9b3..42a66e19e191 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -196,6 +196,7 @@ obj-$(CONFIG_MFD_STW481X)	+= stw481x.o
 obj-$(CONFIG_MFD_IPAQ_MICRO)	+= ipaq-micro.o
 obj-$(CONFIG_MFD_MENF21BMC)	+= menf21bmc.o
 obj-$(CONFIG_MFD_HI6421_PMIC)	+= hi6421-pmic-core.o
+obj-$(CONFIG_MFD_HI655X_PMIC)   += hi655x-pmic.o
 obj-$(CONFIG_MFD_DLN2)		+= dln2.o
 obj-$(CONFIG_MFD_RT5033)	+= rt5033.o
 obj-$(CONFIG_MFD_SKY81452)	+= sky81452.o
diff --git a/drivers/mfd/hi655x-pmic.c b/drivers/mfd/hi655x-pmic.c
new file mode 100644
index 000000000000..05ddc7882362
--- /dev/null
+++ b/drivers/mfd/hi655x-pmic.c
@@ -0,0 +1,162 @@
+/*
+ * Device driver for MFD hi655x PMIC
+ *
+ * Copyright (c) 2016 Hisilicon.
+ *
+ * Authors:
+ * Chen Feng <puck.chen@hisilicon.com>
+ * Fei  Wang <w.f@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/hi655x-pmic.h>
+#include <linux/module.h>
+#include <linux/of_gpio.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+static const struct mfd_cell hi655x_pmic_devs[] = {
+	{ .name = "hi655x-regulator", },
+};
+
+static const struct regmap_irq hi655x_irqs[] = {
+	{ .reg_offset = 0, .mask = OTMP_D1R_INT },
+	{ .reg_offset = 0, .mask = VSYS_2P5_R_INT },
+	{ .reg_offset = 0, .mask = VSYS_UV_D3R_INT },
+	{ .reg_offset = 0, .mask = VSYS_6P0_D200UR_INT },
+	{ .reg_offset = 0, .mask = PWRON_D4SR_INT },
+	{ .reg_offset = 0, .mask = PWRON_D20F_INT },
+	{ .reg_offset = 0, .mask = PWRON_D20R_INT },
+	{ .reg_offset = 0, .mask = RESERVE_INT },
+};
+
+static const struct regmap_irq_chip hi655x_irq_chip = {
+	.name = "hi655x-pmic",
+	.irqs = hi655x_irqs,
+	.num_regs = 1,
+	.num_irqs = ARRAY_SIZE(hi655x_irqs),
+	.status_base = HI655X_IRQ_STAT_BASE,
+	.mask_base = HI655X_IRQ_MASK_BASE,
+};
+
+static struct regmap_config hi655x_regmap_config = {
+	.reg_bits = 32,
+	.reg_stride = HI655X_STRIDE,
+	.val_bits = 8,
+	.max_register = HI655X_BUS_ADDR(0xFFF),
+};
+
+static void hi655x_local_irq_clear(struct regmap *map)
+{
+	int i;
+
+	regmap_write(map, HI655X_ANA_IRQM_BASE, HI655X_IRQ_CLR);
+	for (i = 0; i < HI655X_IRQ_ARRAY; i++) {
+		regmap_write(map, HI655X_IRQ_STAT_BASE + i * HI655X_STRIDE,
+			     HI655X_IRQ_CLR);
+	}
+}
+
+static int hi655x_pmic_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct hi655x_pmic *pmic;
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	void __iomem *base;
+
+	pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
+	if (!pmic)
+		return -ENOMEM;
+	pmic->dev = dev;
+
+	pmic->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!pmic->res)
+		return -ENOENT;
+
+	base = devm_ioremap_resource(dev, pmic->res);
+	if (!base)
+		return -ENOMEM;
+
+	pmic->regmap = devm_regmap_init_mmio_clk(dev, NULL, base,
+						 &hi655x_regmap_config);
+
+	regmap_read(pmic->regmap, HI655X_BUS_ADDR(HI655X_VER_REG), &pmic->ver);
+	if ((pmic->ver < PMU_VER_START) || (pmic->ver > PMU_VER_END)) {
+		dev_warn(dev, "PMU version %d unsupported\n", pmic->ver);
+		return -EINVAL;
+	}
+
+	hi655x_local_irq_clear(pmic->regmap);
+
+	pmic->gpio = of_get_named_gpio(np, "pmic-gpios", 0);
+	if (!gpio_is_valid(pmic->gpio)) {
+		dev_err(dev, "Failed to get the pmic-gpios\n");
+		return -ENODEV;
+	}
+
+	ret = devm_gpio_request_one(dev, pmic->gpio, GPIOF_IN,
+				    "hi655x_pmic_irq");
+	if (ret < 0) {
+		dev_err(dev, "Failed to request gpio %d  ret = %d\n",
+			pmic->gpio, ret);
+		return ret;
+	}
+
+	ret = regmap_add_irq_chip(pmic->regmap, gpio_to_irq(pmic->gpio),
+				  IRQF_TRIGGER_LOW | IRQF_NO_SUSPEND, 0,
+				  &hi655x_irq_chip, &pmic->irq_data);
+	if (ret) {
+		dev_err(dev, "Failed to obtain 'hi655x_pmic_irq' %d\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, pmic);
+
+	ret = mfd_add_devices(dev, PLATFORM_DEVID_AUTO, hi655x_pmic_devs,
+			      ARRAY_SIZE(hi655x_pmic_devs), NULL, 0, NULL);
+	if (ret) {
+		dev_err(dev, "Failed to register device %d\n", ret);
+		regmap_del_irq_chip(gpio_to_irq(pmic->gpio), pmic->irq_data);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int hi655x_pmic_remove(struct platform_device *pdev)
+{
+	struct hi655x_pmic *pmic = platform_get_drvdata(pdev);
+
+	regmap_del_irq_chip(gpio_to_irq(pmic->gpio), pmic->irq_data);
+	mfd_remove_devices(&pdev->dev);
+	return 0;
+}
+
+static const struct of_device_id hi655x_pmic_match[] = {
+	{ .compatible = "hisilicon,hi655x-pmic", },
+	{},
+};
+
+static struct platform_driver hi655x_pmic_driver = {
+	.driver	= {
+		.name =	"hi655x-pmic",
+		.of_match_table = of_match_ptr(hi655x_pmic_match),
+	},
+	.probe  = hi655x_pmic_probe,
+	.remove = hi655x_pmic_remove,
+};
+module_platform_driver(hi655x_pmic_driver);
+
+MODULE_AUTHOR("Chen Feng <puck.chen@hisilicon.com>");
+MODULE_DESCRIPTION("Hisilicon hi655x PMIC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h
new file mode 100644
index 000000000000..dbbe9a644622
--- /dev/null
+++ b/include/linux/mfd/hi655x-pmic.h
@@ -0,0 +1,55 @@
+/*
+ * Device driver for regulators in hi655x IC
+ *
+ * Copyright (c) 2016 Hisilicon.
+ *
+ * Authors:
+ * Chen Feng <puck.chen@hisilicon.com>
+ * Fei  Wang <w.f@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __HI655X_PMIC_H
+#define __HI655X_PMIC_H
+
+/* Hi655x registers are mapped to memory bus in 4 bytes stride */
+#define HI655X_STRIDE                   4
+#define HI655X_BUS_ADDR(x)              ((x) << 2)
+
+#define HI655X_BITS                     8
+
+#define HI655X_NR_IRQ                   32
+
+#define HI655X_IRQ_STAT_BASE            (0x003 << 2)
+#define HI655X_IRQ_MASK_BASE            (0x007 << 2)
+#define HI655X_ANA_IRQM_BASE            (0x1b5 << 2)
+#define HI655X_IRQ_ARRAY                4
+#define HI655X_IRQ_MASK                 0xFF
+#define HI655X_IRQ_CLR                  0xFF
+#define HI655X_VER_REG                  0x00
+
+#define PMU_VER_START                   0x10
+#define PMU_VER_END                     0x38
+
+#define RESERVE_INT                     BIT(7)
+#define PWRON_D20R_INT                  BIT(6)
+#define PWRON_D20F_INT                  BIT(5)
+#define PWRON_D4SR_INT                  BIT(4)
+#define VSYS_6P0_D200UR_INT             BIT(3)
+#define VSYS_UV_D3R_INT                 BIT(2)
+#define VSYS_2P5_R_INT                  BIT(1)
+#define OTMP_D1R_INT                    BIT(0)
+
+struct hi655x_pmic {
+	struct resource *res;
+	struct device *dev;
+	struct regmap *regmap;
+	int gpio;
+	unsigned int ver;
+	struct regmap_irq_chip_data *irq_data;
+};
+
+#endif
-- 
cgit v1.2.3


From 94c6825e0ff75829207af6246782811b7c7af2c0 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Sun, 17 Apr 2016 17:08:40 +0300
Subject: net/mlx5_core: Use tasklet for user-space CQ completion events

Previously, we've fired all our completion callbacks straight from
our ISR.

Some of those callbacks were lightweight (for example, mlx5 Ethernet
napi callbacks), but some of them did more work (for example,
the user-space RDMA stack uverbs' completion handler). Besides that,
doing more than the minimal work in ISR is generally considered wrong,
it could even lead to a hard lockup of the system. Since when a lot
of completion events are generated by the hardware, the loop over
those events could be so long, that we'll get into a hard lockup by
the system watchdog.

In order to avoid that, add a new way of invoking completion events
callbacks. In the interrupt itself, we add the CQs which receive
completion event to a per-EQ list and schedule a tasklet. In the
tasklet context we loop over all the CQs in the list and invoke the
user callback.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cq.c       | 59 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 12 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 17 +++++++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  2 +
 include/linux/mlx5/cq.h                            |  5 ++
 include/linux/mlx5/driver.h                        | 10 ++++
 6 files changed, 104 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index b51e42d6fbec..873a631ad155 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -39,6 +39,53 @@
 #include <linux/mlx5/cq.h>
 #include "mlx5_core.h"
 
+#define TASKLET_MAX_TIME 2
+#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME)
+
+void mlx5_cq_tasklet_cb(unsigned long data)
+{
+	unsigned long flags;
+	unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES;
+	struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data;
+	struct mlx5_core_cq *mcq;
+	struct mlx5_core_cq *temp;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	list_splice_tail_init(&ctx->list, &ctx->process_list);
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	list_for_each_entry_safe(mcq, temp, &ctx->process_list,
+				 tasklet_ctx.list) {
+		list_del_init(&mcq->tasklet_ctx.list);
+		mcq->tasklet_ctx.comp(mcq);
+		if (atomic_dec_and_test(&mcq->refcount))
+			complete(&mcq->free);
+		if (time_after(jiffies, end))
+			break;
+	}
+
+	if (!list_empty(&ctx->process_list))
+		tasklet_schedule(&ctx->task);
+}
+
+static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
+{
+	unsigned long flags;
+	struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+
+	spin_lock_irqsave(&tasklet_ctx->lock, flags);
+	/* When migrating CQs between EQs will be implemented, please note
+	 * that you need to sync this point. It is possible that
+	 * while migrating a CQ, completions on the old EQs could
+	 * still arrive.
+	 */
+	if (list_empty_careful(&cq->tasklet_ctx.list)) {
+		atomic_inc(&cq->refcount);
+		list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+	}
+	spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
 {
 	struct mlx5_core_cq *cq;
@@ -96,6 +143,13 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	struct mlx5_create_cq_mbox_out out;
 	struct mlx5_destroy_cq_mbox_in din;
 	struct mlx5_destroy_cq_mbox_out dout;
+	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
+			   c_eqn);
+	struct mlx5_eq *eq;
+
+	eq = mlx5_eqn2eq(dev, eqn);
+	if (IS_ERR(eq))
+		return PTR_ERR(eq);
 
 	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ);
 	memset(&out, 0, sizeof(out));
@@ -111,6 +165,11 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	cq->arm_sn     = 0;
 	atomic_set(&cq->refcount, 1);
 	init_completion(&cq->free);
+	if (!cq->comp)
+		cq->comp = mlx5_add_cq_to_tasklet;
+	/* assuming CQ will be deleted before the EQ */
+	cq->tasklet_ctx.priv = &eq->tasklet_ctx;
+	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
 
 	spin_lock_irq(&table->lock);
 	err = radix_tree_insert(&table->tree, cq->cqn, cq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 18fccec72c5d..0e30602ef76d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -202,7 +202,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 	struct mlx5_eqe *eqe;
 	int eqes_found = 0;
 	int set_ci = 0;
-	u32 cqn;
+	u32 cqn = -1;
 	u32 rsn;
 	u8 port;
 
@@ -320,6 +320,9 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 
 	eq_update_ci(eq, 1);
 
+	if (cqn != -1)
+		tasklet_schedule(&eq->tasklet_ctx.task);
+
 	return eqes_found;
 }
 
@@ -403,6 +406,12 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	if (err)
 		goto err_irq;
 
+	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+	spin_lock_init(&eq->tasklet_ctx.lock);
+	tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
+		     (unsigned long)&eq->tasklet_ctx);
+
 	/* EQs are created in ARMED state
 	 */
 	eq_update_ci(eq, 1);
@@ -436,6 +445,7 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
 	synchronize_irq(eq->irqn);
+	tasklet_disable(&eq->tasklet_ctx.task);
 	mlx5_buf_free(dev, &eq->buf);
 
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6892746fd10d..aa98d0234bd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -660,6 +660,23 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 }
 EXPORT_SYMBOL(mlx5_vector2eqn);
 
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_eq *eq;
+
+	spin_lock(&table->lock);
+	list_for_each_entry(eq, &table->comp_eqs_list, list)
+		if (eq->eqn == eqn) {
+			spin_unlock(&table->lock);
+			return eq;
+		}
+
+	spin_unlock(&table->lock);
+
+	return ERR_PTR(-ENOENT);
+}
+
 static void free_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 0b0b226c789e..f0d87046af8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -100,6 +100,8 @@ int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
 int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev);
 cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev);
 u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx);
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn);
+void mlx5_cq_tasklet_cb(unsigned long data);
 
 void mlx5e_init(void);
 void mlx5e_cleanup(void);
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index b2c9fada8eac..2be976dd4966 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -53,6 +53,11 @@ struct mlx5_core_cq {
 	unsigned		arm_sn;
 	struct mlx5_rsc_debug	*dbg;
 	int			pid;
+	struct {
+		struct list_head list;
+		void (*comp)(struct mlx5_core_cq *);
+		void		*priv;
+	} tasklet_ctx;
 };
 
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 369c837d40f5..5a41f9003941 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/radix-tree.h>
+#include <linux/interrupt.h>
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
@@ -304,6 +305,14 @@ struct mlx5_buf {
 	u8			page_shift;
 };
 
+struct mlx5_eq_tasklet {
+	struct list_head list;
+	struct list_head process_list;
+	struct tasklet_struct task;
+	/* lock on completion tasklet list */
+	spinlock_t lock;
+};
+
 struct mlx5_eq {
 	struct mlx5_core_dev   *dev;
 	__be32 __iomem	       *doorbell;
@@ -317,6 +326,7 @@ struct mlx5_eq {
 	struct list_head	list;
 	int			index;
 	struct mlx5_rsc_debug	*dbg;
+	struct mlx5_eq_tasklet	tasklet_ctx;
 };
 
 struct mlx5_core_psv {
-- 
cgit v1.2.3


From dd1a4cc1fbdf516bb38ca31b65c76e720d414d0d Mon Sep 17 00:00:00 2001
From: Radim Krčmář <rkrcmar@redhat.com>
Date: Wed, 4 May 2016 14:09:44 -0500
Subject: KVM: split kvm_vcpu_wake_up from kvm_vcpu_kick
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AVIC has a use for kvm_vcpu_wake_up.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bbcd921d7cb0..b1fa8f11c95b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -657,6 +657,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7e22998a1ceb..dd4ac9d9e8f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2076,13 +2076,8 @@ out:
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
 #ifndef CONFIG_S390
-/*
- * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
- */
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 {
-	int me;
-	int cpu = vcpu->cpu;
 	struct swait_queue_head *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
@@ -2091,6 +2086,18 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
 
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
+
+/*
+ * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+
+	kvm_vcpu_wake_up(vcpu);
 	me = get_cpu();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
 		if (kvm_arch_vcpu_should_kick(vcpu))
-- 
cgit v1.2.3


From 45c04704e467fffe3525205454d9627325dae308 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 18 May 2016 16:19:01 +0300
Subject: ASoC: twl6040: Disconnect AUX output pads on digital mute

Disconnect also the path to AUXL from the HF path during digital_mute to
avoid pop noise leakage to Line-out pads.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/twl6040.h | 1 +
 sound/soc/codecs/twl6040.c  | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h
index 8f9fc3d26e6d..8e95cd87cd74 100644
--- a/include/linux/mfd/twl6040.h
+++ b/include/linux/mfd/twl6040.h
@@ -134,6 +134,7 @@
 #define TWL6040_HFDACENA		(1 << 0)
 #define TWL6040_HFPGAENA		(1 << 1)
 #define TWL6040_HFDRVENA		(1 << 4)
+#define TWL6040_HFSWENA			(1 << 6)
 
 /* VIBCTLL/R (0x18/0x1A) fields */
 
diff --git a/sound/soc/codecs/twl6040.c b/sound/soc/codecs/twl6040.c
index bc3de2e844e6..d1e3a932cbf3 100644
--- a/sound/soc/codecs/twl6040.c
+++ b/sound/soc/codecs/twl6040.c
@@ -983,9 +983,9 @@ static void twl6040_mute_path(struct snd_soc_codec *codec, enum twl6040_dai_id i
 		if (mute) {
 			/* Power down drivers and DACs */
 			hflctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA |
-				    TWL6040_HFDRVENA);
+				    TWL6040_HFDRVENA | TWL6040_HFSWENA);
 			hfrctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA |
-				    TWL6040_HFDRVENA);
+				    TWL6040_HFDRVENA | TWL6040_HFSWENA);
 		}
 
 		twl6040_reg_write(twl6040, TWL6040_REG_HFLCTL, hflctl);
-- 
cgit v1.2.3


From 0a70bd43053331d99881211e1d09f32de531432f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 24 Feb 2016 14:02:11 -0800
Subject: dax: enable dax in the presence of known media errors (badblocks)

1/ If a mapping overlaps a bad sector fail the request.

2/ Do not opportunistically report more dax-capable capacity than is
   requested when errors present.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
[vishal: fix a conflict with system RAM collision patches]
[vishal: add a 'size' parameter to ->direct_access]
[vishal: fix a conflict with DAX alignment check patches]
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 arch/powerpc/sysdev/axonram.c |  2 +-
 drivers/block/brd.c           |  2 +-
 drivers/nvdimm/pmem.c         | 10 +++++++++-
 drivers/s390/block/dcssblk.c  |  4 ++--
 fs/block_dev.c                | 13 +------------
 include/linux/blkdev.h        |  2 +-
 6 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 0d112b94d91d..ff75d70f7285 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void __pmem **kaddr, pfn_t *pfn)
+		       void __pmem **kaddr, pfn_t *pfn, long size)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
 	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 51a071e32221..c04bd9bc39fd 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void __pmem **kaddr, pfn_t *pfn)
+			void __pmem **kaddr, pfn_t *pfn, long size)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f798899338ed..c447579bd853 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -182,14 +182,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		      void __pmem **kaddr, pfn_t *pfn)
+		      void __pmem **kaddr, pfn_t *pfn, long size)
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 	resource_size_t offset = sector * 512 + pmem->data_offset;
 
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+		return -EIO;
 	*kaddr = pmem->virt_addr + offset;
 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+	/*
+	 * If badblocks are present, limit known good range to the
+	 * requested range.
+	 */
+	if (unlikely(pmem->bb.count))
+		return size;
 	return pmem->size - pmem->pfn_pad - offset;
 }
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 1bce9cf51b1e..6ac33984bc0f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
 						struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-			 void __pmem **kaddr, pfn_t *pfn);
+			 void __pmem **kaddr, pfn_t *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -883,7 +883,7 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-			void __pmem **kaddr, pfn_t *pfn)
+			void __pmem **kaddr, pfn_t *pfn, long size)
 {
 	struct dcssblk_dev_info *dev_info;
 	unsigned long offset, dev_sz;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8477d4501b1e..45839b27972c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,7 +29,6 @@
 #include <linux/log2.h>
 #include <linux/cleancache.h>
 #include <linux/dax.h>
-#include <linux/badblocks.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -501,7 +500,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 	sector += get_start_sect(bdev);
 	if (sector % (PAGE_SIZE / 512))
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
 	if (!avail)
 		return -ERANGE;
 	if (avail > 0 && avail & ~PAGE_MASK)
@@ -561,7 +560,6 @@ EXPORT_SYMBOL_GPL(bdev_dax_supported);
  */
 bool bdev_dax_capable(struct block_device *bdev)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	struct blk_dax_ctl dax = {
 		.size = PAGE_SIZE,
 	};
@@ -577,15 +575,6 @@ bool bdev_dax_capable(struct block_device *bdev)
 	if (bdev_direct_access(bdev, &dax) < 0)
 		return false;
 
-	/*
-	 * If the device has known bad blocks, force all I/O through the
-	 * driver / page cache.
-	 *
-	 * TODO: support finer grained dax error handling
-	 */
-	if (disk->bb && disk->bb->count)
-		return false;
-
 	return true;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 27cbefe8c985..cf7c13c2c38d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1668,7 +1668,7 @@ struct block_device_operations {
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-			pfn_t *);
+			pfn_t *, long);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
-- 
cgit v1.2.3


From 3dc29161070ab14d065554c0ad58988ab77a7bfd Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Tue, 15 Mar 2016 11:20:41 -0600
Subject: dax: use sb_issue_zerout instead of calling dax_clear_sectors

dax_clear_sectors() cannot handle poisoned blocks.  These must be
zeroed using the BIO interface instead.  Convert ext2 and XFS to use
only sb_issue_zerout().

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
[vishal: Also remove the dax_clear_sectors function entirely]
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/dax.c               | 32 --------------------------------
 fs/ext2/inode.c        |  8 ++++----
 fs/xfs/xfs_bmap_util.c | 15 ++++-----------
 include/linux/dax.h    |  1 -
 4 files changed, 8 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index d602410d8e52..0abbbb62981e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -87,38 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 	return page;
 }
 
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
-	struct blk_dax_ctl dax = {
-		.sector = _sector,
-		.size = _size,
-	};
-
-	might_sleep();
-	do {
-		long count, sz;
-
-		count = dax_map_atomic(bdev, &dax);
-		if (count < 0)
-			return count;
-		sz = min_t(long, count, SZ_128K);
-		clear_pmem(dax.addr, sz);
-		dax.size -= sz;
-		dax.sector += sz / 512;
-		dax_unmap_atomic(bdev, &dax);
-		cond_resched();
-	} while (dax.size);
-
-	wmb_pmem();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
 static bool buffer_written(struct buffer_head *bh)
 {
 	return buffer_mapped(bh) && !buffer_unwritten(bh);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9a14af3b1a69..17cbd6b696f2 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/dax.h>
+#include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
@@ -737,10 +738,9 @@ static int ext2_get_blocks(struct inode *inode,
 		 * so that it's not found by another thread before it's
 		 * initialised
 		 */
-		err = dax_clear_sectors(inode->i_sb->s_bdev,
-				le32_to_cpu(chain[depth-1].key) <<
-				(inode->i_blkbits - 9),
-				count << inode->i_blkbits);
+		err = sb_issue_zeroout(inode->i_sb,
+				le32_to_cpu(chain[depth-1].key), count,
+				GFP_NOFS);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3b6309865c65..930ac6a17ce3 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -72,18 +72,11 @@ xfs_zero_extent(
 	struct xfs_mount *mp = ip->i_mount;
 	xfs_daddr_t	sector = xfs_fsb_to_db(ip, start_fsb);
 	sector_t	block = XFS_BB_TO_FSBT(mp, sector);
-	ssize_t		size = XFS_FSB_TO_B(mp, count_fsb);
-
-	if (IS_DAX(VFS_I(ip)))
-		return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
-				sector, size);
-
-	/*
-	 * let the block layer decide on the fastest method of
-	 * implementing the zeroing.
-	 */
-	return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
 
+	return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+		block << (mp->m_super->s_blocksize_bits - 9),
+		count_fsb << (mp->m_super->s_blocksize_bits - 9),
+		GFP_NOFS, true);
 }
 
 /*
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7c45ac7ea1d1..7f853ffaa987 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,7 +7,6 @@
 
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 		  get_block_t, dio_iodone_t, int flags);
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
-- 
cgit v1.2.3


From 679c8bd3b29428e736eabb7fc66a978f312f0c86 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 May 2016 10:47:04 +0200
Subject: dax: export a low-level __dax_zero_page_range helper

This allows XFS to perform zeroing using the iomap infrastructure and
avoid buffer heads.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
[vishal: fix conflicts with dax-error-handling]
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 fs/dax.c            | 35 ++++++++++++++++++++---------------
 include/linux/dax.h |  7 +++++++
 2 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 0abbbb62981e..651d4b18ac29 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -947,6 +947,23 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+		unsigned int offset, unsigned int length)
+{
+	struct blk_dax_ctl dax = {
+		.sector		= sector,
+		.size		= PAGE_SIZE,
+	};
+
+	if (dax_map_atomic(bdev, &dax) < 0)
+		return PTR_ERR(dax.addr);
+	clear_pmem(dax.addr + offset, length);
+	wmb_pmem();
+	dax_unmap_atomic(bdev, &dax);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
 /**
  * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
@@ -982,23 +999,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 	err = get_block(inode, index, &bh, 0);
-	if (err < 0)
+	if (err < 0 || !buffer_written(&bh))
 		return err;
-	if (buffer_written(&bh)) {
-		struct block_device *bdev = bh.b_bdev;
-		struct blk_dax_ctl dax = {
-			.sector = to_sector(&bh, inode),
-			.size = PAGE_SIZE,
-		};
 
-		if (dax_map_atomic(bdev, &dax) < 0)
-			return PTR_ERR(dax.addr);
-		clear_pmem(dax.addr + offset, length);
-		wmb_pmem();
-		dax_unmap_atomic(bdev, &dax);
-	}
-
-	return 0;
+	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+			offset, length);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7f853ffaa987..90fbc99e5313 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,12 +14,19 @@ int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+		unsigned int offset, unsigned int length);
 #else
 static inline struct page *read_dax_sector(struct block_device *bdev,
 		sector_t n)
 {
 	return ERR_PTR(-ENXIO);
 }
+static inline int __dax_zero_page_range(struct block_device *bdev,
+		sector_t sector, unsigned int offset, unsigned int length)
+{
+	return -ENXIO;
+}
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-- 
cgit v1.2.3


From 348e967ab07c96a9e7a6a194812254a8df2045c0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:15 +0200
Subject: dax: Make huge page handling depend of CONFIG_BROKEN

Currently the handling of huge pages for DAX is racy. For example the
following can happen:

CPU0 (THP write fault)			CPU1 (normal read fault)

__dax_pmd_fault()			__dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
					get_block(inode, block, &bh, 0)
					  -> not mapped
  if (!buffer_mapped(&bh) && write)
    get_block(inode, block, &bh, 1) -> allocates blocks
  truncate_pagecache_range(inode, lstart, lend);
					dax_load_hole();

This results in data corruption since process on CPU1 won't see changes
into the file done by CPU0.

The race can happen even if two normal faults race however with THP the
situation is even worse because the two faults don't operate on the same
entries in the radix tree and we want to use these entries for
serialization. So make THP support in DAX code depend on CONFIG_BROKEN
for now.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/Kconfig          | 1 +
 fs/dax.c            | 2 +-
 include/linux/dax.h | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59c18e6..b8fcb416be72 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -52,6 +52,7 @@ config FS_DAX_PMD
 	depends on FS_DAX
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
+	depends on BROKEN
 
 endif # BLOCK
 
diff --git a/fs/dax.c b/fs/dax.c
index bdad05213e4b..0433a2b5e484 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -675,7 +675,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 /*
  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
  * more often than one might expect in the below function.
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 90fbc99e5313..72dc81de3ddb 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -29,7 +29,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 				unsigned int flags, get_block_t);
 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-- 
cgit v1.2.3


From e804315dd0f574b56155c5a2406ab5e0318104f7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:16 +0200
Subject: dax: Define DAX lock bit for radix tree exceptional entry

We will use lowest available bit in the radix tree exceptional entry for
locking of the entry. Define it. Also clean up definitions of DAX entry
type bits in DAX exceptional entries to use defined constants instead of
hardcoding numbers and cleanup checking of these bits to not rely on how
other bits in the entry are set.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 17 +++++++++++------
 include/linux/dax.h |  3 +++
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 0433a2b5e484..351afd3cf8be 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,14 +32,19 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+/*
+ * We use lowest available bit in exceptional entry for locking, other two
+ * bits to determine entry type. In total 3 special bits.
+ */
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
 #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+		RADIX_TREE_EXCEPTIONAL_ENTRY))
 
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 72dc81de3ddb..70600b63083f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -5,6 +5,9 @@
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 
+/* We use lowest available exceptional entry bit for locking */
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-- 
cgit v1.2.3


From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:17 +0200
Subject: dax: Allow DAX code to replace exceptional entries

Currently we forbid page_cache_tree_insert() to replace exceptional radix
tree entries for DAX inodes. However to make DAX faults race free we will
lock radix tree entries and when hole is created, we need to replace
such locked radix tree entry with a hole page. So modify
page_cache_tree_insert() to allow that.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 include/linux/dax.h |  1 +
 mm/filemap.c        | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 70600b63083f..aa148937bb3f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
 /* We use lowest available exceptional entry bit for locking */
diff --git a/mm/filemap.c b/mm/filemap.c
index f2479af09da9..dfe55c2cfb34 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -597,14 +597,21 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
 
-		if (WARN_ON(dax_mapping(mapping)))
-			return -EINVAL;
-
-		if (shadowp)
-			*shadowp = p;
 		mapping->nrexceptional--;
-		if (node)
-			workingset_node_shadows_dec(node);
+		if (!dax_mapping(mapping)) {
+			if (shadowp)
+				*shadowp = p;
+			if (node)
+				workingset_node_shadows_dec(node);
+		} else {
+			/* DAX can replace empty locked entry with a hole */
+			WARN_ON_ONCE(p !=
+				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+					 RADIX_DAX_ENTRY_LOCK));
+			/* DAX accounts exceptional entries as normal pages */
+			if (node)
+				workingset_node_pages_dec(node);
+		}
 	}
 	radix_tree_replace_slot(slot, page);
 	mapping->nrpages++;
-- 
cgit v1.2.3


From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:18 +0200
Subject: dax: New fault locking

Currently DAX page fault locking is racy.

CPU0 (write fault)		CPU1 (read fault)

__dax_fault()			__dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
				  get_block(inode, block, &bh, 0)
				    -> not mapped
  if (!buffer_mapped(&bh))
    if (vmf->flags & FAULT_FLAG_WRITE)
      get_block(inode, block, &bh, 1) -> allocates blocks
  if (page) -> no
				  if (!buffer_mapped(&bh))
				    if (vmf->flags & FAULT_FLAG_WRITE) {
				    } else {
				      dax_load_hole();
				    }
  dax_insert_mapping()

And we are in a situation where we fail in dax_radix_entry() with -EIO.

Another problem with the current DAX page fault locking is that there is
no race-free way to clear dirty tag in the radix tree. We can always
end up with clean radix tree and dirty data in CPU cache.

We fix the first problem by introducing locking of exceptional radix
tree entries in DAX mappings acting very similarly to page lock and thus
synchronizing properly faults against the same mapping index. The same
lock can later be used to avoid races when clearing radix tree dirty
tag.

Reviewed-by: NeilBrown <neilb@suse.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 553 ++++++++++++++++++++++++++++++++++++++--------------
 include/linux/dax.h |   3 +
 mm/filemap.c        |   9 +-
 mm/truncate.c       |  62 +++---
 4 files changed, 447 insertions(+), 180 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 351afd3cf8be..f43c3d806fb6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -46,6 +46,30 @@
 		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
 		RADIX_TREE_EXCEPTIONAL_ENTRY))
 
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+	int i;
+
+	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+		init_waitqueue_head(wait_table + i);
+	return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+					      pgoff_t index)
+{
+	unsigned long hash = hash_long((unsigned long)mapping ^ index,
+				       DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
@@ -267,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
+/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+	struct address_space *mapping;
+	unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+	wait_queue_t wait;
+	struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+				       int sync, void *keyp)
+{
+	struct exceptional_entry_key *key = keyp;
+	struct wait_exceptional_entry_queue *ewait =
+		container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+	if (key->mapping != ewait->key.mapping ||
+	    key->index != ewait->key.index)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+	return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+	entry |= RADIX_DAX_ENTRY_LOCK;
+	radix_tree_replace_slot(slot, (void *)entry);
+	return (void *)entry;
+}
+
+/*
+ * Mark the given slot is unlocked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+	radix_tree_replace_slot(slot, (void *)entry);
+	return (void *)entry;
+}
+
+/*
+ * Lookup entry in radix tree, wait for it to become unlocked if it is
+ * exceptional entry and return it. The caller must call
+ * put_unlocked_mapping_entry() when he decided not to lock the entry or
+ * put_locked_mapping_entry() when he locked the entry and now wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+					pgoff_t index, void ***slotp)
+{
+	void *ret, **slot;
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+	ewait.key.mapping = mapping;
+	ewait.key.index = index;
+
+	for (;;) {
+		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+					  &slot);
+		if (!ret || !radix_tree_exceptional_entry(ret) ||
+		    !slot_locked(mapping, slot)) {
+			if (slotp)
+				*slotp = slot;
+			return ret;
+		}
+		prepare_to_wait_exclusive(wq, &ewait.wait,
+					  TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mapping->tree_lock);
+		schedule();
+		finish_wait(wq, &ewait.wait);
+		spin_lock_irq(&mapping->tree_lock);
+	}
+}
+
+/*
+ * Find radix tree entry at given index. If it points to a page, return with
+ * the page locked. If it points to the exceptional entry, return with the
+ * radix tree entry locked. If the radix tree doesn't contain given index,
+ * create empty exceptional entry for the index and return with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *ret, **slot;
+
+restart:
+	spin_lock_irq(&mapping->tree_lock);
+	ret = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* No entry for given index? Make sure radix tree is big enough. */
+	if (!ret) {
+		int err;
+
+		spin_unlock_irq(&mapping->tree_lock);
+		err = radix_tree_preload(
+				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+		if (err)
+			return ERR_PTR(err);
+		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+			       RADIX_DAX_ENTRY_LOCK);
+		spin_lock_irq(&mapping->tree_lock);
+		err = radix_tree_insert(&mapping->page_tree, index, ret);
+		radix_tree_preload_end();
+		if (err) {
+			spin_unlock_irq(&mapping->tree_lock);
+			/* Someone already created the entry? */
+			if (err == -EEXIST)
+				goto restart;
+			return ERR_PTR(err);
+		}
+		/* Good, we have inserted empty locked entry into the tree. */
+		mapping->nrexceptional++;
+		spin_unlock_irq(&mapping->tree_lock);
+		return ret;
+	}
+	/* Normal page in radix tree? */
+	if (!radix_tree_exceptional_entry(ret)) {
+		struct page *page = ret;
+
+		get_page(page);
+		spin_unlock_irq(&mapping->tree_lock);
+		lock_page(page);
+		/* Page got truncated? Retry... */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			put_page(page);
+			goto restart;
+		}
+		return page;
+	}
+	ret = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+				   pgoff_t index, bool wake_all)
+{
+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+	/*
+	 * Checking for locked entry and prepare_to_wait_exclusive() happens
+	 * under mapping->tree_lock, ditto for entry handling in our callers.
+	 * So at this point all tasks that could have seen our entry locked
+	 * must be in the waitqueue and the following check will see them.
+	 */
+	if (waitqueue_active(wq)) {
+		struct exceptional_entry_key key;
+
+		key.mapping = mapping;
+		key.index = index;
+		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+	}
+}
+
+static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *ret, **slot;
+
+	spin_lock_irq(&mapping->tree_lock);
+	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+			 !slot_locked(mapping, slot))) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return;
+	}
+	unlock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+				     pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry)) {
+		unlock_page(entry);
+		put_page(entry);
+	} else {
+		unlock_mapping_entry(mapping, index);
+	}
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry))
+		return;
+
+	/* We have to wake up next waiter for the radix tree entry lock */
+	dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *entry;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	/*
+	 * This gets called from truncate / punch_hole path. As such, the caller
+	 * must hold locks protecting against concurrent modifications of the
+	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
+	 * caller has seen exceptional entry for this index, we better find it
+	 * at that index as well...
+	 */
+	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+	radix_tree_delete(&mapping->page_tree, index);
+	mapping->nrexceptional--;
+	spin_unlock_irq(&mapping->tree_lock);
+	dax_wake_mapping_entry_waiter(mapping, index, true);
+
+	return 1;
+}
+
 /*
  * The user has performed a load from a hole in the file.  Allocating
  * a new page in the file would cause excessive storage usage for
@@ -275,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-							struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+			 struct vm_fault *vmf)
 {
-	if (!page)
-		page = find_or_create_page(mapping, vmf->pgoff,
-						GFP_KERNEL | __GFP_ZERO);
-	if (!page)
-		return VM_FAULT_OOM;
+	struct page *page;
+
+	/* Hole page already exists? Return it...  */
+	if (!radix_tree_exceptional_entry(entry)) {
+		vmf->page = entry;
+		return VM_FAULT_LOCKED;
+	}
 
+	/* This will replace locked radix tree entry with a hole page */
+	page = find_or_create_page(mapping, vmf->pgoff,
+				   vmf->gfp_mask | __GFP_ZERO);
+	if (!page) {
+		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+		return VM_FAULT_OOM;
+	}
 	vmf->page = page;
 	return VM_FAULT_LOCKED;
 }
@@ -307,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
-#define NO_SECTOR -1
 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-		sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+				      struct vm_fault *vmf,
+				      void *entry, sector_t sector)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
-	int type, error = 0;
-	void *entry;
+	int error = 0;
+	bool hole_fill = false;
+	void *new_entry;
+	pgoff_t index = vmf->pgoff;
 
-	WARN_ON_ONCE(pmd_entry && !dirty);
-	if (dirty)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	spin_lock_irq(&mapping->tree_lock);
-
-	entry = radix_tree_lookup(page_tree, pmd_index);
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-		index = pmd_index;
-		goto dirty;
+	/* Replacing hole page with block mapping? */
+	if (!radix_tree_exceptional_entry(entry)) {
+		hole_fill = true;
+		/*
+		 * Unmap the page now before we remove it from page cache below.
+		 * The page is locked so it cannot be faulted in again.
+		 */
+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+				    PAGE_SIZE, 0);
+		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+		if (error)
+			return ERR_PTR(error);
 	}
 
-	entry = radix_tree_lookup(page_tree, index);
-	if (entry) {
-		type = RADIX_DAX_TYPE(entry);
-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-					type != RADIX_DAX_PMD)) {
-			error = -EIO;
+	spin_lock_irq(&mapping->tree_lock);
+	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+		       RADIX_DAX_ENTRY_LOCK);
+	if (hole_fill) {
+		__delete_from_page_cache(entry, NULL);
+		/* Drop pagecache reference */
+		put_page(entry);
+		error = radix_tree_insert(page_tree, index, new_entry);
+		if (error) {
+			new_entry = ERR_PTR(error);
 			goto unlock;
 		}
+		mapping->nrexceptional++;
+	} else {
+		void **slot;
+		void *ret;
 
-		if (!pmd_entry || type == RADIX_DAX_PMD)
-			goto dirty;
-
-		/*
-		 * We only insert dirty PMD entries into the radix tree.  This
-		 * means we don't need to worry about removing a dirty PTE
-		 * entry and inserting a clean PMD entry, thus reducing the
-		 * range we would flush with a follow-up fsync/msync call.
-		 */
-		radix_tree_delete(&mapping->page_tree, index);
-		mapping->nrexceptional--;
-	}
-
-	if (sector == NO_SECTOR) {
-		/*
-		 * This can happen during correct operation if our pfn_mkwrite
-		 * fault raced against a hole punch operation.  If this
-		 * happens the pte that was hole punched will have been
-		 * unmapped and the radix tree entry will have been removed by
-		 * the time we are called, but the call will still happen.  We
-		 * will return all the way up to wp_pfn_shared(), where the
-		 * pte_same() check will fail, eventually causing page fault
-		 * to be retried by the CPU.
-		 */
-		goto unlock;
+		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+		WARN_ON_ONCE(ret != entry);
+		radix_tree_replace_slot(slot, new_entry);
 	}
-
-	error = radix_tree_insert(page_tree, index,
-			RADIX_DAX_ENTRY(sector, pmd_entry));
-	if (error)
-		goto unlock;
-
-	mapping->nrexceptional++;
- dirty:
-	if (dirty)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  unlock:
 	spin_unlock_irq(&mapping->tree_lock);
-	return error;
+	if (hole_fill) {
+		radix_tree_preload_end();
+		/*
+		 * We don't need hole page anymore, it has been replaced with
+		 * locked radix tree entry now.
+		 */
+		if (mapping->a_ops->freepage)
+			mapping->a_ops->freepage(entry);
+		unlock_page(entry);
+		put_page(entry);
+	}
+	return new_entry;
 }
 
 static int dax_writeback_one(struct block_device *bdev,
@@ -503,17 +788,19 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+			struct buffer_head *bh, void **entryp,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct address_space *mapping = inode->i_mapping;
 	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
+		.sector = to_sector(bh, mapping->host),
 		.size = bh->b_size,
 	};
 	int error;
+	void *ret;
+	void *entry = *entryp;
 
 	i_mmap_lock_read(mapping);
 
@@ -523,16 +810,16 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	if (IS_ERR(ret)) {
+		error = PTR_ERR(ret);
 		goto out;
+	}
+	*entryp = ret;
 
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
  out:
 	i_mmap_unlock_read(mapping);
-
 	return error;
 }
 
@@ -552,7 +839,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	struct page *page;
+	void *entry;
 	struct buffer_head bh;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	unsigned blkbits = inode->i_blkbits;
@@ -561,6 +848,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	int error;
 	int major = 0;
 
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -570,40 +862,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
- repeat:
-	page = find_get_page(mapping, vmf->pgoff);
-	if (page) {
-		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-			put_page(page);
-			return VM_FAULT_RETRY;
-		}
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto repeat;
-		}
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock_page;
-
-	if (!buffer_mapped(&bh) && !vmf->cow_page) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_SIZE))
-				error = -EIO;
-			if (error)
-				goto unlock_page;
-		} else {
-			return dax_load_hole(mapping, page, vmf);
-		}
-	}
+		goto unlock_entry;
 
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
@@ -612,30 +881,37 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock_page;
-		vmf->page = page;
-		if (!page)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+		} else {
+			unlock_mapping_entry(mapping, vmf->pgoff);
 			i_mmap_lock_read(mapping);
+			vmf->page = NULL;
+		}
 		return VM_FAULT_LOCKED;
 	}
 
-	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
-		page = find_lock_page(mapping, vmf->pgoff);
-
-	if (page) {
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-							PAGE_SIZE, 0);
-		delete_from_page_cache(page);
-		unlock_page(page);
-		put_page(page);
-		page = NULL;
+	if (!buffer_mapped(&bh)) {
+		if (vmf->flags & FAULT_FLAG_WRITE) {
+			error = get_block(inode, block, &bh, 1);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			if (!error && (bh.b_size < PAGE_SIZE))
+				error = -EIO;
+			if (error)
+				goto unlock_entry;
+		} else {
+			return dax_load_hole(mapping, entry, vmf);
+		}
 	}
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
-
+	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -643,13 +919,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if ((error < 0) && (error != -EBUSY))
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
-
- unlock_page:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
 
@@ -718,7 +987,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
 	sector_t block;
-	int error, result = 0;
+	int result = 0;
 	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
@@ -865,13 +1134,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		 * the write to insert a dirty entry.
 		 */
 		if (write) {
-			error = dax_radix_entry(mapping, pgoff, dax.sector,
-					true, true);
-			if (error) {
-				dax_pmd_dbg(&bh, address,
-						"PMD radix insertion failed");
-				goto fallback;
-			}
+			/*
+			 * We should insert radix-tree entry and dirty it here.
+			 * For now this is broken...
+			 */
 		}
 
 		dev_dbg(part_to_dev(bdev->bd_part),
@@ -931,23 +1197,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
-	int error;
-
-	/*
-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
-	 * previous call to __dax_fault().  We just want to look up that PTE
-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
-	 * saves us from having to make a call to get_block() here to look
-	 * up the sector.
-	 */
-	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
-			true);
+	struct address_space *mapping = file->f_mapping;
+	void *entry;
+	pgoff_t index = vmf->pgoff;
 
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (error)
-		return VM_FAULT_SIGBUS;
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	put_unlocked_mapping_entry(mapping, index, entry);
+out:
+	spin_unlock_irq(&mapping->tree_lock);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index aa148937bb3f..756625c6d0dd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+				   pgoff_t index, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
diff --git a/mm/filemap.c b/mm/filemap.c
index dfe55c2cfb34..7b9a4b180cae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
 			return;
 
 	/*
-	 * Track node that only contains shadow entries.
+	 * Track node that only contains shadow entries. DAX mappings contain
+	 * no shadow entries and may contain other exceptional entries so skip
+	 * those.
 	 *
 	 * Avoid acquiring the list_lru lock if already tracked.  The
 	 * list_empty() test is safe as node->private_list is
 	 * protected by mapping->tree_lock.
 	 */
-	if (!workingset_node_pages(node) &&
+	if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
 	    list_empty(&node->private_list)) {
 		node->private_data = mapping;
 		list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
 			/* DAX accounts exceptional entries as normal pages */
 			if (node)
 				workingset_node_pages_dec(node);
+			/* Wakeup waiters for exceptional entry lock */
+			dax_wake_mapping_entry_waiter(mapping, page->index,
+						      false);
 		}
 	}
 	radix_tree_replace_slot(slot, page);
diff --git a/mm/truncate.c b/mm/truncate.c
index b00272810871..4064f8f53daa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	if (shmem_mapping(mapping))
 		return;
 
-	spin_lock_irq(&mapping->tree_lock);
-
 	if (dax_mapping(mapping)) {
-		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
-			mapping->nrexceptional--;
-	} else {
-		/*
-		 * Regular page slots are stabilized by the page lock even
-		 * without the tree itself locked.  These unlocked entries
-		 * need verification under the tree lock.
-		 */
-		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
-					&slot))
-			goto unlock;
-		if (*slot != entry)
-			goto unlock;
-		radix_tree_replace_slot(slot, NULL);
-		mapping->nrexceptional--;
-		if (!node)
-			goto unlock;
-		workingset_node_shadows_dec(node);
-		/*
-		 * Don't track node without shadow entries.
-		 *
-		 * Avoid acquiring the list_lru lock if already untracked.
-		 * The list_empty() test is safe as node->private_list is
-		 * protected by mapping->tree_lock.
-		 */
-		if (!workingset_node_shadows(node) &&
-		    !list_empty(&node->private_list))
-			list_lru_del(&workingset_shadow_nodes,
-					&node->private_list);
-		__radix_tree_delete_node(&mapping->page_tree, node);
+		dax_delete_mapping_entry(mapping, index);
+		return;
 	}
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+				&slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+	radix_tree_replace_slot(slot, NULL);
+	mapping->nrexceptional--;
+	if (!node)
+		goto unlock;
+	workingset_node_shadows_dec(node);
+	/*
+	 * Don't track node without shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already untracked.
+	 * The list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_shadows(node) &&
+	    !list_empty(&node->private_list))
+		list_lru_del(&workingset_shadow_nodes,
+				&node->private_list);
+	__radix_tree_delete_node(&mapping->page_tree, node);
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
-- 
cgit v1.2.3


From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:19 +0200
Subject: dax: Use radix tree entry lock to protect cow faults

When doing cow faults, we cannot directly fill in PTE as we do for other
faults as we rely on generic code to do proper accounting of the cowed page.
We also have no page to lock to protect against races with truncate as
other faults have and we need the protection to extend until the moment
generic code inserts cowed page into PTE thus at that point we have no
protection of fs-specific i_mmap_sem. So far we relied on using
i_mmap_lock for the protection however that is completely special to cow
faults. To make fault locking more uniform use DAX entry lock instead.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 12 +++++-------
 include/linux/dax.h |  7 +++++++
 include/linux/mm.h  |  7 +++++++
 mm/memory.c         | 38 ++++++++++++++++++--------------------
 4 files changed, 37 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index f43c3d806fb6..be74635e05a6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -478,7 +478,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	}
 }
 
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
 	void *ret, **slot;
 
@@ -501,7 +501,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 		unlock_page(entry);
 		put_page(entry);
 	} else {
-		unlock_mapping_entry(mapping, index);
+		dax_unlock_mapping_entry(mapping, index);
 	}
 }
 
@@ -884,12 +884,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			goto unlock_entry;
 		if (!radix_tree_exceptional_entry(entry)) {
 			vmf->page = entry;
-		} else {
-			unlock_mapping_entry(mapping, vmf->pgoff);
-			i_mmap_lock_read(mapping);
-			vmf->page = NULL;
+			return VM_FAULT_LOCKED;
 		}
-		return VM_FAULT_LOCKED;
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
 	}
 
 	if (!buffer_mapped(&bh)) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 756625c6d0dd..7bf12277c006 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -21,6 +21,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 		unsigned int offset, unsigned int length);
 #else
@@ -29,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
 {
 	return ERR_PTR(-ENXIO);
 }
+/* Shouldn't ever be called when dax is disabled. */
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+					    pgoff_t index)
+{
+	BUG();
+}
 static inline int __dax_zero_page_range(struct block_device *bdev,
 		sector_t sector, unsigned int offset, unsigned int length)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a55e5be0894f..0ef9dc720ec3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -299,6 +299,12 @@ struct vm_fault {
 					 * is set (which is also implied by
 					 * VM_FAULT_ERROR).
 					 */
+	void *entry;			/* ->fault handler can alternatively
+					 * return locked DAX entry. In that
+					 * case handler should return
+					 * VM_FAULT_DAX_LOCKED and fill in
+					 * entry here.
+					 */
 	/* for ->map_pages() only */
 	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
 					 * max_pgoff inclusive */
@@ -1084,6 +1090,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
+#define VM_FAULT_DAX_LOCKED 0x1000	/* ->fault has locked DAX entry */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
diff --git a/mm/memory.c b/mm/memory.c
index 93897f23cc11..f09cdb8d48fa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -63,6 +63,7 @@
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2785,7 +2786,8 @@ oom:
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 			pgoff_t pgoff, unsigned int flags,
-			struct page *cow_page, struct page **page)
+			struct page *cow_page, struct page **page,
+			void **entry)
 {
 	struct vm_fault vmf;
 	int ret;
@@ -2800,8 +2802,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
-	if (!vmf.page)
-		goto out;
+	if (ret & VM_FAULT_DAX_LOCKED) {
+		*entry = vmf.entry;
+		return ret;
+	}
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
@@ -2815,7 +2819,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
- out:
 	*page = vmf.page;
 	return ret;
 }
@@ -2987,7 +2990,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
@@ -3010,6 +3013,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	struct page *fault_page, *new_page;
+	void *fault_entry;
 	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pte_t *pte;
@@ -3027,26 +3031,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
+			 &fault_entry);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
 
-	if (fault_page)
+	if (!(ret & VM_FAULT_DAX_LOCKED))
 		copy_user_highpage(new_page, fault_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, orig_pte))) {
 		pte_unmap_unlock(pte, ptl);
-		if (fault_page) {
+		if (!(ret & VM_FAULT_DAX_LOCKED)) {
 			unlock_page(fault_page);
 			put_page(fault_page);
 		} else {
-			/*
-			 * The fault handler has no page to lock, so it holds
-			 * i_mmap_lock for read to protect against truncate.
-			 */
-			i_mmap_unlock_read(vma->vm_file->f_mapping);
+			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
+						 pgoff);
 		}
 		goto uncharge_out;
 	}
@@ -3054,15 +3056,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	mem_cgroup_commit_charge(new_page, memcg, false, false);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
-	if (fault_page) {
+	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 	} else {
-		/*
-		 * The fault handler has no page to lock, so it holds
-		 * i_mmap_lock for read to protect against truncate.
-		 */
-		i_mmap_unlock_read(vma->vm_file->f_mapping);
+		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
 	}
 	return ret;
 uncharge_out:
@@ -3082,7 +3080,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	int dirtied = 0;
 	int ret, tmp;
 
-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-- 
cgit v1.2.3


From 35e481761cdc688dbee0ef552a13f49af8eba6cc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 19 May 2016 17:08:59 -0700
Subject: fsnotify: avoid spurious EMFILE errors from inotify_init()

Inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance).  However marks are
freed only after SRCU period ends which can take some time and thus if
user rapidly creates and frees inotify instances, number of existing
inotify instances can exceed max_user_instances limit although from user
point of view there is always at most one existing instance.  Thus
inotify_init() returns EMFILE error which is hard to justify from user
point of view.  This problem is exposed by LTP inotify06 testcase on
some machines.

We fix the problem by making sure all group marks are properly freed
while destroying inotify instance.  We wait for SRCU period to end in
that path anyway since we have to make sure there is no event being
added to the instance while we are tearing down the instance.  So it
takes only some plumbing to allow for marks to be destroyed in that path
as well and not from a dedicated work item.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fsnotify.h             |  7 ++++
 fs/notify/group.c                | 17 ++++++---
 fs/notify/mark.c                 | 78 +++++++++++++++++++++++++++++++---------
 include/linux/fsnotify_backend.h |  2 --
 4 files changed, 81 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a857e7..0a3bc2cf192c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
 			       &mnt->mnt_root->d_lock);
 }
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62cb2854..3e2dd85be5dd 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_destroy_group(struct fsnotify_group *group)
 {
-	/* clear all inode marks for this group */
-	fsnotify_clear_marks_by_group(group);
+	/* clear all inode marks for this group, attach them to destroy_list */
+	fsnotify_detach_group_marks(group);
 
-	synchronize_srcu(&fsnotify_mark_srcu);
+	/*
+	 * Wait for fsnotify_mark_srcu period to end and free all marks in
+	 * destroy_list
+	 */
+	fsnotify_mark_destroy_list();
 
-	/* clear the notification queue of all events */
+	/*
+	 * Since we have waited for fsnotify_mark_srcu in
+	 * fsnotify_mark_destroy_list() there can be no outstanding event
+	 * notification against this group. So clearing the notification queue
+	 * of all events is reliable now.
+	 */
 	fsnotify_flush_notify(group);
 
 	/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d7d373..d3fea0bd89e2 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
 static DEFINE_SPINLOCK(destroy_lock);
 static LIST_HEAD(destroy_list);
 
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 }
 
 /*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
  */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_group *group = mark->group;
 
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
 	/* something else already called this function on this mark */
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
 		spin_unlock(&mark->lock);
-		return;
+		return false;
 	}
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	spin_unlock(&mark->lock);
 
-	spin_lock(&destroy_lock);
-	list_add(&mark->g_list, &destroy_list);
-	spin_unlock(&destroy_lock);
-	queue_delayed_work(system_unbound_wq, &reaper_work,
-				FSNOTIFY_REAPER_DELAY);
-
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
 	 * callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
 	 */
 	if (group->ops->freeing_mark)
 		group->ops->freeing_mark(mark, group);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+
+	return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+	if (__fsnotify_free_mark(mark)) {
+		queue_delayed_work(system_unbound_wq, &reaper_work,
+				   FSNOTIFY_REAPER_DELAY);
+	}
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 }
 
 /*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
  */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
 {
-	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+	struct fsnotify_mark *mark;
+
+	while (1) {
+		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+		if (list_empty(&group->marks_list)) {
+			mutex_unlock(&group->mark_mutex);
+			break;
+		}
+		mark = list_first_entry(&group->marks_list,
+					struct fsnotify_mark, g_list);
+		fsnotify_get_mark(mark);
+		fsnotify_detach_mark(mark);
+		mutex_unlock(&group->mark_mutex);
+		__fsnotify_free_mark(mark);
+		fsnotify_put_mark(mark);
+	}
 }
 
 void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	mark->free_mark = free_mark;
 }
 
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
 {
 	struct fsnotify_mark *mark, *next;
 	struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
 		fsnotify_put_mark(mark);
 	}
 }
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+	fsnotify_mark_destroy_list();
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53d9296..29f917517299 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
 extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_unmount_inodes(struct super_block *sb);
-- 
cgit v1.2.3


From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:02 -0700
Subject: time: add missing implementation for timespec64_add_safe()

timespec64_add_safe() has been defined in time64.h for 64 bit systems.
But, 32 bit systems only have an extern function prototype defined.
Provide a definition for the above function.

The function will be necessary as part of y2038 changes.  struct
timespec is not y2038 safe.  All references to timespec will be replaced
by struct timespec64.  The function is meant to be a replacement for
timespec_add_safe().

The implementation is similar to timespec_add_safe().

Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h |  4 +---
 kernel/time/time.c     | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 367d5af899e8..1778937221bf 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n
 
 /*
  * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
+ * overflow. It will return TIME64_MAX in case of overflow.
  */
 extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 					 const struct timespec64 rhs);
 
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b612066..cb1f83eb5599 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 
 	return res;
 }
+
+#if __BITS_PER_LONG != 64
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+				const struct timespec64 rhs)
+{
+	struct timespec64 res;
+
+	set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+			lhs.tv_nsec + rhs.tv_nsec);
+
+	if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+		res.tv_sec = TIME64_MAX;
+		res.tv_nsec = 0;
+	}
+
+	return res;
+}
+
+#endif
-- 
cgit v1.2.3


From 766b9f928bd5b9b185d986d40355d1f143484136 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:05 -0700
Subject: fs: poll/select/recvmmsg: use timespec64 for timeout events

struct timespec is not y2038 safe.  Even though timespec might be
sufficient to represent timeouts, use struct timespec64 here as the plan
is to get rid of all timespec reference in the kernel.

The patch transitions the common functions: poll_select_set_timeout()
and select_estimate_accuracy() to use timespec64.  And, all the syscalls
that use these functions are transitioned in the same patch.

The restart block parameters for poll uses monotonic time.  Use
timespec64 here as well to assign timeout value.  This parameter in the
restart block need not change because this only holds the monotonic
timestamp at which timeout should occur.  And, unsigned long data type
should be big enough for this timestamp.

The system call interfaces will be handled in a separate series.

Compat interfaces need not change as timespec64 is an alias to struct
timespec on a 64 bit system.

Link: http://lkml.kernel.org/r/1461947989-21926-3-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c       | 12 +++++-----
 fs/select.c          | 67 +++++++++++++++++++++++++++++-----------------------
 include/linux/poll.h | 11 +++++----
 net/socket.c         |  8 ++++---
 4 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a74a2a52e0f..10db91218933 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
 }
 
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
 {
-	struct timespec now, ts = {
+	struct timespec64 now, ts = {
 		.tv_sec = ms / MSEC_PER_SEC,
 		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
 	};
 
-	ktime_get_ts(&now);
-	return timespec_add_safe(now, ts);
+	ktime_get_ts64(&now);
+	return timespec64_add_safe(now, ts);
 }
 
 /**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		struct timespec end_time = ep_set_mstimeout(timeout);
+		struct timespec64 end_time = ep_set_mstimeout(timeout);
 
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
-		*to = timespec_to_ktime(end_time);
+		*to = timespec64_to_ktime(end_time);
 	} else if (timeout == 0) {
 		/*
 		 * Avoid the unnecessary trip to the wait queue loop, if the
diff --git a/fs/select.c b/fs/select.c
index 869293988c2a..8ed9da50896a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
 
 #define MAX_SLACK	(100 * NSEC_PER_MSEC)
 
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
 {
 	long slack;
 	int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
 {
 	u64 ret;
-	struct timespec now;
+	struct timespec64 now;
 
 	/*
 	 * Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
 	if (rt_task(current))
 		return 0;
 
-	ktime_get_ts(&now);
-	now = timespec_sub(*tv, now);
+	ktime_get_ts64(&now);
+	now = timespec64_sub(*tv, now);
 	ret = __estimate_accuracy(&now);
 	if (ret < current->timer_slack_ns)
 		return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
 
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
- * @to:		pointer to timespec variable for the final timeout
+ * @to:		pointer to timespec64 variable for the final timeout
  * @sec:	seconds (from user space)
  * @nsec:	nanoseconds (from user space)
  *
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
  *
  * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
  */
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
 {
-	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
 
-	if (!timespec_valid(&ts))
+	if (!timespec64_valid(&ts))
 		return -EINVAL;
 
 	/* Optimize for the zero timeout value here */
 	if (!sec && !nsec) {
 		to->tv_sec = to->tv_nsec = 0;
 	} else {
-		ktime_get_ts(to);
-		*to = timespec_add_safe(*to, ts);
+		ktime_get_ts64(to);
+		*to = timespec64_add_safe(*to, ts);
 	}
 	return 0;
 }
 
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+				      void __user *p,
 				      int timeval, int ret)
 {
+	struct timespec64 rts64;
 	struct timespec rts;
 	struct timeval rtv;
 
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 	if (!end_time->tv_sec && !end_time->tv_nsec)
 		return ret;
 
-	ktime_get_ts(&rts);
-	rts = timespec_sub(*end_time, rts);
-	if (rts.tv_sec < 0)
-		rts.tv_sec = rts.tv_nsec = 0;
+	ktime_get_ts64(&rts64);
+	rts64 = timespec64_sub(*end_time, rts64);
+	if (rts64.tv_sec < 0)
+		rts64.tv_sec = rts64.tv_nsec = 0;
+
+	rts = timespec64_to_timespec(rts64);
 
 	if (timeval) {
 		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
 			memset(&rtv, 0, sizeof(rtv));
-		rtv.tv_sec = rts.tv_sec;
-		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+		rtv.tv_sec = rts64.tv_sec;
+		rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
 
 		if (!copy_to_user(p, &rtv, sizeof(rtv)))
 			return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
 		wait->_key |= POLLOUT_SET;
 }
 
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 {
 	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		 * pointer to the expiry value.
 		 */
 		if (end_time && !to) {
-			expire = timespec_to_ktime(*end_time);
+			expire = timespec64_to_ktime(*end_time);
 			to = &expire;
 		}
 
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
  * I'm trying ERESTARTNOHAND which restart only when you want to.
  */
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, struct timespec *end_time)
+			   fd_set __user *exp, struct timespec64 *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -622,7 +626,7 @@ out_nofds:
 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
 		fd_set __user *, exp, struct timeval __user *, tvp)
 {
-	struct timespec end_time, *to = NULL;
+	struct timespec64 end_time, *to = NULL;
 	struct timeval tv;
 	int ret;
 
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 		       const sigset_t __user *sigmask, size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts, end_time, *to = NULL;
+	struct timespec ts;
+	struct timespec64 ts64, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
+		ts64 = timespec_to_timespec64(ts);
 
 		to = &end_time;
-		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+		if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 }
 
 static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
-		   struct timespec *end_time)
+		   struct timespec64 *end_time)
 {
 	poll_table* pt = &wait->pt;
 	ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 		 * pointer to the expiry value.
 		 */
 		if (end_time && !to) {
-			expire = timespec_to_ktime(*end_time);
+			expire = timespec64_to_ktime(*end_time);
 			to = &expire;
 		}
 
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 			sizeof(struct pollfd))
 
 int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
-		struct timespec *end_time)
+		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
  	int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
 {
 	struct pollfd __user *ufds = restart_block->poll.ufds;
 	int nfds = restart_block->poll.nfds;
-	struct timespec *to = NULL, end_time;
+	struct timespec64 *to = NULL, end_time;
 	int ret;
 
 	if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		int, timeout_msecs)
 {
-	struct timespec end_time, *to = NULL;
+	struct timespec64 end_time, *to = NULL;
 	int ret;
 
 	if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 		size_t, sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts, end_time, *to = NULL;
+	struct timespec ts;
+	struct timespec64 end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 9fb4f40d9a26..37b057b63b46 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
 extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 				 ktime_t *expires, unsigned long slack);
-extern u64 select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec64 *tv);
 
 
 static inline int poll_schedule(struct poll_wqueues *pwq, int state)
@@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
 
-extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
+extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time);
 extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
-		       struct timespec *end_time);
+		       struct timespec64 *end_time);
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, struct timespec *end_time);
+			   fd_set __user *exp, struct timespec64 *end_time);
 
-extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
+				   long nsec);
 
 #endif /* _LINUX_POLL_H */
diff --git a/net/socket.c b/net/socket.c
index e7793f5601ae..a1bd16106625 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	struct mmsghdr __user *entry;
 	struct compat_mmsghdr __user *compat_entry;
 	struct msghdr msg_sys;
-	struct timespec end_time;
+	struct timespec64 end_time;
+	struct timespec64 timeout64;
 
 	if (timeout &&
 	    poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			flags |= MSG_DONTWAIT;
 
 		if (timeout) {
-			ktime_get_ts(timeout);
-			*timeout = timespec_sub(end_time, *timeout);
+			ktime_get_ts64(&timeout64);
+			*timeout = timespec64_to_timespec(
+					timespec64_sub(end_time, timeout64));
 			if (timeout->tv_sec < 0) {
 				timeout->tv_sec = timeout->tv_nsec = 0;
 				break;
-- 
cgit v1.2.3


From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:08 -0700
Subject: time: remove timespec_add_safe()

All references to timespec_add_safe() now use timespec64_add_safe().

The plan is to replace struct timespec references with struct timespec64
throughout the kernel as timespec is not y2038 safe.

Drop timespec_add_safe() and use timespec64_add_safe() for all
architectures.

Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h | 15 +++++++--------
 kernel/time/time.c     |  4 ----
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 1778937221bf..7e5d2fa9ac46 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
 # define timespec64_equal		timespec_equal
 # define timespec64_compare		timespec_compare
 # define set_normalized_timespec64	set_normalized_timespec
-# define timespec64_add_safe		timespec_add_safe
 # define timespec64_add			timespec_add
 # define timespec64_sub			timespec_sub
 # define timespec64_valid		timespec_valid
@@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
 
 extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
 
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME64_MAX in case of overflow.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
-					 const struct timespec64 rhs);
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
@@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 
 #endif
 
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+					 const struct timespec64 rhs);
+
 #endif /* _LINUX_TIME64_H */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index cb1f83eb5599..667b9335f5d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 	return res;
 }
 
-#if __BITS_PER_LONG != 64
-
 /*
  * Add two timespec64 values and do a safety check for overflow.
  * It's assumed that both values are valid (>= 0).
@@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 
 	return res;
 }
-
-#endif
-- 
cgit v1.2.3


From b1e4d9d82df8ab9097f80aa208c40eab6fc29858 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:20 -0700
Subject: debugobjects: make fixup functions return bool instead of int

I am going to introduce debugobjects infrastructure to USB subsystem.
But before this, I found the code of debugobjects could be improved.
This patchset will make fixup functions return bool type instead of int.
Because fixup only need report success or no.  boolean is the 'real'
type.

This patch (of 7):

The object debugging infrastructure core provides some fixup callbacks
for the subsystem who use it.  These callbacks are called from the debug
code whenever a problem in debug_object_init is detected.  And
debugobjects core suppose them returns 1 when the fixup was successful,
otherwise 0.  So the return type is boolean.

A bad thing is that debug_object_fixup use the return value for
arithmetic operation.  It confused me that what is the reall return
type.

Reading over the whole code, I found some place do use the return value
incorrectly(see next patch).  So why use bool type instead?

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugobjects.h | 15 ++++++++-------
 lib/debugobjects.c           | 43 +++++++++++++++++++++----------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 98ffcbd4888e..a899f10c9365 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -39,7 +39,8 @@ struct debug_obj {
  * @debug_hint:		function returning address, which have associated
  *			kernel symbol, to allow identify the object
  * @fixup_init:		fixup function, which is called when the init check
- *			fails
+ *			fails. All fixup functions must return true if fixup
+ *			was successful, otherwise return false
  * @fixup_activate:	fixup function, which is called when the activate check
  *			fails
  * @fixup_destroy:	fixup function, which is called when the destroy check
@@ -51,12 +52,12 @@ struct debug_obj {
  */
 struct debug_obj_descr {
 	const char		*name;
-	void *(*debug_hint)	(void *addr);
-	int (*fixup_init)	(void *addr, enum debug_obj_state state);
-	int (*fixup_activate)	(void *addr, enum debug_obj_state state);
-	int (*fixup_destroy)	(void *addr, enum debug_obj_state state);
-	int (*fixup_free)	(void *addr, enum debug_obj_state state);
-	int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
+	void *(*debug_hint)(void *addr);
+	bool (*fixup_init)(void *addr, enum debug_obj_state state);
+	bool (*fixup_activate)(void *addr, enum debug_obj_state state);
+	bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
+	bool (*fixup_free)(void *addr, enum debug_obj_state state);
+	bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
 };
 
 #ifdef CONFIG_DEBUG_OBJECTS
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 519b5a10fd70..a9cee165cf25 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
  * Try to repair the damage, so we have a better chance to get useful
  * debug output.
  */
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
 		   void * addr, enum debug_obj_state state)
 {
-	int fixed = 0;
-
-	if (fixup)
-		fixed = fixup(addr, state);
-	debug_objects_fixups += fixed;
-	return fixed;
+	if (fixup && fixup(addr, state)) {
+		debug_objects_fixups++;
+		return true;
+	}
+	return false;
 }
 
 static void debug_object_is_on_stack(void *addr, int onstack)
@@ -797,7 +796,7 @@ static __initdata struct debug_obj_descr descr_type_test;
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -805,9 +804,9 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_init(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -816,7 +815,7 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -825,17 +824,17 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
 		if (obj->static_init == 1) {
 			debug_object_init(obj, &descr_type_test);
 			debug_object_activate(obj, &descr_type_test);
-			return 0;
+			return false;
 		}
-		return 1;
+		return true;
 
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_activate(obj, &descr_type_test);
-		return 1;
+		return true;
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -843,7 +842,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_destroy is called when:
  * - an active object is destroyed
  */
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -851,9 +850,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_destroy(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -861,7 +860,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct self_test *obj = addr;
 
@@ -869,9 +868,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_free(obj, &descr_type_test);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
-- 
cgit v1.2.3


From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:41 -0700
Subject: debugobjects: insulate non-fixup logic related to static obj from
 fixup callbacks

When activating a static object we need make sure that the object is
tracked in the object tracker.  If it is a non-static object then the
activation is illegal.

In previous implementation, each subsystem need take care of this in
their fixup callbacks.  Actually we can put it into debugobjects core.
Thus we can save duplicated code, and have *pure* fixup callbacks.

To achieve this, a new callback "is_static_object" is introduced to let
the type specific code decide whether a object is static or not.  If
yes, we take it into object tracker, otherwise give warning and invoke
fixup callback.

This change has paassed debugobjects selftest, and I also do some test
with all debugobjects supports enabled.

At last, I have a concern about the fixups that can it change the object
which is in incorrect state on fixup? Because the 'addr' may not point
to any valid object if a non-static object is not tracked.  Then Change
such object can overwrite someone's memory and cause unexpected
behaviour.  For example, the timer_fixup_activate bind timer to function
stub_timer.

Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com
[changbin.du@intel.com: improve code comments where invoke the new is_static_object callback]
  Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com
Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugobjects.h |  2 ++
 kernel/rcu/update.c          | 26 +++--------------------
 kernel/time/hrtimer.c        |  7 +------
 kernel/time/timer.c          | 43 +++++++++++++-------------------------
 kernel/workqueue.c           | 42 ++++++++-----------------------------
 lib/debugobjects.c           | 49 +++++++++++++++++++++++++++++---------------
 6 files changed, 60 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index a899f10c9365..46056cb161fc 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,6 +38,7 @@ struct debug_obj {
  * @name:		name of the object typee
  * @debug_hint:		function returning address, which have associated
  *			kernel symbol, to allow identify the object
+ * @is_static_object	return true if the obj is static, otherwise return false
  * @fixup_init:		fixup function, which is called when the init check
  *			fails. All fixup functions must return true if fixup
  *			was successful, otherwise return false
@@ -53,6 +54,7 @@ struct debug_obj {
 struct debug_obj_descr {
 	const char		*name;
 	void *(*debug_hint)(void *addr);
+	bool (*is_static_object)(void *addr);
 	bool (*fixup_init)(void *addr, enum debug_obj_state state);
 	bool (*fixup_activate)(void *addr, enum debug_obj_state state);
 	bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a9df198eb22d..3e888cd5a594 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
 	debug_object_free(head, &rcuhead_debug_descr);
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
 {
-	struct rcu_head *head = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. We just make sure that it is
-		 * tracked in the object tracker.
-		 */
-		debug_object_init(head, &rcuhead_debug_descr);
-		debug_object_activate(head, &rcuhead_debug_descr);
-		return false;
-	default:
-		return true;
-	}
+	return true;
 }
 
 /**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
 
 struct debug_obj_descr rcuhead_debug_descr = {
 	.name = "rcu_head",
-	.fixup_activate = rcuhead_fixup_activate,
+	.is_static_object = rcuhead_is_static_object,
 };
 EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f962a58c0957..8c7392c4fdbd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		WARN_ON_ONCE(1);
-		return false;
-
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index be33481a4da1..3a95f9728778 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr)
 	return ((struct timer_list *) addr)->function;
 }
 
+static bool timer_is_static_object(void *addr)
+{
+	struct timer_list *timer = addr;
+
+	return (timer->entry.pprev == NULL &&
+		timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -516,30 +524,16 @@ static void stub_timer(unsigned long data)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
 	switch (state) {
-
 	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The timer was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (timer->entry.pprev == NULL &&
-		    timer->entry.next == TIMER_ENTRY_STATIC) {
-			debug_object_init(timer, &timer_debug_descr);
-			debug_object_activate(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
-		return false;
+		setup_timer(timer, stub_timer, 0);
+		return true;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
@@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.next == TIMER_ENTRY_STATIC) {
-			/*
-			 * This is not really a fixup. The timer was
-			 * statically initialized. We just make sure that it
-			 * is tracked in the object tracker.
-			 */
-			debug_object_init(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
+		setup_timer(timer, stub_timer, 0);
+		return true;
 	default:
 		return false;
 	}
@@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr timer_debug_descr = {
 	.name			= "timer_list",
 	.debug_hint		= timer_debug_hint,
+	.is_static_object	= timer_is_static_object,
 	.fixup_init		= timer_fixup_init,
 	.fixup_activate		= timer_fixup_activate,
 	.fixup_free		= timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6751b18fd9ac..e1c0e996b5ae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr)
 	return ((struct work_struct *) addr)->func;
 }
 
+static bool work_is_static_object(void *addr)
+{
+	struct work_struct *work = addr;
+
+	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state)
 	}
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- */
-static bool work_fixup_activate(void *addr, enum debug_obj_state state)
-{
-	struct work_struct *work = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The work struct was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
-			debug_object_init(work, &work_debug_descr);
-			debug_object_activate(work, &work_debug_descr);
-			return false;
-		}
-		WARN_ON_ONCE(1);
-		return false;
-
-	case ODEBUG_STATE_ACTIVE:
-		WARN_ON(1);
-
-	default:
-		return false;
-	}
-}
-
 /*
  * fixup_free is called when:
  * - an active object is freed
@@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr work_debug_descr = {
 	.name		= "work_struct",
 	.debug_hint	= work_debug_hint,
+	.is_static_object = work_is_static_object,
 	.fixup_init	= work_fixup_init,
-	.fixup_activate	= work_fixup_activate,
 	.fixup_free	= work_fixup_free,
 };
 
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 2f07c8c697b8..a8e12601eb37 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 	/*
-	 * This happens when a static object is activated. We
-	 * let the type specific code decide whether this is
-	 * true or not.
+	 * We are here when a static object is activated. We
+	 * let the type specific code confirm whether this is
+	 * true or not. if true, we just make sure that the
+	 * static object is tracked in the object tracker. If
+	 * not, this must be a bug, so we try to fix it up.
 	 */
-	if (debug_object_fixup(descr->fixup_activate, addr,
-			   ODEBUG_STATE_NOTAVAILABLE)) {
+	if (descr->is_static_object && descr->is_static_object(addr)) {
+		/* track this static object */
+		debug_object_init(addr, descr);
+		debug_object_activate(addr, descr);
+	} else {
 		debug_print_object(&o, "activate");
-		return -EINVAL;
+		ret = debug_object_fixup(descr->fixup_activate, addr,
+					ODEBUG_STATE_NOTAVAILABLE);
+		return ret ? 0 : -EINVAL;
 	}
 	return 0;
 }
@@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 
 		raw_spin_unlock_irqrestore(&db->lock, flags);
 		/*
-		 * Maybe the object is static.  Let the type specific
-		 * code decide what to do.
+		 * Maybe the object is static, and we let the type specific
+		 * code confirm. Track this static object if true, else invoke
+		 * fixup.
 		 */
-		if (debug_object_fixup(descr->fixup_assert_init, addr,
-				       ODEBUG_STATE_NOTAVAILABLE))
+		if (descr->is_static_object && descr->is_static_object(addr)) {
+			/* Track this static object */
+			debug_object_init(addr, descr);
+		} else {
 			debug_print_object(&o, "assert_init");
+			debug_object_fixup(descr->fixup_assert_init, addr,
+					   ODEBUG_STATE_NOTAVAILABLE);
+		}
 		return;
 	}
 
@@ -792,6 +805,13 @@ struct self_test {
 
 static __initdata struct debug_obj_descr descr_type_test;
 
+static bool __init is_static_object(void *addr)
+{
+	struct self_test *obj = addr;
+
+	return obj->static_init;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 {
@@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (obj->static_init == 1) {
-			debug_object_init(obj, &descr_type_test);
-			debug_object_activate(obj, &descr_type_test);
-			return false;
-		}
 		return true;
-
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_activate(obj, &descr_type_test);
@@ -916,6 +930,7 @@ out:
 
 static __initdata struct debug_obj_descr descr_type_test = {
 	.name			= "selftest",
+	.is_static_object	= is_static_object,
 	.fixup_init		= fixup_init,
 	.fixup_activate		= fixup_activate,
 	.fixup_destroy		= fixup_destroy,
-- 
cgit v1.2.3


From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Thu, 19 May 2016 17:09:56 -0700
Subject: kernel/padata.c: removed unused code

By accident I stumbled across code that has never been used.  This
driver has EXPORT_SYMBOL functions, and the only user of the code is
pcrypt.c, but this only uses a subset of the exported symbols.

According to 'git log -G', the functions, padata_set_cpumasks,
padata_add_cpu, and padata_remove_cpu have never been used since they
were first introduced.  This patch removes the unused code.

On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than
4k smaller.

  kbuild_hp> size $KBUILD_OUTPUT/vmlinux
      text    data     bss      dec hex    filename
  10566658 4678360 1122304 16367322 f9beda vmlinux
  10561984 4678360 1122304 16362648 f9ac98 vmlinux

On another config, 32 bit, the saving is about 0.5k bytes.

  kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux
  6012005 2409513 2785280 11206798 ab008e vmlinux
  6011491 2409513 2785280 11206284 aafe8c vmlinux

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/padata.h |  5 ----
 kernel/padata.c        | 64 --------------------------------------------------
 2 files changed, 69 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 438694650471..113ee626a4dc 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
 extern void padata_do_serial(struct padata_priv *padata);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
-			       cpumask_var_t pcpumask,
-			       cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
 extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..67ddd4acde9d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -606,33 +606,6 @@ out_replace:
 	return 0;
 }
 
-/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- *                       one is used by parallel workers and the second one
- *                       by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
-			cpumask_var_t cbcpumask)
-{
-	int err;
-
-	mutex_lock(&pinst->lock);
-	get_online_cpus();
-
-	err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
-	put_online_cpus();
-	mutex_unlock(&pinst->lock);
-
-	return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
 /**
  * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
  *                     equivalent to @cpumask.
@@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 	return 0;
 }
 
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- *                  padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- *        The @mask may be any combination of the following flags:
- *          PADATA_CPU_SERIAL   - serial cpumask
- *          PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
-	int err;
-
-	if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
-		return -EINVAL;
-
-	mutex_lock(&pinst->lock);
-
-	get_online_cpus();
-	if (mask & PADATA_CPU_SERIAL)
-		cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
-	if (mask & PADATA_CPU_PARALLEL)
-		cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
-	err = __padata_add_cpu(pinst, cpu);
-	put_online_cpus();
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
 static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd = NULL;
@@ -1091,7 +1028,6 @@ err_free_inst:
 err:
 	return NULL;
 }
-EXPORT_SYMBOL(padata_alloc);
 
 /**
  * padata_free - free a padata instance
-- 
cgit v1.2.3


From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Thu, 19 May 2016 17:10:37 -0700
Subject: mm: SLAB freelist randomization

Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize
the SLAB freelist.  The list is randomized during initialization of a
new set of pages.  The order on different freelist sizes is pre-computed
at boot for performance.  Each kmem_cache has its own randomized
freelist.  Before pre-computed lists are available freelists are
generated dynamically.  This security feature reduces the predictability
of the kernel SLAB allocator against heap overflows rendering attacks
much less stable.

For example this attack against SLUB (also applicable against SLAB)
would be affected:

  https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/

Also, since v4.6 the freelist was moved at the end of the SLAB.  It
means a controllable heap is opened to new attacks not yet publicly
discussed.  A kernel heap overflow can be transformed to multiple
use-after-free.  This feature makes this type of attack harder too.

To generate entropy, we use get_random_bytes_arch because 0 bits of
entropy is available in the boot stage.  In the worse case this function
will fallback to the get_random_bytes sub API.  We also generate a shift
random number to shift pre-computed freelist for each new set of pages.

The config option name is not specific to the SLAB as this approach will
be extended to other allocators like SLUB.

Performance results highlighted no major changes:

Hackbench (running 90 10 times):

  Before average: 0.0698
  After average: 0.0663 (-5.01%)

slab_test 1 run on boot.  Difference only seen on the 2048 size test
being the worse case scenario covered by freelist randomization.  New
slab pages are constantly being created on the 10000 allocations.
Variance should be mainly due to getting new pages every few
allocations.

Before:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles
  10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles
  10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles
  10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles
  10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles
  10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles
  10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles
  10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles
  10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles
  10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles
  10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles
  10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 121 cycles
  10000 times kmalloc(64)/kfree -> 121 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 121 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

After:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles
  10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles
  10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles
  10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles
  10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles
  10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles
  10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles
  10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles
  10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles
  10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles
  10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles
  10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 123 cycles
  10000 times kmalloc(64)/kfree -> 142 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 119 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

[akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()]
Signed-off-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab_def.h |   4 ++
 init/Kconfig             |   9 +++
 mm/slab.c                | 167 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 178 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9edbbf352340..8694f7a5d92b 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
 	struct kasan_cache kasan_info;
 #endif
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+	void *random_seq;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index 0dfd09d54c65..79a91a2c0444 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1742,6 +1742,15 @@ config SLOB
 
 endchoice
 
+config SLAB_FREELIST_RANDOM
+	default n
+	depends on SLAB
+	bool "SLAB freelist randomization"
+	help
+	  Randomizes the freelist order used on creating new SLABs. This
+	  security feature reduces the predictability of the kernel slab
+	  allocator against heap overflows.
+
 config SLUB_CPU_PARTIAL
 	default y
 	depends on SLUB && SMP
diff --git a/mm/slab.c b/mm/slab.c
index 8133ebea77a4..1ee26a0d358f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
 	}
 }
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+			size_t count)
+{
+	size_t i;
+	unsigned int rand;
+
+	for (i = 0; i < count; i++)
+		list[i] = i;
+
+	/* Fisher-Yates shuffle */
+	for (i = count - 1; i > 0; i--) {
+		rand = prandom_u32_state(state);
+		rand %= (i + 1);
+		swap(list[i], list[rand]);
+	}
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+	unsigned int seed, count = cachep->num;
+	struct rnd_state state;
+
+	if (count < 2)
+		return 0;
+
+	/* If it fails, we will just use the global lists */
+	cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+	if (!cachep->random_seq)
+		return -ENOMEM;
+
+	/* Get best entropy at this stage */
+	get_random_bytes_arch(&seed, sizeof(seed));
+	prandom_seed_state(&state, seed);
+
+	freelist_randomize(&state, cachep->random_seq, count);
+	return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+	kfree(cachep->random_seq);
+	cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+	return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+
 /*
  * Initialisation.  Called after the page allocator have been initialised and
  * before smp_init().
@@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
 	int i;
 	struct kmem_cache_node *n;
 
+	cache_random_seq_destroy(cachep);
+
 	free_percpu(cachep->cpu_cache);
 
 	/* NUMA: free the node structures */
@@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
 #endif
 }
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+	struct {
+		unsigned int pos;
+		freelist_idx_t *list;
+		unsigned int count;
+		unsigned int rand;
+	};
+	struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization methode available.
+ * return true if the pre-computed list is available, false otherwize.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+				struct kmem_cache *cachep,
+				unsigned int count)
+{
+	bool ret;
+	unsigned int rand;
+
+	/* Use best entropy available to define a random shift */
+	get_random_bytes_arch(&rand, sizeof(rand));
+
+	/* Use a random state if the pre-computed list is not available */
+	if (!cachep->random_seq) {
+		prandom_seed_state(&state->rnd_state, rand);
+		ret = false;
+	} else {
+		state->list = cachep->random_seq;
+		state->count = count;
+		state->pos = 0;
+		state->rand = rand;
+		ret = true;
+	}
+	return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+	return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+	unsigned int objfreelist = 0, i, count = cachep->num;
+	union freelist_init_state state;
+	bool precomputed;
+
+	if (count < 2)
+		return false;
+
+	precomputed = freelist_state_initialize(&state, cachep, count);
+
+	/* Take a random entry as the objfreelist */
+	if (OBJFREELIST_SLAB(cachep)) {
+		if (!precomputed)
+			objfreelist = count - 1;
+		else
+			objfreelist = next_random_slot(&state);
+		page->freelist = index_to_obj(cachep, page, objfreelist) +
+						obj_offset(cachep);
+		count--;
+	}
+
+	/*
+	 * On early boot, generate the list dynamically.
+	 * Later use a pre-computed list for speed.
+	 */
+	if (!precomputed) {
+		freelist_randomize(&state.rnd_state, page->freelist, count);
+	} else {
+		for (i = 0; i < count; i++)
+			set_free_obj(page, i, next_random_slot(&state));
+	}
+
+	if (OBJFREELIST_SLAB(cachep))
+		set_free_obj(page, cachep->num - 1, objfreelist);
+
+	return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+				struct page *page)
+{
+	return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
 static void cache_init_objs(struct kmem_cache *cachep,
 			    struct page *page)
 {
 	int i;
 	void *objp;
+	bool shuffled;
 
 	cache_init_objs_debug(cachep, page);
 
-	if (OBJFREELIST_SLAB(cachep)) {
+	/* Try to randomize the freelist if enabled */
+	shuffled = shuffle_freelist(cachep, page);
+
+	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
 		page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
 						obj_offset(cachep);
 	}
@@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			kasan_poison_object_data(cachep, objp);
 		}
 
-		set_free_obj(page, i, i);
+		if (!shuffled)
+			set_free_obj(page, i, i);
 	}
 }
 
@@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 	int shared = 0;
 	int batchcount = 0;
 
+	err = cache_random_seq_create(cachep, gfp);
+	if (err)
+		goto end;
+
 	if (!is_root_cache(cachep)) {
 		struct kmem_cache *root = memcg_root_cache(cachep);
 		limit = root->limit;
@@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 	batchcount = (limit + 1) / 2;
 skip_setup:
 	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
 	if (err)
 		pr_err("enable_cpucache failed for %s, error %d\n",
 		       cachep->name, -err);
-- 
cgit v1.2.3


From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:49 -0700
Subject: mm: rename _count, field of the struct page, to _refcount

Many developers already know that field for reference count of the
struct page is _count and atomic type.  They would try to handle it
directly and this could break the purpose of page reference count
tracepoint.  To prevent direct _count modification, this patch rename it
to _refcount and add warning message on the code.  After that, developer
who need to handle reference count will find that field should not be
accessed directly.

[akpm@linux-foundation.org: fix comments, per Vlastimil]
[akpm@linux-foundation.org: Documentation/vm/transhuge.txt too]
[sfr@canb.auug.org.au: sync ethernet driver changes]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Manish Chopra <manish.chopra@qlogic.com>
Cc: Yuval Mintz <yuval.mintz@qlogic.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/transhuge.txt                  | 10 +++++-----
 arch/tile/mm/init.c                             |  2 +-
 drivers/block/aoe/aoecmd.c                      |  2 +-
 drivers/hwtracing/intel_th/msu.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++----------
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  4 ++--
 fs/proc/page.c                                  |  2 +-
 include/linux/mm.h                              |  2 +-
 include/linux/mm_types.h                        | 14 ++++++++-----
 include/linux/page_ref.h                        | 26 ++++++++++++-------------
 include/linux/pagemap.h                         |  8 ++++----
 kernel/kexec_core.c                             |  2 +-
 mm/huge_memory.c                                |  4 ++--
 mm/internal.h                                   |  2 +-
 mm/page_alloc.c                                 |  4 ++--
 mm/slub.c                                       |  4 ++--
 mm/vmscan.c                                     |  4 ++--
 17 files changed, 58 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index d9cb65cf5cfd..fb0e1f2a19cc 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
 Refcounting on THP is mostly consistent with refcounting on other compound
 pages:
 
-  - get_page()/put_page() and GUP operate in head page's ->_count.
+  - get_page()/put_page() and GUP operate in head page's ->_refcount.
 
-  - ->_count in tail pages is always zero: get_page_unless_zero() never
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
     succeed on tail pages.
 
   - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to
 sum of mapcount of all sub-pages plus one (split_huge_page caller must
 have reference for head page).
 
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
 page->_mapcount.
 
 We safe against physical memory scanners too: the only legitimate way
 scanner can get reference to a page is get_page_unless_zero().
 
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
+All tail pages has zero ->_refcount until atomic_add(). It prevent scanner
 from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value.  We already known how many references
+we don't care about ->_refcount value.  We already known how many references
 with should uncharge from head page.
 
 For head page get_page_unless_zero() will succeed and we don't mind. It's
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index a0582b7f41d3..adce25462b0d 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 			 * Hacky direct set to avoid unnecessary
 			 * lock take/release for EVERY page here.
 			 */
-			p->_count.counter = 0;
+			p->_refcount.counter = 0;
 			p->_mapcount.counter = -1;
 		}
 		init_page_count(page);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 437b3a822f44..d597e432e195 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
  * discussion.
  *
  * We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition.  So we use _count directly.
+ * positive page count as a precondition.  So we use _refcount directly.
  */
 static void
 bio_pageinc(struct bio *bio)
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index d9d6022c5aca..d2209147dc89 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
 	if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
 		return;
 
-	/* drop page _counts */
+	/* drop page _refcounts */
 	for (pg = 0; pg < msc->nr_pages; pg++) {
 		struct page *page = msc_buffer_get_page(msc, pg);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f3456798c596..bd947704b59c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
 			goto err_unmap;
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_add(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -452,8 +452,8 @@ err_unmap:
 	while (--i >= 0) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	 */
 	split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->dma_info.page[i]._count);
+		page_ref_add(&wi->dma_info.page[i],
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
 		       PCI_DMA_FROMDEVICE);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->dma_info.page[i]._count);
+		page_ref_sub(&wi->dma_info.page[i],
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(&wi->dma_info.page[i]);
 	}
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3aabfc0adefe..73dd525fbf08 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 		/* Incr page ref count to reuse on allocation failure
 		 * so that it doesn't get freed while freeing SKB.
 		 */
-		atomic_inc(&current_bd->data->_count);
+		page_ref_inc(current_bd->data);
 		goto out;
 	}
 
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 				 * freeing SKB.
 				 */
 
-				atomic_inc(&sw_rx_data->data->_count);
+				page_ref_inc(sw_rx_data->data);
 				rxq->rx_alloc_errors++;
 				qede_recycle_rx_bd_ring(rxq, edev,
 							fp_cqe->bd_num);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992cc..3ecd445e830d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
 
 
 	/*
-	 * Caveats on high order pages: page->_count will only be set
+	 * Caveats on high order pages: page->_refcount will only be set
 	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
 	 * SLOB won't set PG_slab at all on compound pages.
 	 */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f799757ab..1193a54ea2b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
 	page = compound_head(page);
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count.
+	 * requires to already have an elevated page->_refcount.
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4fa86c..1fda9c99ef95 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
 			unsigned long counters;
 #else
 			/*
-			 * Keep _count separate from slub cmpxchg_double data.
-			 * As the rest of the double word is protected by
-			 * slab_lock but _count is not.
+			 * Keep _refcount separate from slub cmpxchg_double
+			 * data.  As the rest of the double word is protected by
+			 * slab_lock but _refcount is not.
 			 */
 			unsigned counters;
 #endif
@@ -97,7 +97,11 @@ struct page {
 					};
 					int units;	/* SLOB */
 				};
-				atomic_t _count;		/* Usage count, see below. */
+				/*
+				 * Usage count, *USE WRAPPER FUNCTION*
+				 * when manual accounting. See page_ref.h
+				 */
+				atomic_t _refcount;
 			};
 			unsigned int active;	/* SLAB */
 		};
@@ -248,7 +252,7 @@ struct page_frag_cache {
 	__u32 offset;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
+	 * containing page->_refcount every time we allocate a fragment.
 	 */
 	unsigned int		pagecnt_bias;
 	bool pfmemalloc;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9540e..8b5e0a9f2431 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
 
 static inline int page_ref_count(struct page *page)
 {
-	return atomic_read(&page->_count);
+	return atomic_read(&page->_refcount);
 }
 
 static inline int page_count(struct page *page)
 {
-	return atomic_read(&compound_head(page)->_count);
+	return atomic_read(&compound_head(page)->_refcount);
 }
 
 static inline void set_page_count(struct page *page, int v)
 {
-	atomic_set(&page->_count, v);
+	atomic_set(&page->_refcount, v);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
 		__page_ref_set(page, v);
 }
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
 
 static inline void page_ref_add(struct page *page, int nr)
 {
-	atomic_add(nr, &page->_count);
+	atomic_add(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, nr);
 }
 
 static inline void page_ref_sub(struct page *page, int nr)
 {
-	atomic_sub(nr, &page->_count);
+	atomic_sub(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -nr);
 }
 
 static inline void page_ref_inc(struct page *page)
 {
-	atomic_inc(&page->_count);
+	atomic_inc(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, 1);
 }
 
 static inline void page_ref_dec(struct page *page)
 {
-	atomic_dec(&page->_count);
+	atomic_dec(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -1);
 }
 
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
-	int ret = atomic_sub_and_test(nr, &page->_count);
+	int ret = atomic_sub_and_test(nr, &page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 
 static inline int page_ref_dec_and_test(struct page *page)
 {
-	int ret = atomic_dec_and_test(&page->_count);
+	int ret = atomic_dec_and_test(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
 
 static inline int page_ref_dec_return(struct page *page)
 {
-	int ret = atomic_dec_return(&page->_count);
+	int ret = atomic_dec_return(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
 		__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
 
 static inline int page_ref_add_unless(struct page *page, int nr, int u)
 {
-	int ret = atomic_add_unless(&page->_count, nr, u);
+	int ret = atomic_add_unless(&page->_refcount, nr, u);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
 		__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
 
 static inline int page_ref_freeze(struct page *page, int count)
 {
-	int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
 		__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
 	VM_BUG_ON_PAGE(page_count(page) != 0, page);
 	VM_BUG_ON(count == 0);
 
-	atomic_set(&page->_count, count);
+	atomic_set(&page->_refcount, count);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
 		__page_ref_unfreeze(page, count);
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab155c67c..fe1513ffb7bf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
 
 /*
  * speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
  *
  * This function must be called inside the same rcu_read_lock() section as has
  * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
  *
  * Unless an RCU grace period has passed, the count of all pages coming out
  * of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
  * 2. conditionally increment refcount
  * 3. check the page is still in pagecache (if no, goto 1)
  *
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
  * following (with tree_lock held for write):
  * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
  * B. remove page from pagecache
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..1c03dfb4abfd 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_STRUCT_SIZE(list_head);
 	VMCOREINFO_SIZE(nodemask_t);
 	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, _refcount);
 	VMCOREINFO_OFFSET(page, mapping);
 	VMCOREINFO_OFFSET(page, lru);
 	VMCOREINFO_OFFSET(page, _mapcount);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b49ee126d4d1..f8ac8f582fd8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
 	/*
-	 * tail_page->_count is zero and not changing from under us. But
+	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
 	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
@@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
-	/* Prevent deferred_split_scan() touching ->_count */
+	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6721cf..098a89e3b97c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
 }
 
 /*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
  * a count of one.
  */
 static inline void set_page_refcounted(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069efcc4d7..4ce57f938b7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
 	if (unlikely(page_ref_count(page) != 0))
-		bad_reason = "nonzero _count";
+		bad_reason = "nonzero _refcount";
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * We can't use page_count without pin a page
 		 * because another CPU can free compound page.
 		 * This check already skips compound tails of THP
-		 * because their page->_count is zero at all time.
+		 * because their page->_refcount is zero at all time.
 		 */
 		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/slub.c b/mm/slub.c
index 8671de2e5b12..cf1faa4d3992 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
 	tmp.counters = counters_new;
 	/*
 	 * page->counters can cover frozen/inuse/objects as well
-	 * as page->_count.  If we assign to ->counters directly
-	 * we run the risk of losing updates to page->_count, so
+	 * as page->_refcount.  If we assign to ->counters directly
+	 * we run the risk of losing updates to page->_refcount, so
 	 * be careful and only assign to the fields we need.
 	 */
 	page->frozen  = tmp.frozen;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61f4822..d3a02ac3eed7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 *
 	 * Reversing the order of the tests ensures such a situation cannot
 	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
-	 * load is not satisfied before that of page->_count.
+	 * load is not satisfied before that of page->_refcount.
 	 *
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
@@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
  *
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
  * But we had to alter page->flags anyway.
  */
 
-- 
cgit v1.2.3


From d64e85d3e1c59c3664b9ec1183052ec4641ea1e2 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 19 May 2016 17:10:52 -0700
Subject: compiler.h: add support for malloc attribute

gcc as far back as at least 3.04 documents the function attribute
__malloc__.  Add a shorthand for attaching that to a function
declaration.  This was also suggested by Andi Kleen way back in 2002
[1], but didn't get applied, perhaps because gcc at that time generated
the exact same code with and without this attribute.

This attribute tells the compiler that the return value (if non-NULL)
can be assumed not to alias any other valid pointers at the time of the
call.

Please note that the documentation for a range of gcc versions (starting
from around 4.7) contained a somewhat confusing and self-contradicting
text:

  The malloc attribute is used to tell the compiler that a function may
  be treated as if any non-NULL pointer it returns cannot alias any other
  pointer valid when the function returns and *that the memory has
  undefined content*.  [...] Standard functions with this property include
  malloc and *calloc*.

(emphasis mine). The intended meaning has later been clarified [2]:

  This tells the compiler that a function is malloc-like, i.e., that the
  pointer P returned by the function cannot alias any other pointer valid
  when the function returns, and moreover no pointers to valid objects
  occur in any storage addressed by P.

What this means is that we can apply the attribute to kmalloc and
friends, and it is ok for the returned memory to have well-defined
contents (__GFP_ZERO).  But it is not ok to apply it to kmemdup(), nor
to other functions which both allocate and possibly initialize the
memory with existing pointers.  So unless someone is doing something
pretty perverted kstrdup() should also be a fine candidate.

[1] http://thread.gmane.org/gmane.linux.kernel/57172
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56955

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 1 +
 include/linux/compiler.h     | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3d5202eda22f..e2949397c19b 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,6 +142,7 @@
 
 #if GCC_VERSION >= 30400
 #define __must_check		__attribute__((warn_unused_result))
+#define __malloc		__attribute__((__malloc__))
 #endif
 
 #if GCC_VERSION >= 40000
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..793c0829e3a3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __deprecated_for_modules
 #endif
 
+#ifndef __malloc
+#define __malloc
+#endif
+
 /*
  * Allow us to avoid 'defined but not used' warnings on functions and data,
  * as well as force them to be emitted to the assembly file.
-- 
cgit v1.2.3


From 48a270554a3251681ae11173f2fd6389d943e183 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 19 May 2016 17:10:55 -0700
Subject: include/linux: apply __malloc attribute

Attach the malloc attribute to a few allocation functions.  This helps
gcc generate better code by telling it that the return value doesn't
alias any existing pointers (which is even more valuable given the
pessimizations implied by -fno-strict-aliasing).

A simple example of what this allows gcc to do can be seen by looking at
the last part of drm_atomic_helper_plane_reset:

	plane->state = kzalloc(sizeof(*plane->state), GFP_KERNEL);

	if (plane->state) {
		plane->state->plane = plane;
		plane->state->rotation = BIT(DRM_ROTATE_0);
	}

which compiles to

    e8 99 bf d6 ff          callq  ffffffff8116d540 <kmem_cache_alloc_trace>
    48 85 c0                test   %rax,%rax
    48 89 83 40 02 00 00    mov    %rax,0x240(%rbx)
    74 11                   je     ffffffff814015c4 <drm_atomic_helper_plane_reset+0x64>
    48 89 18                mov    %rbx,(%rax)
    48 8b 83 40 02 00 00    mov    0x240(%rbx),%rax [*]
    c7 40 40 01 00 00 00    movl   $0x1,0x40(%rax)

With this patch applied, the instruction at [*] is elided, since the
store to plane->state->plane is known to not alter the value of
plane->state.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 16 ++++++++--------
 include/linux/device.h  | 12 ++++++------
 include/linux/kernel.h  |  4 ++--
 include/linux/mempool.h |  3 ++-
 include/linux/slab.h    | 16 ++++++++--------
 include/linux/string.h  |  2 +-
 6 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 35b22f94d2d2..f9be32691718 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long goal);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
-				     unsigned long goal);
+				     unsigned long goal) __malloc;
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 void *__alloc_bootmem_node_high(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit) __malloc;
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
-				 unsigned long goal);
+				 unsigned long goal) __malloc;
 void *__alloc_bootmem_low_nopanic(unsigned long size,
 				 unsigned long align,
-				 unsigned long goal);
+				 unsigned long goal) __malloc;
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
-				      unsigned long goal);
+				      unsigned long goal) __malloc;
 
 #ifdef CONFIG_NO_BOOTMEM
 /* We are using top down, so it is safe to use 0 here */
diff --git a/include/linux/device.h b/include/linux/device.h
index b130304f9b1b..ca90ad8bcd61 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
 
 #ifdef CONFIG_DEBUG_DEVRES
 extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-				 int nid, const char *name);
+				 int nid, const char *name) __malloc;
 #define devres_alloc(release, size, gfp) \
 	__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
 #define devres_alloc_node(release, size, gfp, nid) \
 	__devres_alloc_node(release, size, gfp, nid, #release)
 #else
 extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-			       int nid);
+			       int nid) __malloc;
 static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
 {
 	return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
@@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id);
 extern int devres_release_group(struct device *dev, void *id);
 
 /* managed devm_k.alloc/kfree for device drivers */
-extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
+extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
 extern __printf(3, 0)
 char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
-		      va_list ap);
+		      va_list ap) __malloc;
 extern __printf(3, 4)
-char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
+char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc;
 static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
 {
 	return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
@@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev,
 	return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
 }
 extern void devm_kfree(struct device *dev, void *p);
-extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp);
+extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
 extern void *devm_kmemdup(struct device *dev, const void *src, size_t len,
 			  gfp_t gfp);
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2f7775e229b0..cc7398287fdd 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -412,9 +412,9 @@ extern __printf(3, 4)
 int scnprintf(char *buf, size_t size, const char *fmt, ...);
 extern __printf(3, 0)
 int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
-extern __printf(2, 3)
+extern __printf(2, 3) __malloc
 char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern __printf(2, 0)
+extern __printf(2, 0) __malloc
 char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
 extern __printf(2, 0)
 const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951e8fd2..b1086c936507 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,7 @@
 #define _LINUX_MEMPOOL_H
 
 #include <linux/wait.h>
+#include <linux/compiler.h>
 
 struct kmem_cache;
 
@@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 508bd827e6dc..aeb3e6d00a66 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size)
 }
 #endif /* !CONFIG_SLOB */
 
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
 void kmem_cache_free(struct kmem_cache *, void *);
 
 /*
@@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
 }
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
 #else
 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
@@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 #endif
 
 #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 					   gfp_t gfpflags,
-					   int node, size_t size) __assume_slab_alignment;
+					   int node, size_t size) __assume_slab_alignment __malloc;
 #else
 static __always_inline void *
 kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
 #else
 static __always_inline void *
 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
diff --git a/include/linux/string.h b/include/linux/string.h
index d3993a79a325..26b6f6a66f83 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new);
 
 extern void kfree_const(const void *x);
 
-extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
 extern const char *kstrdup_const(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
-- 
cgit v1.2.3


From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:10:58 -0700
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

	node = next_node(node, XXX);
	if (node == MAX_NUMNODES)
		node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

[mhocko@suse.com: use next_node_in() helper]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Hui Zhu <zhuhui@xiaomi.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/tile/kernel/setup.c |  4 +---
 arch/x86/mm/numa.c       |  4 +---
 include/linux/nodemask.h | 11 ++++++++++-
 kernel/cpuset.c          |  8 +-------
 lib/Makefile             |  2 +-
 lib/nodemask.c           | 30 ++++++++++++++++++++++++++++++
 mm/hugetlb.c             |  4 +---
 mm/memcontrol.c          |  4 +---
 mm/mempolicy.c           | 24 ++----------------------
 mm/page_isolation.c      |  9 +++------
 mm/slab.c                | 13 +++----------
 11 files changed, 54 insertions(+), 59 deletions(-)
 create mode 100644 lib/nodemask.c

(limited to 'include/linux')

diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index a992238e9b58..153020abd2f5 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
 		cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
 		cpu_2_node[best_cpu] = node;
 		cpumask_clear_cpu(best_cpu, &unbound_cpus);
-		node = next_node(node, default_nodes);
-		if (node == MAX_NUMNODES)
-			node = first_node(default_nodes);
+		node = next_node_in(node, default_nodes);
 	}
 
 	/* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f70c1ff46125..9c086c57105c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
 		if (early_cpu_to_node(i) != NUMA_NO_NODE)
 			continue;
 		numa_set_node(i, rr);
-		rr = next_node(rr, node_online_map);
-		if (rr == MAX_NUMNODES)
-			rr = first_node(node_online_map);
+		rr = next_node_in(rr, node_online_map);
 	}
 }
 
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889cf9ab..f746e44d4046 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
  *
  * int first_node(mask)			Number lowest set bit, or MAX_NUMNODES
  * int next_node(node, mask)		Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask)		Next node past 'node', or wrap to first,
+ *					or MAX_NUMNODES
  * int first_unset_node(mask)		First node not set in mask, or 
- *					MAX_NUMNODES.
+ *					MAX_NUMNODES
  *
  * nodemask_t nodemask_of_node(node)	Return nodemask with bit 'node' set
  * NODE_MASK_ALL			Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
 	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
 
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
 	nodes_clear(*mask);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..611cc69af8f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 
 static int cpuset_spread_node(int *rotor)
 {
-	int node;
-
-	node = next_node(*rotor, current->mems_allowed);
-	if (node == MAX_NUMNODES)
-		node = first_node(current->mems_allowed);
-	*rotor = node;
-	return node;
+	return *rotor = next_node_in(*rotor, current->mems_allowed);
 }
 
 int cpuset_mem_spread_node(void)
diff --git a/lib/Makefile b/lib/Makefile
index 931396ada5eb..42b69185f963 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000000..e42a5bf44d33
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+	int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)
+		ret = __first_node(srcp);
+	return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..5856093f9062 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -937,9 +937,7 @@ err:
  */
 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	nid = next_node(nid, *nodes_allowed);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(*nodes_allowed);
+	nid = next_node_in(nid, *nodes_allowed);
 	VM_BUG_ON(nid >= MAX_NUMNODES);
 
 	return nid;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5c41bd..6740c4c2b550 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	mem_cgroup_may_update_nodemask(memcg);
 	node = memcg->last_scanned_node;
 
-	node = next_node(node, memcg->scan_nodes);
-	if (node == MAX_NUMNODES)
-		node = first_node(memcg->scan_nodes);
+	node = next_node_in(node, memcg->scan_nodes);
 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01bc950a..8d369cee0cd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <linux/random.h>
 
 #include "internal.h"
 
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 		BUG();
 
 	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = first_node(tmp);
+		current->il_next = next_node_in(current->il_next, tmp);
 		if (current->il_next >= MAX_NUMNODES)
 			current->il_next = numa_node_id();
 	}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	next = next_node_in(nid, policy->v.nodes);
 	if (next < MAX_NUMNODES)
 		me->il_next = next;
 	return nid;
@@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
-	int w, bit = NUMA_NO_NODE;
-
-	w = nodes_weight(*maskp);
-	if (w)
-		bit = bitmap_ord_to_pos(maskp->bits,
-			get_random_int() % w, MAX_NUMNODES);
-	return bit;
-}
-
 #ifdef CONFIG_HUGETLBFS
 /*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f568206544..67bedd18429c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
 	 * accordance with memory policy of the user process if possible. For
 	 * now as a simple work-around, we use the next node for destination.
 	 */
-	if (PageHuge(page)) {
-		int node = next_online_node(page_to_nid(page));
-		if (node == MAX_NUMNODES)
-			node = first_online_node;
+	if (PageHuge(page))
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					    node);
-	}
+					    next_node_in(page_to_nid(page),
+							 node_online_map));
 
 	if (PageHighMem(page))
 		gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/slab.c b/mm/slab.c
index d81565a92864..c11bf5007952 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 
 static void init_reap_node(int cpu)
 {
-	int node;
-
-	node = next_node(cpu_to_mem(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	per_cpu(slab_reap_node, cpu) = node;
+	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+						    node_online_map);
 }
 
 static void next_reap_node(void)
 {
 	int node = __this_cpu_read(slab_reap_node);
 
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
+	node = next_node_in(node, node_online_map);
 	__this_cpu_write(slab_reap_node, node);
 }
 
-- 
cgit v1.2.3


From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001
From: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Date: Thu, 19 May 2016 17:11:04 -0700
Subject: mm/hugetlb: introduce hugetlb_bad_size()

When any unsupported hugepage size is specified, 'hugepagesz=' and
'hugepages=' should be ignored during command line parsing until any
supported hugepage size is found.  But currently incorrect number of
hugepages are allocated when unsupported size is specified as it fails
to ignore the 'hugepages=' command.

Test case:

Note that this is specific to x86 architecture.

Boot the kernel with command line option 'hugepagesz=256M hugepages=X'.
After boot, dmesg output shows that X number of hugepages of the size 2M
is pre-allocated instead of 0.

So, to handle such command line options, introduce new routine
hugetlb_bad_size.  The routine hugetlb_bad_size sets the global variable
parsed_valid_hugepagesz.  We are using parsed_valid_hugepagesz to save
the state when unsupported hugepagesize is found so that we can ignore
the 'hugepages=' parameters after that and then reset the variable when
supported hugepage size is found.

The routine hugetlb_bad_size can be called while setting 'hugepagesz='
parameter in an architecture specific code.

Signed-off-by: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Cc: Dominik Dingel <dingel@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  1 +
 mm/hugetlb.c            | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d953c2542a8..e44c57876e89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
+void __init hugetlb_bad_size(void);
 void __init hugetlb_add_hstate(unsigned order);
 struct hstate *size_to_hstate(unsigned long size);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fb37ef810655..0adb74d0a4e1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void)
 subsys_initcall(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+	parsed_valid_hugepagesz = false;
+}
+
 void __init hugetlb_add_hstate(unsigned int order)
 {
 	struct hstate *h;
@@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s)
 	unsigned long *mhp;
 	static unsigned long *last_mhp;
 
+	if (!parsed_valid_hugepagesz) {
+		pr_warn("hugepages = %s preceded by "
+			"an unsupported hugepagesz, ignoring\n", s);
+		parsed_valid_hugepagesz = true;
+		return 1;
+	}
 	/*
 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!hugetlb_max_hstate)
+	else if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
-- 
cgit v1.2.3


From 32f6271dbdc351dce96b11c5f3567bae8188004f Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:23 -0700
Subject: mm/hugetlb: is_vm_hugetlb_page() can return bool

Make is_vm_hugetlb_page() return bool to improve readability due to this
particular function only using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb_inline.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681fbeb35..a4e7ca0f3585 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -5,16 +5,16 @@
 
 #include <linux/mm.h>
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
 	return !!(vma->vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return 0;
+	return false;
 }
 
 #endif
-- 
cgit v1.2.3


From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:26 -0700
Subject: mm/memory_hotplug: is_mem_section_removable() can return bool

Make is_mem_section_removable() return bool to improve readability due
to this particular function only using either one or zero as its return
value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 6 +++---
 mm/memory_hotplug.c            | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index adbef586e696..20d8a5d4d133 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern void remove_memory(int nid, u64 start, u64 size);
 
 #else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
 					unsigned long nr_pages)
 {
-	return 0;
+	return false;
 }
 
 static inline void try_offline_node(int nid) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa34431c3f31..b21d8895ea41 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page)
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
 	struct page *page = pfn_to_page(start_pfn);
 	struct page *end_page = page + nr_pages;
@@ -1418,12 +1418,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 	/* Check the starting page of each pageblock within the range */
 	for (; page < end_page; page = next_active_pageblock(page)) {
 		if (!is_pageblock_removable_nolock(page))
-			return 0;
+			return false;
 		cond_resched();
 	}
 
 	/* All pageblocks in the memory block are likely to be hot-removable */
-	return 1;
+	return true;
 }
 
 /*
-- 
cgit v1.2.3


From bb00a789e565b96c52b2224c2280f7ac83175bec Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:29 -0700
Subject: mm/vmalloc.c: is_vmalloc_addr() can return bool

Make is_vmalloc_addr() return bool to improve readability due to this
particular function only using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1193a54ea2b3..5b375133c695 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr);
  * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
  * is no special casing required.
  */
-static inline int is_vmalloc_addr(const void *x)
+static inline bool is_vmalloc_addr(const void *x)
 {
 #ifdef CONFIG_MMU
 	unsigned long addr = (unsigned long)x;
 
 	return addr >= VMALLOC_START && addr < VMALLOC_END;
 #else
-	return 0;
+	return false;
 #endif
 }
 #ifdef CONFIG_MMU
-- 
cgit v1.2.3


From 4ee815be1d34a6f254b3d09bdebcb27f294f2bd3 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:32 -0700
Subject: mm/mempolicy.c: vma_migratable() can return bool

Make vma_migratable() return bool due to this particular function only
using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 2696c1f05ed1..6978a99e571f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
 extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 
 /* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+static inline bool vma_migratable(struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-		return 0;
+		return false;
 
 #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 	if (vma->vm_flags & VM_HUGETLB)
-		return 0;
+		return false;
 #endif
 
 	/*
@@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	if (vma->vm_file &&
 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
 								< policy_zone)
-			return 0;
-	return 1;
+			return false;
+	return true;
 }
 
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
-- 
cgit v1.2.3


From 29f9cb53d25cd9916537b44b0af7f0b95a2e4438 Mon Sep 17 00:00:00 2001
From: Chanho Min <chanho.min@lge.com>
Date: Thu, 19 May 2016 17:11:57 -0700
Subject: mm/highmem: simplify is_highmem()

is_highmem() can be simplified by use of is_highmem_idx().  This patch
removes redundant code and will make it easier to maintain if the zone
policy is changed or a new zone is added.

(akpm: saves me 25 bytes of text per is_highmem() callsite)

Signed-off-by: Chanho Min <chanho.min@lge.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60df9257cc7..150c6049f961 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -828,10 +828,7 @@ static inline int is_highmem_idx(enum zone_type idx)
 static inline int is_highmem(struct zone *zone)
 {
 #ifdef CONFIG_HIGHMEM
-	int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
-	return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
-	       (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
-		zone_movable_is_highmem());
+	return is_highmem_idx(zone_idx(zone));
 #else
 	return 0;
 #endif
-- 
cgit v1.2.3


From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:12:00 -0700
Subject: mm: uninline page_mapped()

It's huge.  Uninlining it saves 206 bytes per callsite.  Shaves 4924
bytes from the x86_64 allmodconfig vmlinux.

[akpm@linux-foundation.org: coding-style fixes]
Cc: Steve Capper <steve.capper@arm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 21 +--------------------
 mm/util.c          | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5b375133c695..9c2852cabf01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page)
 	return page->index;
 }
 
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
-	int i;
-	if (likely(!PageCompound(page)))
-		return atomic_read(&page->_mapcount) >= 0;
-	page = compound_head(page);
-	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
-		return true;
-	if (PageHuge(page))
-		return false;
-	for (i = 0; i < hpage_nr_pages(page); i++) {
-		if (atomic_read(&page[i]._mapcount) >= 0)
-			return true;
-	}
-	return false;
-}
+bool page_mapped(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/mm/util.c b/mm/util.c
index 6cc81e7b8705..8a1b3a1fb595 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
 	return __page_rmapping(page);
 }
 
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+	int i;
+
+	if (likely(!PageCompound(page)))
+		return atomic_read(&page->_mapcount) >= 0;
+	page = compound_head(page);
+	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+		return true;
+	if (PageHuge(page))
+		return false;
+	for (i = 0; i < hpage_nr_pages(page); i++) {
+		if (atomic_read(&page[i]._mapcount) >= 0)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
 struct anon_vma *page_anon_vma(struct page *page)
 {
 	unsigned long mapping;
-- 
cgit v1.2.3


From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:35 -0700
Subject: mm: update_lru_size warn and reset bad lru_size

Though debug kernels have a VM_BUG_ON to help protect from misaccounting
lru_size, non-debug kernels are liable to wrap it around: and then the
vast unsigned long size draws page reclaim into a loop of repeatedly
doing nothing on an empty list, without even a cond_resched().

That soft lockup looks confusingly like an over-busy reclaim scenario,
with lots of contention on the lru_lock in shrink_inactive_list(): yet
has a totally different origin.

Help differentiate with a custom warning in
mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the
size to avoid the lockup.  But the particular bug which suggested this
change was mine alone, and since fixed.

Make it a WARN_ONCE: the first occurrence is the most informative, a
flurry may follow, yet even when rate-limited little more is learnt.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  2 +-
 mm/memcontrol.c           | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 712e8c37a200..d8cea81ab1ac 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
 	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	list_del(&page->lru);
+	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 011dac8ab5d7..6a0199706f00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,38 @@ out:
  * @lru: index of lru list the page is sitting on
  * @nr_pages: positive when adding or negative when removing
  *
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 				int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
 	unsigned long *lru_size;
+	long size;
+	bool empty;
 
 	if (mem_cgroup_disabled())
 		return;
 
 	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 	lru_size = mz->lru_size + lru;
-	*lru_size += nr_pages;
-	VM_BUG_ON((long)(*lru_size) < 0);
+	empty = list_empty(lruvec->lists + lru);
+
+	if (nr_pages < 0)
+		*lru_size += nr_pages;
+
+	size = *lru_size;
+	if (WARN_ONCE(size < 0 || empty != !size,
+		"%s(%p, %d, %d): lru_size %ld but %sempty\n",
+		__func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+		VM_BUG_ON(1);
+		*lru_size = 0;
+	}
+
+	if (nr_pages > 0)
+		*lru_size += nr_pages;
 }
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-- 
cgit v1.2.3


From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:38 -0700
Subject: mm: update_lru_size do the __mod_zone_page_state

Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy
reclaim was removed) that lru_size can be updated by -nr_taken once per
call to isolate_lru_pages(), instead of page by page.

Update it inside isolate_lru_pages(), or at its two callsites? I chose
to update it at the callsites, rearranging and grouping the updates by
nr_taken and nr_scanned together in both.

With one exception, mem_cgroup_update_lru_size(,lru,) is then used where
__mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding
some more calls in a future commit.  Make the code a little smaller and
simpler by incorporating stat update in lru_size update.

The exception was move_active_pages_to_lru(), which aggregated the
pgmoved stat update separately from the individual lru_size updates; but
I still think this a simplification worth making.

However, the __mod_zone_page_state is not peculiar to mem_cgroups: so
better use the name update_lru_size, calls mem_cgroup_update_lru_size
when CONFIG_MEMCG.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ------
 include/linux/mm_inline.h  | 24 ++++++++++++++++++------
 mm/memcontrol.c            |  2 ++
 mm/vmscan.c                | 23 ++++++++++-------------
 4 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1191d79aa495..94da96738df3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	return 0;
 }
 
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-			      int increment)
-{
-}
-
 static inline unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			     int nid, unsigned int lru_mask)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d8cea81ab1ac..5bd29ba4f174 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
 	return !PageSwapBacked(page);
 }
 
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+	__update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	update_lru_size(lruvec, lru, hpage_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
 	list_del(&page->lru);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+	update_lru_size(lruvec, lru, -hpage_nr_pages(page));
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a0199706f00..1b40dcad2b90 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	long size;
 	bool empty;
 
+	__update_lru_size(lruvec, lru, nr_pages);
+
 	if (mem_cgroup_disabled())
 		return;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d3a02ac3eed7..dcfdfc1a0942 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
 					!list_empty(src); scan++) {
 		struct page *page;
-		int nr_pages;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			nr_pages = hpage_nr_pages(page);
-			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+			nr_taken += hpage_nr_pages(page);
 			list_move(&page->lru, dst);
-			nr_taken += nr_pages;
 			break;
 
 		case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, isolate_mode, lru);
 
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+	update_lru_size(lruvec, lru, -nr_taken);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	if (global_reclaim(sc)) {
 		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	spin_lock_irq(&zone->lru_lock);
 
-	reclaim_stat->recent_scanned[file] += nr_taken;
-
 	if (global_reclaim(sc)) {
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 		SetPageLRU(page);
 
 		nr_pages = hpage_nr_pages(page);
-		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+		update_lru_size(lruvec, lru, nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 		pgmoved += nr_pages;
 
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 				list_add(&page->lru, pages_to_free);
 		}
 	}
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
 	if (!is_active_lru(lru))
 		__count_vm_events(PGDEACTIVATE, pgmoved);
 }
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, isolate_mode, lru);
-	if (global_reclaim(sc))
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
 
+	update_lru_size(lruvec, lru, -nr_taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
+	if (global_reclaim(sc))
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
 	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
-- 
cgit v1.2.3


From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:44 -0700
Subject: tmpfs: preliminary minor tidyups

Make a few cleanups in mm/shmem.c, before going on to complicate it.

shmem_alloc_page() will become more complicated: we can't afford to to
have that complication duplicated between a CONFIG_NUMA version and a
!CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a
single shmem_swapin() and a single shmem_alloc_page().

Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA
configurations, but eliminating it is a larger cleanup: I have an
alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and
bug-prone, and changed yet again since my last version.

Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to
shmem_alloc_page(): that SwapBacked flag will be useful in future, to
help to distinguish different cases appropriately.

And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of
little use (IIRC it dates back to when shmem_getpage() returned the page
unlocked): kill it and do the necessary in shmem_file_read_iter().

But an arm64 build then complained that info may be uninitialized (where
shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and
advancing to an "sgp <= SGP_CACHE" test jogged it back to reality.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  6 +++++
 mm/shmem.c                | 69 ++++++++++++++++++-----------------------------
 2 files changed, 32 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6978a99e571f..4429d255c8ab 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
 {
 }
 
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	return NULL;
+}
+
 #define vma_policy(vma) NULL
 
 static inline int
diff --git a/mm/shmem.c b/mm/shmem.c
index 9e609d58df73..6d2de2c1bf11 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
-	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
 	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
@@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags,
 
 /*
  * ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
@@ -947,8 +946,7 @@ redirty:
 	return 0;
 }
 
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 {
 	char buffer[64];
@@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 	}
 	return mpol;
 }
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+	return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
 
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
-	page = alloc_page_vma(gfp, &pvma, 0);
+	page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+	if (page) {
+		__SetPageLocked(page);
+		__SetPageSwapBacked(page);
+	}
 
 	/* Drop reference taken by mpol_shared_policy_lookup() */
 	mpol_cond_put(pvma.vm_policy);
 
 	return page;
 }
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
-	return NULL;
-}
-#endif
 
 /*
  * When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	copy_highpage(newpage, oldpage);
 	flush_dcache_page(newpage);
 
-	__SetPageLocked(newpage);
-	__SetPageSwapBacked(newpage);
 	SetPageUptodate(newpage);
 	set_page_private(newpage, swap_index);
 	SetPageSwapCache(newpage);
@@ -1155,7 +1140,7 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+	if (sgp <= SGP_CACHE &&
 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto unlock;
@@ -1275,9 +1260,6 @@ repeat:
 			error = -ENOMEM;
 			goto decused;
 		}
-
-		__SetPageLocked(page);
-		__SetPageSwapBacked(page);
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
@@ -1321,12 +1303,10 @@ clear:
 			flush_dcache_page(page);
 			SetPageUptodate(page);
 		}
-		if (sgp == SGP_DIRTY)
-			set_page_dirty(page);
 	}
 
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+	if (sgp <= SGP_CACHE &&
 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		if (alloced) {
 			ClearPageDirty(page);
@@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
 	 */
 	if (!iter_is_iovec(to))
-		sgp = SGP_DIRTY;
+		sgp = SGP_CACHE;
 
 	index = *ppos >> PAGE_SHIFT;
 	offset = *ppos & ~PAGE_MASK;
@@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				error = 0;
 			break;
 		}
-		if (page)
+		if (page) {
+			if (sgp == SGP_CACHE)
+				set_page_dirty(page);
 			unlock_page(page);
+		}
 
 		/*
 		 * We must evaluate after, since reads (unlike writes)
-- 
cgit v1.2.3


From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:50 -0700
Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing.  Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.  Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time.  I have not yet worked out why, but
have no evidence that it's actually harmful.  Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 14 +++++++++++
 include/linux/vmstat.h      |  4 +++
 kernel/sysctl.c             |  7 ++++++
 mm/vmstat.c                 | 60 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
 - panic_on_oom
 - percpu_pagelist_fraction
 - stat_interval
+- stat_refresh
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
 
 ==============================================================
 
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos);
+
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
 
 int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "stat_refresh",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0600,
+		.proc_handler	= vmstat_refresh,
+	},
 #endif
 #ifdef CONFIG_MMU
 	{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 static cpumask_var_t cpu_stat_off;
 
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+	refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	long val;
+	int err;
+	int i;
+
+	/*
+	 * The regular update, every sysctl_stat_interval, may come later
+	 * than expected: leaving a significant amount in per_cpu buckets.
+	 * This is particularly misleading when checking a quantity of HUGE
+	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+	 * which can equally be echo'ed to or cat'ted from (by root),
+	 * can be used to update the stats just before reading them.
+	 *
+	 * Oh, and since global_page_state() etc. are so careful to hide
+	 * transiently negative values, report an error here if any of
+	 * the stats is negative, so we know to go looking for imbalance.
+	 */
+	err = schedule_on_each_cpu(refresh_vm_stats);
+	if (err)
+		return err;
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_stat[i]);
+		if (val < 0) {
+			switch (i) {
+			case NR_ALLOC_BATCH:
+			case NR_PAGES_SCANNED:
+				/*
+				 * These are often seen to go negative in
+				 * recent kernels, but not to go permanently
+				 * negative.  Whilst it would be nicer not to
+				 * have exceptions, rooting them out would be
+				 * another task, of rather low priority.
+				 */
+				break;
+			default:
+				pr_warn("%s: %s %ld\n",
+					__func__, vmstat_text[i], val);
+				err = -EINVAL;
+				break;
+			}
+		}
+	}
+	if (err)
+		return err;
+	if (write)
+		*ppos += *lenp;
+	else
+		*lenp = 0;
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
 static void vmstat_update(struct work_struct *w)
 {
 	if (refresh_cpu_vm_stats(true)) {
-- 
cgit v1.2.3


From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:54 -0700
Subject: huge mm: move_huge_pmd does not need new_vma

Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was
a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from
the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as
it was in the old vma, alignment and size permitting.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 +---
 mm/huge_memory.c        | 7 ++-----
 mm/mremap.c             | 5 ++---
 3 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d7b9e5346fba..419fb9e03447 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
-			 struct vm_area_struct *new_vma,
-			 unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8ac8f582fd8..66675eed67be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 }
 
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
-		  unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
-
 	struct mm_struct *mm = vma->vm_mm;
 
 	if ((old_addr & ~HPAGE_PMD_MASK) ||
 	    (new_addr & ~HPAGE_PMD_MASK) ||
-	    old_end - old_addr < HPAGE_PMD_SIZE ||
-	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+	    old_end - old_addr < HPAGE_PMD_SIZE)
 		return false;
 
 	/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 3fa0a467df66..7d98fe1adc12 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 				/* See comment in move_ptes() */
 				if (need_rmap_locks)
 					anon_vma_lock_write(vma->anon_vma);
-				moved = move_huge_pmd(vma, new_vma, old_addr,
-						    new_addr, old_end,
-						    old_pmd, new_pmd);
+				moved = move_huge_pmd(vma, old_addr, new_addr,
+						    old_end, old_pmd, new_pmd);
 				if (need_rmap_locks)
 					anon_vma_unlock_write(vma->anon_vma);
 				if (moved) {
-- 
cgit v1.2.3


From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 19 May 2016 17:13:12 -0700
Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path

If either the current task is already killed or PF_EXITING or a selected
task is PF_EXITING then the oom killer is suppressed and so is the oom
reaper.  This patch adds try_oom_reaper which checks the given task and
queues it for the oom reaper if that is safe to be done meaning that the
task doesn't share the mm with an alive process.

This might help to release the memory pressure while the task tries to
exit.

[akpm@linux-foundation.org: fix nommu build]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Raushaniya Maksudova <rmaksudova@parallels.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h |  8 +++++
 mm/memcontrol.c     |  1 +
 mm/oom_kill.c       | 86 ++++++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 77 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 628a43242a34..83b9c39bd8b7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p)
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b40dcad2b90..d71d387868e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		mark_oom_victim(current);
+		try_oom_reaper(current);
 		goto unlock;
 	}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 32d8210b8773..850b6ff66bdf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+	struct task_struct *t;
+
+	for_each_thread(p, t) {
+		struct mm_struct *t_mm = READ_ONCE(t->mm);
+		if (t_mm)
+			return t_mm == mm;
+	}
+	return false;
+}
+
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
 	wake_up(&oom_reaper_wait);
 }
 
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+	struct task_struct *p;
+
+	if (!mm)
+		return;
+
+	/*
+	 * There might be other threads/processes which are either not
+	 * dying or even not killable.
+	 */
+	if (atomic_read(&mm->mm_users) > 1) {
+		rcu_read_lock();
+		for_each_process(p) {
+			bool exiting;
+
+			if (!process_shares_mm(p, mm))
+				continue;
+			if (same_thread_group(p, tsk))
+				continue;
+			if (fatal_signal_pending(p))
+				continue;
+
+			/*
+			 * If the task is exiting make sure the whole thread group
+			 * is exiting and cannot acces mm anymore.
+			 */
+			spin_lock_irq(&p->sighand->siglock);
+			exiting = signal_group_exit(p->signal);
+			spin_unlock_irq(&p->sighand->siglock);
+			if (exiting)
+				continue;
+
+			/* Give up */
+			rcu_read_unlock();
+			return;
+		}
+		rcu_read_unlock();
+	}
+
+	wake_oom_reaper(tsk);
+}
+
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -652,24 +718,6 @@ void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
-/*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-	struct task_struct *t;
-
-	for_each_thread(p, t) {
-		struct mm_struct *t_mm = READ_ONCE(t->mm);
-		if (t_mm)
-			return t_mm == mm;
-	}
-	return false;
-}
-
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_lock(p);
 	if (p->mm && task_will_free_mem(p)) {
 		mark_oom_victim(p);
+		try_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc)
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
 		mark_oom_victim(current);
+		try_oom_reaper(current);
 		return true;
 	}
 
-- 
cgit v1.2.3


From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:21 -0700
Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast
 path

The PageAnon check always checks for compound_head but this is a
relatively expensive check if the caller already knows the page is a
head page.  This patch creates a helper and uses it in the page free
path which only operates on head pages.

With this patch and "Only check PageCompound for high-order pages", the
performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                               vanilla           nocompound-v1r20
  Min      alloc-odr0-1               425.00 (  0.00%)           417.00 (  1.88%)
  Min      alloc-odr0-2               313.00 (  0.00%)           308.00 (  1.60%)
  Min      alloc-odr0-4               257.00 (  0.00%)           253.00 (  1.56%)
  Min      alloc-odr0-8               224.00 (  0.00%)           221.00 (  1.34%)
  Min      alloc-odr0-16              208.00 (  0.00%)           205.00 (  1.44%)
  Min      alloc-odr0-32              199.00 (  0.00%)           199.00 (  0.00%)
  Min      alloc-odr0-64              195.00 (  0.00%)           193.00 (  1.03%)
  Min      alloc-odr0-128             192.00 (  0.00%)           191.00 (  0.52%)
  Min      alloc-odr0-256             204.00 (  0.00%)           200.00 (  1.96%)
  Min      alloc-odr0-512             213.00 (  0.00%)           212.00 (  0.47%)
  Min      alloc-odr0-1024            219.00 (  0.00%)           219.00 (  0.00%)
  Min      alloc-odr0-2048            225.00 (  0.00%)           225.00 (  0.00%)
  Min      alloc-odr0-4096            230.00 (  0.00%)           231.00 ( -0.43%)
  Min      alloc-odr0-8192            235.00 (  0.00%)           234.00 (  0.43%)
  Min      alloc-odr0-16384           235.00 (  0.00%)           234.00 (  0.43%)
  Min      free-odr0-1                215.00 (  0.00%)           191.00 ( 11.16%)
  Min      free-odr0-2                152.00 (  0.00%)           136.00 ( 10.53%)
  Min      free-odr0-4                119.00 (  0.00%)           107.00 ( 10.08%)
  Min      free-odr0-8                106.00 (  0.00%)            96.00 (  9.43%)
  Min      free-odr0-16                97.00 (  0.00%)            87.00 ( 10.31%)
  Min      free-odr0-32                91.00 (  0.00%)            83.00 (  8.79%)
  Min      free-odr0-64                89.00 (  0.00%)            81.00 (  8.99%)
  Min      free-odr0-128               88.00 (  0.00%)            80.00 (  9.09%)
  Min      free-odr0-256              106.00 (  0.00%)            95.00 ( 10.38%)
  Min      free-odr0-512              116.00 (  0.00%)           111.00 (  4.31%)
  Min      free-odr0-1024             125.00 (  0.00%)           118.00 (  5.60%)
  Min      free-odr0-2048             133.00 (  0.00%)           126.00 (  5.26%)
  Min      free-odr0-4096             136.00 (  0.00%)           130.00 (  4.41%)
  Min      free-odr0-8192             138.00 (  0.00%)           130.00 (  5.80%)
  Min      free-odr0-16384            137.00 (  0.00%)           130.00 (  5.11%)

There is a sizable boost to the free allocator performance.  While there
is an apparent boost on the allocation side, it's likely a co-incidence
or due to the patches slightly reducing cache footprint.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 7 ++++++-
 mm/page_alloc.c            | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b052aa7b5b7..a61e06e5fbce 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
+static __always_inline int PageAnonHead(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
 static __always_inline int PageAnon(struct page *page)
 {
 	page = compound_head(page);
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+	return PageAnonHead(page);
 }
 
 #ifdef CONFIG_KSM
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 087ba3e417ec..7be1ce8b6be0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 			bad += free_pages_check(page + i);
 		}
 	}
-	if (PageAnon(page))
+	if (PageAnonHead(page))
 		page->mapping = NULL;
 	bad += free_pages_check(page);
 	if (bad)
-- 
cgit v1.2.3


From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:27 -0700
Subject: mm, page_alloc: inline zone_statistics

zone_statistics has one call-site but it's a public function.  Make it
static and inline.

The performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                      statbranch-v1r20           statinline-v1r20
  Min      alloc-odr0-1               419.00 (  0.00%)           412.00 (  1.67%)
  Min      alloc-odr0-2               305.00 (  0.00%)           301.00 (  1.31%)
  Min      alloc-odr0-4               250.00 (  0.00%)           247.00 (  1.20%)
  Min      alloc-odr0-8               219.00 (  0.00%)           215.00 (  1.83%)
  Min      alloc-odr0-16              203.00 (  0.00%)           199.00 (  1.97%)
  Min      alloc-odr0-32              195.00 (  0.00%)           191.00 (  2.05%)
  Min      alloc-odr0-64              191.00 (  0.00%)           187.00 (  2.09%)
  Min      alloc-odr0-128             189.00 (  0.00%)           185.00 (  2.12%)
  Min      alloc-odr0-256             198.00 (  0.00%)           193.00 (  2.53%)
  Min      alloc-odr0-512             210.00 (  0.00%)           207.00 (  1.43%)
  Min      alloc-odr0-1024            216.00 (  0.00%)           213.00 (  1.39%)
  Min      alloc-odr0-2048            221.00 (  0.00%)           220.00 (  0.45%)
  Min      alloc-odr0-4096            227.00 (  0.00%)           226.00 (  0.44%)
  Min      alloc-odr0-8192            232.00 (  0.00%)           229.00 (  1.29%)
  Min      alloc-odr0-16384           232.00 (  0.00%)           229.00 (  1.29%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  2 --
 mm/page_alloc.c        | 31 +++++++++++++++++++++++++++++++
 mm/vmstat.c            | 29 -----------------------------
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 02fce415b3d9..d2da8e053210 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_NUMA
 
 extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 
 #else
 
 #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
 
 #endif /* CONFIG_NUMA */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7be1ce8b6be0..36384baa74e1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2354,6 +2354,37 @@ int split_free_page(struct page *page)
 	return nr_pages;
 }
 
+/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+								gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+	int local_nid = numa_node_id();
+	enum zone_stat_item local_stat = NUMA_LOCAL;
+
+	if (unlikely(flags & __GFP_OTHER_NODE)) {
+		local_stat = NUMA_OTHER;
+		local_nid = preferred_zone->node;
+	}
+
+	if (z->node == local_nid) {
+		__inc_zone_state(z, NUMA_HIT);
+		__inc_zone_state(z, local_stat);
+	} else {
+		__inc_zone_state(z, NUMA_MISS);
+		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
+	}
+#endif
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d585de27e960..f1a73bfb77b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -569,35 +569,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 #endif
 
 #ifdef CONFIG_NUMA
-/*
- * zonelist = the list of zones passed to the allocator
- * z 	    = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
-	int local_nid = numa_node_id();
-	enum zone_stat_item local_stat = NUMA_LOCAL;
-
-	if (unlikely(flags & __GFP_OTHER_NODE)) {
-		local_stat = NUMA_OTHER;
-		local_nid = preferred_zone->node;
-	}
-
-	if (z->node == local_nid) {
-		__inc_zone_state(z, NUMA_HIT);
-		__inc_zone_state(z, local_stat);
-	} else {
-		__inc_zone_state(z, NUMA_MISS);
-		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
-	}
-}
-
 /*
  * Determine the per node value of a stat item.
  */
-- 
cgit v1.2.3


From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:30 -0700
Subject: mm, page_alloc: inline the fast path of the zonelist iterator

The page allocator iterates through a zonelist for zones that match the
addressing limitations and nodemask of the caller but many allocations
will not be restricted.  Despite this, there is always functional call
overhead which builds up.

This patch inlines the optimistic basic case and only calls the iterator
function for the complex case.  A hindrance was the fact that
cpuset_current_mems_allowed is used in the fastpath as the allowed
nodemask even though all nodes are allowed on most systems.  The patch
handles this by only considering cpuset_current_mems_allowed if a cpuset
exists.  As well as being faster in the fast-path, this removes some
junk in the slowpath.

The performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                      statinline-v1r20              optiter-v1r20
  Min      alloc-odr0-1               412.00 (  0.00%)           382.00 (  7.28%)
  Min      alloc-odr0-2               301.00 (  0.00%)           282.00 (  6.31%)
  Min      alloc-odr0-4               247.00 (  0.00%)           233.00 (  5.67%)
  Min      alloc-odr0-8               215.00 (  0.00%)           203.00 (  5.58%)
  Min      alloc-odr0-16              199.00 (  0.00%)           188.00 (  5.53%)
  Min      alloc-odr0-32              191.00 (  0.00%)           182.00 (  4.71%)
  Min      alloc-odr0-64              187.00 (  0.00%)           177.00 (  5.35%)
  Min      alloc-odr0-128             185.00 (  0.00%)           175.00 (  5.41%)
  Min      alloc-odr0-256             193.00 (  0.00%)           184.00 (  4.66%)
  Min      alloc-odr0-512             207.00 (  0.00%)           197.00 (  4.83%)
  Min      alloc-odr0-1024            213.00 (  0.00%)           203.00 (  4.69%)
  Min      alloc-odr0-2048            220.00 (  0.00%)           209.00 (  5.00%)
  Min      alloc-odr0-4096            226.00 (  0.00%)           214.00 (  5.31%)
  Min      alloc-odr0-8192            229.00 (  0.00%)           218.00 (  4.80%)
  Min      alloc-odr0-16384           229.00 (  0.00%)           219.00 (  4.37%)

perf indicated that next_zones_zonelist disappeared in the profile and
__next_zones_zonelist did not appear.  This is expected as the
micro-benchmark would hit the inlined fast-path every time.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 13 +++++++++++--
 mm/mmzone.c            |  2 +-
 mm/page_alloc.c        | 26 +++++++++-----------------
 3 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 150c6049f961..cfcd7723edb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
 #endif /* CONFIG_NUMA */
 }
 
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes);
+
 /**
  * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
  * @z - The cursor used as a starting point for the search
@@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
  * being examined. It should be advanced by one before calling
  * next_zones_zonelist again.
  */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes);
+					nodemask_t *nodes)
+{
+	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+		return z;
+	return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
 
 /**
  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 52687fb4de6f..5652be858e5e 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
 }
 
 /* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
 					nodemask_t *nodes)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36384baa74e1..789e5f065e8d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3192,17 +3192,6 @@ retry:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-	/*
-	 * Find the true preferred zone if the allocation is unconstrained by
-	 * cpusets.
-	 */
-	if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
-		struct zoneref *preferred_zoneref;
-		preferred_zoneref = first_zones_zonelist(ac->zonelist,
-				ac->high_zoneidx, NULL, &ac->preferred_zone);
-		ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
-	}
-
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
+		.zonelist = zonelist,
 		.nodemask = nodemask,
 		.migratetype = gfpflags_to_migratetype(gfp_mask),
 	};
 
+	if (cpusets_enabled()) {
+		alloc_flags |= ALLOC_CPUSET;
+		if (!ac.nodemask)
+			ac.nodemask = &cpuset_current_mems_allowed;
+	}
+
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
@@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
-	/* We set it here, as __alloc_pages_slowpath might have changed it */
-	ac.zonelist = zonelist;
-
 	/* Dirty zone balancing only done in the fast path */
 	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/* The preferred zone is used for statistics later */
 	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-				ac.nodemask ? : &cpuset_current_mems_allowed,
-				&ac.preferred_zone);
+				ac.nodemask, &ac.preferred_zone);
 	if (!ac.preferred_zone)
 		goto out;
 	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
-- 
cgit v1.2.3


From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:38 -0700
Subject: mm, page_alloc: convert alloc_flags to unsigned

alloc_flags is a bitmask of flags but it is signed which does not
necessarily generate the best code depending on the compiler.  Even
without an impact, it makes more sense that this be unsigned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  6 +++---
 include/linux/mmzone.h     |  3 ++-
 mm/compaction.c            | 12 +++++++-----
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 26 ++++++++++++++------------
 5 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index d7c8de583a23..242b660f64e6 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-			int alloc_flags, const struct alloc_context *ac,
-			enum migrate_mode mode, int *contended);
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum migrate_mode mode, int *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx);
+		unsigned int alloc_flags, int classzone_idx);
 
 extern void defer_compaction(struct zone *zone, int order);
 extern bool compaction_deferred(struct zone *zone, int order);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cfcd7723edb6..327f0fa1e1ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
-		unsigned long mark, int classzone_idx, int alloc_flags);
+		unsigned long mark, int classzone_idx,
+		unsigned int alloc_flags);
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx);
 enum memmap_context {
diff --git a/mm/compaction.c b/mm/compaction.c
index 7487067b4613..8f339ca25621 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
  *   COMPACT_CONTINUE - If compaction should run now
  */
 static unsigned long __compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+					unsigned int alloc_flags,
+					int classzone_idx)
 {
 	int fragindex;
 	unsigned long watermark;
@@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
 }
 
 unsigned long compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+					unsigned int alloc_flags,
+					int classzone_idx)
 {
 	unsigned long ret;
 
@@ -1530,7 +1532,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone, int order,
 		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
-		int alloc_flags, int classzone_idx)
+		unsigned int alloc_flags, int classzone_idx)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500;
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-			int alloc_flags, const struct alloc_context *ac,
-			enum migrate_mode mode, int *contended)
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum migrate_mode mode, int *contended)
 {
 	int may_enter_fs = gfp_mask & __GFP_FS;
 	int may_perform_io = gfp_mask & __GFP_IO;
diff --git a/mm/internal.h b/mm/internal.h
index 098a89e3b97c..114593aab55c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -175,7 +175,7 @@ struct compact_control {
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
-	const int alloc_flags;		/* alloc flags of a direct compactor */
+	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
 	const int classzone_idx;	/* zone index of a direct compactor */
 	struct zone *zone;
 	int contended;			/* Signal need_sched() or lock
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f328cfb137d..094587a4ed81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned)
 }
 
 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-								int alloc_flags)
+							unsigned int alloc_flags)
 {
 	int i;
 	bool poisoned = true;
@@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
-			gfp_t gfp_flags, int alloc_flags, int migratetype)
+			gfp_t gfp_flags, unsigned int alloc_flags,
+			int migratetype)
 {
 	unsigned long flags;
 	struct page *page;
@@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * to check in the allocation paths if no pages are free.
  */
 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-			unsigned long mark, int classzone_idx, int alloc_flags,
+			unsigned long mark, int classzone_idx,
+			unsigned int alloc_flags,
 			long free_pages)
 {
 	long min = mark;
 	int o;
-	const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
 
 	/* free_pages may go negative - that's OK */
 	free_pages -= (1 << order) - 1;
@@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 }
 
 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+		      int classzone_idx, unsigned int alloc_flags)
 {
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 					zone_page_state(z, NR_FREE_PAGES));
@@ -2957,7 +2959,7 @@ out:
 /* Try memory compaction for high-order allocations before reclaim */
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
@@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 #else
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
@@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		unsigned long *did_some_progress)
 {
 	struct page *page = NULL;
@@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 		wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
 }
 
-static inline int
+static inline unsigned int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 
 	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
 	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	struct page *page = NULL;
-	int alloc_flags;
+	unsigned int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
@@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
-- 
cgit v1.2.3


From 09940a4f1e816abe3248fa0d185fc0e7f54c8c12 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:53 -0700
Subject: mm, page_alloc: simplify last cpupid reset

The current reset unnecessarily clears flags and makes pointless
calculations.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9c2852cabf01..2b97be1147ec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid);
 
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
-
-	page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
-	page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+	page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
 #else /* !CONFIG_NUMA_BALANCING */
-- 
cgit v1.2.3


From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:10 -0700
Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice

The allocator fast path looks up the first usable zone in a zonelist and
then get_page_from_freelist does the same job in the zonelist iterator.
This patch preserves the necessary information.

                                             4.6.0-rc2                  4.6.0-rc2
                                        fastmark-v1r20             initonce-v1r20
  Min      alloc-odr0-1               364.00 (  0.00%)           359.00 (  1.37%)
  Min      alloc-odr0-2               262.00 (  0.00%)           260.00 (  0.76%)
  Min      alloc-odr0-4               214.00 (  0.00%)           214.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           186.00 (  0.00%)
  Min      alloc-odr0-16              173.00 (  0.00%)           173.00 (  0.00%)
  Min      alloc-odr0-32              165.00 (  0.00%)           165.00 (  0.00%)
  Min      alloc-odr0-64              161.00 (  0.00%)           162.00 ( -0.62%)
  Min      alloc-odr0-128             159.00 (  0.00%)           161.00 ( -1.26%)
  Min      alloc-odr0-256             168.00 (  0.00%)           170.00 ( -1.19%)
  Min      alloc-odr0-512             180.00 (  0.00%)           181.00 ( -0.56%)
  Min      alloc-odr0-1024            190.00 (  0.00%)           190.00 (  0.00%)
  Min      alloc-odr0-2048            196.00 (  0.00%)           196.00 (  0.00%)
  Min      alloc-odr0-4096            202.00 (  0.00%)           202.00 (  0.00%)
  Min      alloc-odr0-8192            206.00 (  0.00%)           205.00 (  0.49%)
  Min      alloc-odr0-16384           206.00 (  0.00%)           205.00 (  0.49%)

The benefit is negligible and the results are within the noise but each
cycle counts.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c            | 10 +++++-----
 include/linux/mmzone.h | 18 +++++++++++-------
 mm/internal.h          |  2 +-
 mm/mempolicy.c         | 19 ++++++++++---------
 mm/page_alloc.c        | 34 ++++++++++++++++------------------
 5 files changed, 43 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index af0d9a82a8ed..754813a6962b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
  */
 static void free_more_memory(void)
 {
-	struct zone *zone;
+	struct zoneref *z;
 	int nid;
 
 	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 	yield();
 
 	for_each_online_node(nid) {
-		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-						gfp_zone(GFP_NOFS), NULL,
-						&zone);
-		if (zone)
+
+		z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+						gfp_zone(GFP_NOFS), NULL);
+		if (z->zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 						GFP_NOFS, NULL);
 	}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 327f0fa1e1ce..4b28d2f8125e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes,
-					struct zone **zone)
+					nodemask_t *nodes)
 {
-	struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+	return next_zones_zonelist(zonelist->_zonerefs,
 							highest_zoneidx, nodes);
-	*zone = zonelist_zone(z);
-	return z;
 }
 
 /**
@@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
  * within a given nodemask
  */
 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
-	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
+	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
 		zone;							\
 		z = next_zones_zonelist(++z, highidx, nodemask),	\
-			zone = zonelist_zone(z))			\
+			zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+	for (zone = z->zone;	\
+		zone;							\
+		z = next_zones_zonelist(++z, highidx, nodemask),	\
+			zone = zonelist_zone(z))
+
 
 /**
  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
diff --git a/mm/internal.h b/mm/internal.h
index 114593aab55c..d1ddd71c1bbf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 struct alloc_context {
 	struct zonelist *zonelist;
 	nodemask_t *nodemask;
-	struct zone *preferred_zone;
+	struct zoneref *preferred_zoneref;
 	int classzone_idx;
 	int migratetype;
 	enum zone_type high_zoneidx;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7f80ebcd6552..297d6854f849 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
 		return interleave_nodes(policy);
 
 	case MPOL_BIND: {
+		struct zoneref *z;
+
 		/*
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
 		struct zonelist *zonelist;
-		struct zone *zone;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
 		zonelist = &NODE_DATA(node)->node_zonelists[0];
-		(void)first_zones_zonelist(zonelist, highest_zoneidx,
-							&policy->v.nodes,
-							&zone);
-		return zone ? zone->node : node;
+		z = first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes);
+		return z->zone ? z->zone->node : node;
 	}
 
 	default:
@@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n)
 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol;
-	struct zone *zone;
+	struct zoneref *z;
 	int curnid = page_to_nid(page);
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
@@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_BIND:
+
 		/*
 		 * allows binding to multiple nodes.
 		 * use current page if in policy nodemask,
@@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 */
 		if (node_isset(curnid, pol->v.nodes))
 			goto out;
-		(void)first_zones_zonelist(
+		z = first_zones_zonelist(
 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
 				gfp_zone(GFP_HIGHUSER),
-				&pol->v.nodes, &zone);
-		polnid = zone->node;
+				&pol->v.nodes);
+		polnid = z->zone->node;
 		break;
 
 	default:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 732875b1bdfb..dba8cfd0b2d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2704,7 +2704,7 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
-	struct zoneref *z;
+	struct zoneref *z = ac->preferred_zoneref;
 	struct zone *zone;
 	bool fair_skipped = false;
 	bool apply_fair = (alloc_flags & ALLOC_FAIR);
@@ -2714,7 +2714,7 @@ zonelist_scan:
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 		struct page *page;
 		unsigned long mark;
@@ -2734,7 +2734,7 @@ zonelist_scan:
 				fair_skipped = true;
 				continue;
 			}
-			if (!zone_local(ac->preferred_zone, zone)) {
+			if (!zone_local(ac->preferred_zoneref->zone, zone)) {
 				if (fair_skipped)
 					goto reset_fair;
 				apply_fair = false;
@@ -2780,7 +2780,7 @@ zonelist_scan:
 				goto try_this_zone;
 
 			if (zone_reclaim_mode == 0 ||
-			    !zone_allows_reclaim(ac->preferred_zone, zone))
+			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
 				continue;
 
 			ret = zone_reclaim(zone, gfp_mask, order);
@@ -2802,7 +2802,7 @@ zonelist_scan:
 		}
 
 try_this_zone:
-		page = buffered_rmqueue(ac->preferred_zone, zone, order,
+		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
 				gfp_mask, alloc_flags, ac->migratetype);
 		if (page) {
 			if (prep_new_page(page, order, gfp_mask, alloc_flags))
@@ -2831,7 +2831,7 @@ try_this_zone:
 reset_fair:
 		apply_fair = false;
 		fair_skipped = false;
-		reset_alloc_batches(ac->preferred_zone);
+		reset_alloc_batches(ac->preferred_zoneref->zone);
 		goto zonelist_scan;
 	}
 
@@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
 						ac->high_zoneidx, ac->nodemask)
-		wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+		wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref));
 }
 
 static inline unsigned int
@@ -3332,7 +3332,7 @@ retry:
 	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
 	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
 		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
 		goto retry;
 	}
 
@@ -3370,7 +3370,6 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
-	struct zoneref *preferred_zoneref;
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
@@ -3416,14 +3415,14 @@ retry_cpuset:
 	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/* The preferred zone is used for statistics later */
-	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-				ac.nodemask, &ac.preferred_zone);
-	if (!ac.preferred_zone) {
+	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+					ac.high_zoneidx, ac.nodemask);
+	if (!ac.preferred_zoneref) {
 		page = NULL;
 		goto no_zone;
 	}
 
-	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+	ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref);
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat)
  */
 int local_memory_node(int node)
 {
-	struct zone *zone;
+	struct zoneref *z;
 
-	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
 				   gfp_zone(GFP_KERNEL),
-				   NULL,
-				   &zone);
-	return zone->node;
+				   NULL);
+	return z->zone->node;
 }
 #endif
 
-- 
cgit v1.2.3


From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:27 -0700
Subject: mm, page_alloc: inline pageblock lookup in page free fast paths

The function call overhead of get_pfnblock_flags_mask() is measurable in
the page free paths.  This patch uses an inlined version that is faster.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |   7 --
 mm/page_alloc.c        | 188 ++++++++++++++++++++++++++-----------------------
 mm/page_owner.c        |   2 +-
 mm/vmstat.c            |   2 +-
 4 files changed, 102 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4b28d2f8125e..c60db2096fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
 	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
 			PB_migrate_end, MIGRATETYPE_MASK)
 
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
-	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
-					MIGRATETYPE_MASK);
-}
-
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 822ce86fc883..bdf7a13311b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 }
 #endif
 
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+							unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	return __pfn_to_section(pfn)->pageblock_flags;
+#else
+	return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	pfn &= (PAGES_PER_SECTION-1);
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+					unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	unsigned long *bitmap;
+	unsigned long bitidx, word_bitidx;
+	unsigned long word;
+
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
+	word_bitidx = bitidx / BITS_PER_LONG;
+	bitidx &= (BITS_PER_LONG-1);
+
+	word = bitmap[word_bitidx];
+	bitidx += end_bitidx;
+	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+					unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	unsigned long *bitmap;
+	unsigned long bitidx, word_bitidx;
+	unsigned long old_word, word;
+
+	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
+	word_bitidx = bitidx / BITS_PER_LONG;
+	bitidx &= (BITS_PER_LONG-1);
+
+	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+	bitidx += end_bitidx;
+	mask <<= (BITS_PER_LONG - bitidx - 1);
+	flags <<= (BITS_PER_LONG - bitidx - 1);
+
+	word = READ_ONCE(bitmap[word_bitidx]);
+	for (;;) {
+		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+		if (word == old_word)
+			break;
+		word = old_word;
+	}
+}
 
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
@@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	return table;
 }
 
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
-							unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	return __pfn_to_section(pfn)->pageblock_flags;
-#else
-	return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	pfn &= (PAGES_PER_SECTION-1);
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
-	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-					unsigned long end_bitidx,
-					unsigned long mask)
-{
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
-	unsigned long word;
-
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
-
-	word = bitmap[word_bitidx];
-	bitidx += end_bitidx;
-	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
-					unsigned long pfn,
-					unsigned long end_bitidx,
-					unsigned long mask)
-{
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
-	unsigned long old_word, word;
-
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
-
-	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
-
-	bitidx += end_bitidx;
-	mask <<= (BITS_PER_LONG - bitidx - 1);
-	flags <<= (BITS_PER_LONG - bitidx - 1);
-
-	word = READ_ONCE(bitmap[word_bitidx]);
-	for (;;) {
-		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-		if (word == old_word)
-			break;
-		word = old_word;
-	}
-}
-
 /*
  * This function checks whether pageblock includes unmovable pages or not.
  * If @count is not zero, it is okay to include less @count unmovable pages
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 438768c092ac..792b56da13d8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		goto err;
 
 	/* Print information relevant to grouping pages by mobility */
-	pageblock_mt = get_pfnblock_migratetype(page, pfn);
+	pageblock_mt = get_pageblock_migratetype(page);
 	page_mt  = gfpflags_to_migratetype(page_ext->gfp_mask);
 	ret += snprintf(kbuf + ret, count - ret,
 			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f1a73bfb77b5..5b72a8ad2813 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
 		page = pfn_to_page(pfn);
-		pageblock_mt = get_pfnblock_migratetype(page, pfn);
+		pageblock_mt = get_pageblock_migratetype(page);
 
 		for (; pfn < block_end_pfn; pfn++) {
 			if (!pfn_valid_within(pfn))
-- 
cgit v1.2.3


From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:30 -0700
Subject: cpuset: use static key better and convert to new API

An important function for cpusets is cpuset_node_allowed(), which
optimizes on the fact if there's a single root CPU set, it must be
trivially allowed.  But the check "nr_cpusets() <= 1" doesn't use the
cpusets_enabled_key static key the right way where static keys eliminate
branching overhead with jump labels.

This patch converts it so that static key is used properly.  It's also
switched to the new static key API and the checking functions are
converted to return bool instead of int.  We also provide a new variant
__cpuset_zone_allowed() which expects that the static key check was
already done and they key was enabled.  This is needed for
get_page_from_freelist() where we want to also avoid the relatively
slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected
but the cleanup in itself is worthwhile.

                                             4.6.0-rc2                  4.6.0-rc2
                                       multcheck-v1r20               cpuset-v1r20
  Min      alloc-odr0-1               348.00 (  0.00%)           348.00 (  0.00%)
  Min      alloc-odr0-2               254.00 (  0.00%)           254.00 (  0.00%)
  Min      alloc-odr0-4               213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-16              173.00 (  0.00%)           171.00 (  1.16%)
  Min      alloc-odr0-32              166.00 (  0.00%)           163.00 (  1.81%)
  Min      alloc-odr0-64              162.00 (  0.00%)           159.00 (  1.85%)
  Min      alloc-odr0-128             160.00 (  0.00%)           157.00 (  1.88%)
  Min      alloc-odr0-256             169.00 (  0.00%)           166.00 (  1.78%)
  Min      alloc-odr0-512             180.00 (  0.00%)           180.00 (  0.00%)
  Min      alloc-odr0-1024            188.00 (  0.00%)           187.00 (  0.53%)
  Min      alloc-odr0-2048            194.00 (  0.00%)           193.00 (  0.52%)
  Min      alloc-odr0-4096            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-8192            202.00 (  0.00%)           201.00 (  0.50%)
  Min      alloc-odr0-16384           203.00 (  0.00%)           202.00 (  0.49%)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++--------------
 kernel/cpuset.c        | 14 +++++++-------
 mm/page_alloc.c        |  2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
 
 #ifdef CONFIG_CPUSETS
 
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
-	return static_key_false(&cpusets_enabled_key);
+	return static_branch_unlikely(&cpusets_enabled_key);
 }
 
 static inline int nr_cpusets(void)
 {
 	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key) + 1;
+	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
 static inline void cpuset_inc(void)
 {
-	static_key_slow_inc(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_key_slow_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_enabled_key);
 }
 
 extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+	if (cpusets_enabled())
+		return __cpuset_node_allowed(node, gfp_mask);
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	if (cpusets_enabled())
+		return __cpuset_zone_allowed(z, gfp_mask);
+	return true;
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return true;
 }
 
 static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 611cc69af8f0..73e93e53884d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */
 
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
-		return 1;
+		return true;
 	if (node_isset(node, current->mems_allowed))
-		return 1;
+		return true;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
+		return true;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
-		return 0;
+		return false;
 
 	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return 1;
+		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
 	spin_lock_irqsave(&callback_lock, flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdf7a13311b5..39c441bb8d61 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2859,7 +2859,7 @@ zonelist_scan:
 
 		if (cpusets_enabled() &&
 			(alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!__cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 		/*
 		 * Distribute pages in proportion to the individual
-- 
cgit v1.2.3


From 44bfc42e94cd76a0bd44f3fce98d4a7b76f31bc0 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Wed, 4 May 2016 14:35:48 +0100
Subject: KVM: arm/arm64: move GICv2 emulation defines into arm-gic-v3.h

As (some) GICv3 hosts can emulate a GICv2, some GICv2 specific masks
for the list register definition also apply to GICv3 LRs.
At the moment we have those definitions in the KVM VGICv3
implementation, so let's move them into the GICv3 header file to
have them automatically defined.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 5 +++++
 virt/kvm/arm/vgic-v3.c             | 8 +-------
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index d5d798b35c1f..ec938d14da5d 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -276,6 +276,11 @@
 #define ICH_LR_PHYS_ID_SHIFT		32
 #define ICH_LR_PHYS_ID_MASK		(0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
 
+/* These are for GICv2 emulation only */
+#define GICH_LR_VIRTUALID		(0x3ffUL << 0)
+#define GICH_LR_PHYSID_CPUID_SHIFT	(10)
+#define GICH_LR_PHYSID_CPUID		(7UL << GICH_LR_PHYSID_CPUID_SHIFT)
+
 #define ICH_MISR_EOI			(1 << 0)
 #define ICH_MISR_U			(1 << 1)
 
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index c02a1b1cf855..75b02fa86436 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -29,12 +29,6 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
 
-/* These are for GICv2 emulation only */
-#define GICH_LR_VIRTUALID		(0x3ffUL << 0)
-#define GICH_LR_PHYSID_CPUID_SHIFT	(10)
-#define GICH_LR_PHYSID_CPUID		(7UL << GICH_LR_PHYSID_CPUID_SHIFT)
-#define ICH_LR_VIRTUALID_MASK		(BIT_ULL(32) - 1)
-
 static u32 ich_vtr_el2;
 
 static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
@@ -43,7 +37,7 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
 	u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
 
 	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-		lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
+		lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
 	else
 		lr_desc.irq = val & GICH_LR_VIRTUALID;
 
-- 
cgit v1.2.3


From 140b086dd19771410915a924db2e635c2b51a0f4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 26 Nov 2015 17:19:25 +0000
Subject: KVM: arm/arm64: vgic-new: Add GICv2 world switch backend

Processing maintenance interrupts and accessing the list registers
are dependent on the host's GIC version.
Introduce vgic-v2.c to contain GICv2 specific functions.
Implement the GICv2 specific code for syncing the emulation state
into the VGIC registers.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Eric Auger <eric.auger@linaro.org>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Eric Auger <eric.auger@linaro.org>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/linux/irqchip/arm-gic.h |   1 +
 virt/kvm/arm/vgic/vgic-v2.c     | 176 ++++++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.c        |   6 ++
 virt/kvm/arm/vgic/vgic.h        |   6 ++
 4 files changed, 189 insertions(+)
 create mode 100644 virt/kvm/arm/vgic/vgic-v2.c

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 9c940263ca23..be0d26f940af 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -76,6 +76,7 @@
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
 #define GICH_LR_PHYSID_CPUID		(0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PRIORITY_SHIFT		23
 #define GICH_LR_STATE			(3 << 28)
 #define GICH_LR_PENDING_BIT		(1 << 28)
 #define GICH_LR_ACTIVE_BIT		(1 << 29)
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
new file mode 100644
index 000000000000..fb5e65ceffd0
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include "vgic.h"
+
+/*
+ * Call this function to convert a u64 value to an unsigned long * bitmask
+ * in a way that works on both 32-bit and 64-bit LE and BE platforms.
+ *
+ * Warning: Calling this function may modify *val.
+ */
+static unsigned long *u64_to_bitmask(u64 *val)
+{
+#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
+	*val = (*val >> 32) | (*val << 32);
+#endif
+	return (unsigned long *)val;
+}
+
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	if (cpuif->vgic_misr & GICH_MISR_EOI) {
+		u64 eisr = cpuif->vgic_eisr;
+		unsigned long *eisr_bmap = u64_to_bitmask(&eisr);
+		int lr;
+
+		for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) {
+			u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID;
+
+			WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE);
+
+			kvm_notify_acked_irq(vcpu->kvm, 0,
+					     intid - VGIC_NR_PRIVATE_IRQS);
+		}
+	}
+
+	/* check and disable underflow maintenance IRQ */
+	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+	/*
+	 * In the next iterations of the vcpu loop, if we sync the
+	 * vgic state after flushing it, but before entering the guest
+	 * (this happens for pending signals and vmid rollovers), then
+	 * make sure we don't pick up any old maintenance interrupts
+	 * here.
+	 */
+	cpuif->vgic_eisr = 0;
+}
+
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ *   - transferred as is in case of edge sensitive IRQs
+ *   - set to the line-level (resample time) for level sensitive IRQs
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+	int lr;
+
+	for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+		u32 val = cpuif->vgic_lr[lr];
+		u32 intid = val & GICH_LR_VIRTUALID;
+		struct vgic_irq *irq;
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+		spin_lock(&irq->irq_lock);
+
+		/* Always preserve the active bit */
+		irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+		/* Edge is the only case where we preserve the pending bit */
+		if (irq->config == VGIC_CONFIG_EDGE &&
+		    (val & GICH_LR_PENDING_BIT)) {
+			irq->pending = true;
+
+			if (vgic_irq_is_sgi(intid)) {
+				u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+				cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+				irq->source |= (1 << cpuid);
+			}
+		}
+
+		/* Clear soft pending state when level IRQs have been acked */
+		if (irq->config == VGIC_CONFIG_LEVEL &&
+		    !(val & GICH_LR_PENDING_BIT)) {
+			irq->soft_pending = false;
+			irq->pending = irq->line_level;
+		}
+
+		spin_unlock(&irq->irq_lock);
+	}
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ *   it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+	u32 val = irq->intid;
+
+	if (irq->pending) {
+		val |= GICH_LR_PENDING_BIT;
+
+		if (irq->config == VGIC_CONFIG_EDGE)
+			irq->pending = false;
+
+		if (vgic_irq_is_sgi(irq->intid)) {
+			u32 src = ffs(irq->source);
+
+			BUG_ON(!src);
+			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+			irq->source &= ~(1 << (src - 1));
+			if (irq->source)
+				irq->pending = true;
+		}
+	}
+
+	if (irq->active)
+		val |= GICH_LR_ACTIVE_BIT;
+
+	if (irq->hw) {
+		val |= GICH_LR_HW;
+		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+	} else {
+		if (irq->config == VGIC_CONFIG_LEVEL)
+			val |= GICH_LR_EOI;
+	}
+
+	/* The GICv2 LR only holds five bits of priority. */
+	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 08a862a98442..44d2533ac84e 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -400,10 +400,12 @@ retry:
 
 static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu)
 {
+	vgic_v2_process_maintenance(vcpu);
 }
 
 static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
 {
+	vgic_v2_fold_lr_state(vcpu);
 }
 
 /* Requires the irq_lock to be held. */
@@ -411,14 +413,18 @@ static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
 				    struct vgic_irq *irq, int lr)
 {
 	DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+	vgic_v2_populate_lr(vcpu, irq, lr);
 }
 
 static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
 {
+	vgic_v2_clear_lr(vcpu, lr);
 }
 
 static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
 {
+	vgic_v2_set_underflow(vcpu);
 }
 
 /* Requires the ap_list_lock to be held. */
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 29b96b96a30b..0db490e491ef 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -22,4 +22,10 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
 
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+
 #endif
-- 
cgit v1.2.3


From 59529f69f5048e50dcde3434661981c01f8208b4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 30 Nov 2015 13:09:53 +0000
Subject: KVM: arm/arm64: vgic-new: Add GICv3 world switch backend

As the GICv3 virtual interface registers differ from their GICv2
siblings, we need different handlers for processing maintenance
interrupts and reading/writing to the LRs.
Implement the respective handler functions and connect them to
existing code to be called if the host is using a GICv3.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/linux/irqchip/arm-gic-v3.h |   1 +
 virt/kvm/arm/vgic/vgic-v3.c        | 162 +++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.c           |  25 ++++--
 virt/kvm/arm/vgic/vgic.h           |  29 +++++++
 4 files changed, 212 insertions(+), 5 deletions(-)
 create mode 100644 virt/kvm/arm/vgic/vgic-v3.c

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index ec938d14da5d..35e93cfa1742 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -275,6 +275,7 @@
 #define ICH_LR_ACTIVE_BIT		(1ULL << 63)
 #define ICH_LR_PHYS_ID_SHIFT		32
 #define ICH_LR_PHYS_ID_MASK		(0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
+#define ICH_LR_PRIORITY_SHIFT		48
 
 /* These are for GICv2 emulation only */
 #define GICH_LR_VIRTUALID		(0x3ffUL << 0)
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
new file mode 100644
index 000000000000..fb547da7a43d
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -0,0 +1,162 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include "vgic.h"
+
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+
+	if (cpuif->vgic_misr & ICH_MISR_EOI) {
+		unsigned long eisr_bmap = cpuif->vgic_eisr;
+		int lr;
+
+		for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) {
+			u32 intid;
+			u64 val = cpuif->vgic_lr[lr];
+
+			if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+				intid = val & ICH_LR_VIRTUAL_ID_MASK;
+			else
+				intid = val & GICH_LR_VIRTUALID;
+
+			WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE);
+
+			kvm_notify_acked_irq(vcpu->kvm, 0,
+					     intid - VGIC_NR_PRIVATE_IRQS);
+		}
+
+		/*
+		 * In the next iterations of the vcpu loop, if we sync
+		 * the vgic state after flushing it, but before
+		 * entering the guest (this happens for pending
+		 * signals and vmid rollovers), then make sure we
+		 * don't pick up any old maintenance interrupts here.
+		 */
+		cpuif->vgic_eisr = 0;
+	}
+
+	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+}
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	int lr;
+
+	for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+		u64 val = cpuif->vgic_lr[lr];
+		u32 intid;
+		struct vgic_irq *irq;
+
+		if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+			intid = val & ICH_LR_VIRTUAL_ID_MASK;
+		else
+			intid = val & GICH_LR_VIRTUALID;
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+		spin_lock(&irq->irq_lock);
+
+		/* Always preserve the active bit */
+		irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+		/* Edge is the only case where we preserve the pending bit */
+		if (irq->config == VGIC_CONFIG_EDGE &&
+		    (val & ICH_LR_PENDING_BIT)) {
+			irq->pending = true;
+
+			if (vgic_irq_is_sgi(intid) &&
+			    model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+				u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+				cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+				irq->source |= (1 << cpuid);
+			}
+		}
+
+		/* Clear soft pending state when level irqs have been acked */
+		if (irq->config == VGIC_CONFIG_LEVEL &&
+		    !(val & ICH_LR_PENDING_BIT)) {
+			irq->soft_pending = false;
+			irq->pending = irq->line_level;
+		}
+
+		spin_unlock(&irq->irq_lock);
+	}
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	u64 val = irq->intid;
+
+	if (irq->pending) {
+		val |= ICH_LR_PENDING_BIT;
+
+		if (irq->config == VGIC_CONFIG_EDGE)
+			irq->pending = false;
+
+		if (vgic_irq_is_sgi(irq->intid) &&
+		    model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+			u32 src = ffs(irq->source);
+
+			BUG_ON(!src);
+			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+			irq->source &= ~(1 << (src - 1));
+			if (irq->source)
+				irq->pending = true;
+		}
+	}
+
+	if (irq->active)
+		val |= ICH_LR_ACTIVE_BIT;
+
+	if (irq->hw) {
+		val |= ICH_LR_HW;
+		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+	} else {
+		if (irq->config == VGIC_CONFIG_LEVEL)
+			val |= ICH_LR_EOI;
+	}
+
+	/*
+	 * We currently only support Group1 interrupts, which is a
+	 * known defect. This needs to be addressed at some point.
+	 */
+	if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+		val |= ICH_LR_GROUP;
+
+	val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 44d2533ac84e..0bf0d2060053 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -400,12 +400,18 @@ retry:
 
 static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu)
 {
-	vgic_v2_process_maintenance(vcpu);
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_process_maintenance(vcpu);
+	else
+		vgic_v3_process_maintenance(vcpu);
 }
 
 static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
 {
-	vgic_v2_fold_lr_state(vcpu);
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_fold_lr_state(vcpu);
+	else
+		vgic_v3_fold_lr_state(vcpu);
 }
 
 /* Requires the irq_lock to be held. */
@@ -414,17 +420,26 @@ static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
 {
 	DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
 
-	vgic_v2_populate_lr(vcpu, irq, lr);
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_populate_lr(vcpu, irq, lr);
+	else
+		vgic_v3_populate_lr(vcpu, irq, lr);
 }
 
 static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
 {
-	vgic_v2_clear_lr(vcpu, lr);
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_clear_lr(vcpu, lr);
+	else
+		vgic_v3_clear_lr(vcpu, lr);
 }
 
 static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
 {
-	vgic_v2_set_underflow(vcpu);
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_set_underflow(vcpu);
+	else
+		vgic_v3_set_underflow(vcpu);
 }
 
 /* Requires the ap_list_lock to be held. */
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 0db490e491ef..81b1a20dfbc5 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -28,4 +28,33 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+#else
+static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
+				       struct vgic_irq *irq, int lr)
+{
+}
+
+static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+}
+
+static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 2b0cda8789654bfcebca397daebc37aff081bd75 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 26 Apr 2016 11:06:47 +0100
Subject: KVM: arm/arm64: vgic-new: Add CTLR, TYPER and IIDR handlers

Those three registers are v2 emulation specific, so their implementation
lives entirely in vgic-mmio-v2.c. Also they are handled in one function,
as their implementation is pretty simple.
When the guest enables the distributor, we kick all VCPUs to get
potentially pending interrupts serviced.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/linux/irqchip/arm-gic.h  |  1 +
 virt/kvm/arm/vgic/vgic-mmio-v2.c | 46 +++++++++++++++++++++++++++++++++++++++-
 virt/kvm/arm/vgic/vgic.c         | 15 +++++++++++++
 virt/kvm/arm/vgic/vgic.h         |  4 ++++
 4 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index be0d26f940af..fd051855539b 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -33,6 +33,7 @@
 
 #define GIC_DIST_CTRL			0x000
 #define GIC_DIST_CTR			0x004
+#define GIC_DIST_IIDR			0x008
 #define GIC_DIST_IGROUP			0x080
 #define GIC_DIST_ENABLE_SET		0x100
 #define GIC_DIST_ENABLE_CLEAR		0x180
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index a3e31a93a3fb..d812c933708a 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -20,9 +20,53 @@
 #include "vgic.h"
 #include "vgic-mmio.h"
 
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	u32 value;
+
+	switch (addr & 0x0c) {
+	case GIC_DIST_CTRL:
+		value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0;
+		break;
+	case GIC_DIST_CTR:
+		value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+		value = (value >> 5) - 1;
+		value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
+		break;
+	case GIC_DIST_IIDR:
+		value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+		break;
+	default:
+		return 0;
+	}
+
+	return value;
+}
+
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	bool was_enabled = dist->enabled;
+
+	switch (addr & 0x0c) {
+	case GIC_DIST_CTRL:
+		dist->enabled = val & GICD_ENABLE;
+		if (!was_enabled && dist->enabled)
+			vgic_kick_vcpus(vcpu->kvm);
+		break;
+	case GIC_DIST_CTR:
+	case GIC_DIST_IIDR:
+		/* Nothing to do */
+		return;
+	}
+}
+
 static const struct vgic_register_region vgic_v2_dist_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 12,
+		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
 		vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 7e010087224c..12ae84b4931f 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -545,3 +545,18 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 
 	return pending;
 }
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int c;
+
+	/*
+	 * We've injected an interrupt, time to find out who deserves
+	 * a good kick...
+	 */
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		if (kvm_vgic_vcpu_pending_irq(vcpu))
+			kvm_vcpu_kick(vcpu);
+	}
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index fd9acaa1e305..cf620157e1e4 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -16,11 +16,15 @@
 #ifndef __KVM_ARM_VGIC_NEW_H__
 #define __KVM_ARM_VGIC_NEW_H__
 
+#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
+#define IMPLEMENTER_ARM		0x43b
+
 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
 
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_kick_vcpus(struct kvm *kvm);
 
 void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From 7e13318daa4a67bff2f800923a993ef3818b3c53 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 18 May 2016 09:06:10 -0700
Subject: net: define gso types for IPx over IPv4 and IPv6

This patch defines two new GSO definitions SKB_GSO_IPXIP4 and
SKB_GSO_IPXIP6 along with corresponding NETIF_F_GSO_IPXIP4 and
NETIF_F_GSO_IPXIP6. These are used to described IP in IP
tunnel and what the outer protocol is. The inner protocol
can be deduced from other GSO types (e.g. SKB_GSO_TCPV4 and
SKB_GSO_TCPV6). The GSO types of SKB_GSO_IPIP and SKB_GSO_SIT
are removed (these are both instances of SKB_GSO_IPXIP4).
SKB_GSO_IPXIP6 will be used when support for GSO with IP
encapsulation over IPv6 is added.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  |  5 ++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  5 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c       |  3 +--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c       |  3 +--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c     |  3 +--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |  3 +--
 drivers/net/ethernet/intel/igb/igb_main.c         |  3 +--
 drivers/net/ethernet/intel/igbvf/netdev.c         |  3 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  3 +--
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  3 +--
 include/linux/netdev_features.h                   | 12 ++++++------
 include/linux/netdevice.h                         |  4 ++--
 include/linux/skbuff.h                            |  4 ++--
 net/core/ethtool.c                                |  4 ++--
 net/ipv4/af_inet.c                                |  2 +-
 net/ipv4/ipip.c                                   |  2 +-
 net/ipv6/ip6_offload.c                            |  4 ++--
 net/ipv6/sit.c                                    |  4 ++--
 net/netfilter/ipvs/ip_vs_xmit.c                   | 17 +++++++----------
 19 files changed, 37 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index d465bd721146..0a5b770cefaa 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13259,12 +13259,11 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev,
 		NETIF_F_RXHASH | NETIF_F_HW_VLAN_CTAG_TX;
 	if (!chip_is_e1x) {
 		dev->hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL |
-				    NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT;
+				    NETIF_F_GSO_IPXIP4;
 		dev->hw_enc_features =
 			NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG |
 			NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 |
-			NETIF_F_GSO_IPIP |
-			NETIF_F_GSO_SIT |
+			NETIF_F_GSO_IPXIP4 |
 			NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL;
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5a0dca3e6ef6..72a2efff8e49 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6311,7 +6311,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG |
 			   NETIF_F_TSO | NETIF_F_TSO6 |
 			   NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE |
-			   NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT |
+			   NETIF_F_GSO_IPXIP4 |
 			   NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
 			   NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH |
 			   NETIF_F_RXCSUM | NETIF_F_LRO | NETIF_F_GRO;
@@ -6321,8 +6321,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			NETIF_F_TSO | NETIF_F_TSO6 |
 			NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE |
 			NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
-			NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT |
-			NETIF_F_GSO_PARTIAL;
+			NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_PARTIAL;
 	dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM |
 				    NETIF_F_GSO_GRE_CSUM;
 	dev->vlan_features = dev->hw_features | NETIF_F_HIGHDMA;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 1cd0ebf7520a..242a1ff344e6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9083,8 +9083,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 				   NETIF_F_TSO6			|
 				   NETIF_F_GSO_GRE		|
 				   NETIF_F_GSO_GRE_CSUM		|
-				   NETIF_F_GSO_IPIP		|
-				   NETIF_F_GSO_SIT		|
+				   NETIF_F_GSO_IPXIP4		|
 				   NETIF_F_GSO_UDP_TUNNEL	|
 				   NETIF_F_GSO_UDP_TUNNEL_CSUM	|
 				   NETIF_F_GSO_PARTIAL		|
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 99a524db5560..0a8122c00ae2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2284,8 +2284,7 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
 
 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
 					 SKB_GSO_GRE_CSUM |
-					 SKB_GSO_IPIP |
-					 SKB_GSO_SIT |
+					 SKB_GSO_IPXIP4 |
 					 SKB_GSO_UDP_TUNNEL |
 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index fd7dae46c5d8..2bbbbd0f9f15 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1559,8 +1559,7 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
 
 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
 					 SKB_GSO_GRE_CSUM |
-					 SKB_GSO_IPIP |
-					 SKB_GSO_SIT |
+					 SKB_GSO_IPXIP4 |
 					 SKB_GSO_UDP_TUNNEL |
 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 642bb45ed906..02d0a1ca6960 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2230,8 +2230,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
 				   NETIF_F_TSO6			|
 				   NETIF_F_GSO_GRE		|
 				   NETIF_F_GSO_GRE_CSUM		|
-				   NETIF_F_GSO_IPIP		|
-				   NETIF_F_GSO_SIT		|
+				   NETIF_F_GSO_IPXIP4		|
 				   NETIF_F_GSO_UDP_TUNNEL	|
 				   NETIF_F_GSO_UDP_TUNNEL_CSUM	|
 				   NETIF_F_GSO_PARTIAL		|
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 21727692bef6..b1a5cdb77088 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2418,8 +2418,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 #define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				  NETIF_F_GSO_GRE_CSUM | \
-				  NETIF_F_GSO_IPIP | \
-				  NETIF_F_GSO_SIT | \
+				  NETIF_F_GSO_IPXIP4 | \
 				  NETIF_F_GSO_UDP_TUNNEL | \
 				  NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index 322a2d7828a5..79b907f1a520 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -2763,8 +2763,7 @@ static int igbvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 #define IGBVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				    NETIF_F_GSO_GRE_CSUM | \
-				    NETIF_F_GSO_IPIP | \
-				    NETIF_F_GSO_SIT | \
+				    NETIF_F_GSO_IPXIP4 | \
 				    NETIF_F_GSO_UDP_TUNNEL | \
 				    NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 9f3677c7e96f..69452c379cbc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9482,8 +9482,7 @@ skip_sriov:
 
 #define IXGBE_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				    NETIF_F_GSO_GRE_CSUM | \
-				    NETIF_F_GSO_IPIP | \
-				    NETIF_F_GSO_SIT | \
+				    NETIF_F_GSO_IPXIP4 | \
 				    NETIF_F_GSO_UDP_TUNNEL | \
 				    NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 5e348b125090..d86e51116384 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4062,8 +4062,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 #define IXGBEVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				      NETIF_F_GSO_GRE_CSUM | \
-				      NETIF_F_GSO_IPIP | \
-				      NETIF_F_GSO_SIT | \
+				      NETIF_F_GSO_IPXIP4 | \
 				      NETIF_F_GSO_UDP_TUNNEL | \
 				      NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index bc8736266749..aa7b2400f98c 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -44,8 +44,8 @@ enum {
 	NETIF_F_FSO_BIT,		/* ... FCoE segmentation */
 	NETIF_F_GSO_GRE_BIT,		/* ... GRE with TSO */
 	NETIF_F_GSO_GRE_CSUM_BIT,	/* ... GRE with csum with TSO */
-	NETIF_F_GSO_IPIP_BIT,		/* ... IPIP tunnel with TSO */
-	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
+	NETIF_F_GSO_IPXIP4_BIT,		/* ... IP4 or IP6 over IP4 with TSO */
+	NETIF_F_GSO_IPXIP6_BIT,		/* ... IP4 or IP6 over IP6 with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
 	NETIF_F_GSO_PARTIAL_BIT,	/* ... Only segment inner-most L4
@@ -121,8 +121,8 @@ enum {
 #define NETIF_F_RXALL		__NETIF_F(RXALL)
 #define NETIF_F_GSO_GRE		__NETIF_F(GSO_GRE)
 #define NETIF_F_GSO_GRE_CSUM	__NETIF_F(GSO_GRE_CSUM)
-#define NETIF_F_GSO_IPIP	__NETIF_F(GSO_IPIP)
-#define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
+#define NETIF_F_GSO_IPXIP4	__NETIF_F(GSO_IPXIP4)
+#define NETIF_F_GSO_IPXIP6	__NETIF_F(GSO_IPXIP6)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
@@ -200,8 +200,8 @@ enum {
 
 #define NETIF_F_GSO_ENCAP_ALL	(NETIF_F_GSO_GRE |			\
 				 NETIF_F_GSO_GRE_CSUM |			\
-				 NETIF_F_GSO_IPIP |			\
-				 NETIF_F_GSO_SIT |			\
+				 NETIF_F_GSO_IPXIP4 |			\
+				 NETIF_F_GSO_IPXIP6 |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
 				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c148edfe4965..f45929ce8157 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4006,8 +4006,8 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
-	BUILD_BUG_ON(SKB_GSO_IPIP    != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT));
-	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c413c588a24f..65968a97517f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -471,9 +471,9 @@ enum {
 
 	SKB_GSO_GRE_CSUM = 1 << 8,
 
-	SKB_GSO_IPIP = 1 << 9,
+	SKB_GSO_IPXIP4 = 1 << 9,
 
-	SKB_GSO_SIT = 1 << 10,
+	SKB_GSO_IPXIP6 = 1 << 10,
 
 	SKB_GSO_UDP_TUNNEL = 1 << 11,
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index bdb4013581b1..f4034817d255 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -84,8 +84,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
 	[NETIF_F_GSO_GRE_CSUM_BIT] =	 "tx-gre-csum-segmentation",
-	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
-	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
+	[NETIF_F_GSO_IPXIP4_BIT] =	 "tx-ipxip4-segmentation",
+	[NETIF_F_GSO_IPXIP6_BIT] =	 "tx-ipxip6-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7f08d4525981..25040b183a60 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1483,7 +1483,7 @@ out_unlock:
 static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	skb->encapsulation = 1;
-	skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
 	return inet_gro_complete(skb, nhoff);
 }
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 92827483ee3d..978370132f29 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
 	skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 9ad743b2c624..787e55f4796c 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -86,7 +86,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
 
 	if (skb->encapsulation &&
-	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
 		udpfrag = proto == IPPROTO_UDP && encap;
 	else
 		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
@@ -294,7 +294,7 @@ out_unlock:
 static int sit_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	skb->encapsulation = 1;
-	skb_shinfo(skb)->gso_type |= SKB_GSO_SIT;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
 	return ipv6_gro_complete(skb, nhoff);
 }
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a13d8c114ccb..0a5a255277e5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -913,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		goto tx_error;
 	}
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_SIT)) {
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
 		ip_rt_put(rt);
 		goto tx_error;
 	}
@@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
 	skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6d19d2eeaa60..01d3d894de46 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -932,17 +932,14 @@ error:
 
 static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
 {
-	if (encaps_af == AF_INET) {
-		if (orig_af == AF_INET)
-			return SKB_GSO_IPIP;
-
-		return SKB_GSO_SIT;
+	switch (encaps_af) {
+	case AF_INET:
+		return SKB_GSO_IPXIP4;
+	case AF_INET6:
+		return SKB_GSO_IPXIP6;
+	default:
+		return 0;
 	}
-
-	/* GSO: we need to provide proper SKB_GSO_ value for IPv6:
-	 * SKB_GSO_SIT/IPV6
-	 */
-	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From 95829b3a9c0b1d88778b23bc2afdf5a83de066ff Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 19 May 2016 11:30:54 -0400
Subject: net: suppress warnings on dev_alloc_skb

Noticed an allocation failure in a network driver the other day on a 32 bit
system:

DMA-API: debugging out of memory - disabling
bnx2fc: adapter_lookup: hba NULL
lldpad: page allocation failure. order:0, mode:0x4120
Pid: 4556, comm: lldpad Not tainted 2.6.32-639.el6.i686.debug #1
Call Trace:
 [<c08a4086>] ? printk+0x19/0x23
 [<c05166a4>] ? __alloc_pages_nodemask+0x664/0x830
 [<c0649d02>] ? free_object+0x82/0xa0
 [<fb4e2c9b>] ? ixgbe_alloc_rx_buffers+0x10b/0x1d0 [ixgbe]
 [<fb4e2fff>] ? ixgbe_configure_rx_ring+0x29f/0x420 [ixgbe]
 [<fb4e228c>] ? ixgbe_configure_tx_ring+0x15c/0x220 [ixgbe]
 [<fb4e3709>] ? ixgbe_configure+0x589/0xc00 [ixgbe]
 [<fb4e7be7>] ? ixgbe_open+0xa7/0x5c0 [ixgbe]
 [<fb503ce6>] ? ixgbe_init_interrupt_scheme+0x5b6/0x970 [ixgbe]
 [<fb4e8e54>] ? ixgbe_setup_tc+0x1a4/0x260 [ixgbe]
 [<fb505a9f>] ? ixgbe_dcbnl_set_state+0x7f/0x90 [ixgbe]
 [<c088d80d>] ? dcb_doit+0x10ed/0x16d0
...

Thought that perhaps the big splat in the logs wasn't really necessecary, as
all call sites for dev_alloc_skb:

a) check the return code for the function

and

b) either print their own error message or have a recovery path that makes the
warning moot.

Fix it by modifying dev_alloc_pages to pass __GFP_NOWARN as a gfp flag to
suppress the warning

applies to the net tree

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 65968a97517f..ee38a4127475 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2467,7 +2467,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 
 static inline struct page *dev_alloc_pages(unsigned int order)
 {
-	return __dev_alloc_pages(GFP_ATOMIC, order);
+	return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
 }
 
 /**
@@ -2485,7 +2485,7 @@ static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
 
 static inline struct page *dev_alloc_page(void)
 {
-	return __dev_alloc_page(GFP_ATOMIC);
+	return dev_alloc_pages(0);
 }
 
 /**
-- 
cgit v1.2.3


From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 20 May 2016 16:56:31 -0700
Subject: mm: vmscan: reduce size of inactive file list

The inactive file list should still be large enough to contain readahead
windows and freshly written file data, but it no longer is the only
source for detecting multiple accesses to file pages.  The workingset
refault measurement code causes recently evicted file pages that get
accessed again after a shorter interval to be promoted directly to the
active list.

With that mechanism in place, we can afford to (on a larger system)
dedicate more memory to the active file list, so we can actually cache
more of the frequently used file pages in memory, and not have them
pushed out by streaming writes, once-used streaming file reads, etc.

This can help things like database workloads, where only half the page
cache can currently be used to cache the database working set.  This
patch automatically increases that fraction on larger systems, using the
same ratio that has already been used for anonymous memory.

[hannes@cmpxchg.org: cgroup-awareness]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  25 -----------
 mm/page_alloc.c            |  44 -------------------
 mm/vmscan.c                | 104 ++++++++++++++++++---------------------------
 3 files changed, 42 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 94da96738df3..a805474df4ab 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	return mz->lru_size[lru];
 }
 
-static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	unsigned long inactive_ratio;
-	unsigned long inactive;
-	unsigned long active;
-	unsigned long gb;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
-
-	return inactive * inactive_ratio < active;
-}
-
 void mem_cgroup_handle_over_high(void);
 
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return true;
 }
 
-static inline bool
-mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	return true;
-}
-
 static inline unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c469c1dfb8b..edbdf56b3c9b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6670,49 +6670,6 @@ void setup_per_zone_wmarks(void)
 	mutex_unlock(&zonelists_mutex);
 }
 
-/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total     target    max
- * memory    ratio     inactive anon
- * -------------------------------------
- *   10MB       1         5MB
- *  100MB       1        50MB
- *    1GB       3       250MB
- *   10GB      10       0.9GB
- *  100GB      31         3GB
- *    1TB     101        10GB
- *   10TB     320        32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
-	unsigned int gb, ratio;
-
-	/* Zone size in gigabytes */
-	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
-	if (gb)
-		ratio = int_sqrt(10 * gb);
-	else
-		ratio = 1;
-
-	zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
-	struct zone *zone;
-
-	for_each_zone(zone)
-		calculate_zone_inactive_ratio(zone);
-}
-
 /*
  * Initialise min_free_kbytes.
  *
@@ -6758,7 +6715,6 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
-	setup_per_zone_inactive_ratio();
 	return 0;
 }
 core_initcall(init_per_zone_wmark_min)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcfdfc1a0942..38d6d06c955f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1862,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	free_hot_cold_page_list(&l_hold, true);
 }
 
-#ifdef CONFIG_SWAP
-static bool inactive_anon_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
-
-	active = zone_page_state(zone, NR_ACTIVE_ANON);
-	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return inactive * zone->inactive_ratio < active;
-}
-
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
  *
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static bool inactive_anon_is_low(struct lruvec *lruvec)
-{
-	/*
-	 * If we don't have swap space, anonymous page deactivation
-	 * is pointless.
-	 */
-	if (!total_swap_pages)
-		return false;
-
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_inactive_anon_is_low(lruvec);
-
-	return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline bool inactive_anon_is_low(struct lruvec *lruvec)
-{
-	return false;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
  *
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
  *
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
  *
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
+ * total     target    max
+ * memory    ratio     inactive
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
  */
-static bool inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
 {
+	unsigned long inactive_ratio;
 	unsigned long inactive;
 	unsigned long active;
+	unsigned long gb;
 
-	inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	/*
+	 * If we don't have swap space, anonymous page deactivation
+	 * is pointless.
+	 */
+	if (!file && !total_swap_pages)
+		return false;
 
-	return active > inactive;
-}
+	inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+	active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
-static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
-	if (is_file_lru(lru))
-		return inactive_file_is_low(lruvec);
+	gb = (inactive + active) >> (30 - PAGE_SHIFT);
+	if (gb)
+		inactive_ratio = int_sqrt(10 * gb);
 	else
-		return inactive_anon_is_low(lruvec);
+		inactive_ratio = 1;
+
+	return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, lru))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru)))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -2059,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_file_is_low(lruvec) &&
+	if (!inactive_list_is_low(lruvec, true) &&
 	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2301,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(lruvec))
+	if (inactive_list_is_low(lruvec, false))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 
@@ -2962,7 +2942,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(lruvec))
+		if (inactive_list_is_low(lruvec, false))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
-- 
cgit v1.2.3


From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:38 -0700
Subject: mm, compaction: change COMPACT_ constants into enum

Compaction code is doing weird dances between COMPACT_FOO -> int ->
unsigned long

But there doesn't seem to be any reason for that.  All functions which
return/use one of those constants are not expecting any other value so it
really makes sense to define an enum for them and make it clear that no
other values are expected.

This is a pure cleanup and shouldn't introduce any functional changes.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------
 mm/compaction.c            | 27 ++++++++++++++-------------
 mm/page_alloc.c            |  2 +-
 3 files changed, 42 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 242b660f64e6..706cbf00e919 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,21 +2,29 @@
 #define _LINUX_COMPACTION_H
 
 /* Return values for compact_zone() and try_to_compact_pages() */
-/* compaction didn't start as it was deferred due to past failures */
-#define COMPACT_DEFERRED	0
-/* compaction didn't start as it was not possible or direct reclaim was more suitable */
-#define COMPACT_SKIPPED		1
-/* compaction should continue to another pageblock */
-#define COMPACT_CONTINUE	2
-/* direct compaction partially compacted a zone and there are suitable pages */
-#define COMPACT_PARTIAL		3
-/* The full zone was compacted */
-#define COMPACT_COMPLETE	4
-/* For more detailed tracepoint output */
-#define COMPACT_NO_SUITABLE_PAGE	5
-#define COMPACT_NOT_SUITABLE_ZONE	6
-#define COMPACT_CONTENDED		7
 /* When adding new states, please adjust include/trace/events/compaction.h */
+enum compact_result {
+	/* compaction didn't start as it was deferred due to past failures */
+	COMPACT_DEFERRED,
+	/*
+	 * compaction didn't start as it was not possible or direct reclaim
+	 * was more suitable
+	 */
+	COMPACT_SKIPPED,
+	/* compaction should continue to another pageblock */
+	COMPACT_CONTINUE,
+	/*
+	 * direct compaction partially compacted a zone and there are suitable
+	 * pages
+	 */
+	COMPACT_PARTIAL,
+	/* The full zone was compacted */
+	COMPACT_COMPLETE,
+	/* For more detailed tracepoint output */
+	COMPACT_NO_SUITABLE_PAGE,
+	COMPACT_NOT_SUITABLE_ZONE,
+	COMPACT_CONTENDED,
+};
 
 /* Used to signal whether compaction detected need_sched() or lock contention */
 /* No contention detected */
@@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
-extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
+			unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
-extern unsigned long compaction_suitable(struct zone *zone, int order,
+extern enum compact_result compaction_suitable(struct zone *zone, int order,
 		unsigned int alloc_flags, int classzone_idx);
 
 extern void defer_compaction(struct zone *zone, int order);
@@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 
 #else
-static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
+static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 			unsigned int order, int alloc_flags,
 			const struct alloc_context *ac,
 			enum migrate_mode mode, int *contended)
@@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
 {
 }
 
-static inline unsigned long compaction_suitable(struct zone *zone, int order,
+static inline enum compact_result compaction_suitable(struct zone *zone, int order,
 					int alloc_flags, int classzone_idx)
 {
 	return COMPACT_SKIPPED;
diff --git a/mm/compaction.c b/mm/compaction.c
index eda3c2244f30..e721d252c5d2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order)
 	return order == -1;
 }
 
-static int __compact_finished(struct zone *zone, struct compact_control *cc,
+static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
 			    const int migratetype)
 {
 	unsigned int order;
@@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 	return COMPACT_NO_SUITABLE_PAGE;
 }
 
-static int compact_finished(struct zone *zone, struct compact_control *cc,
-			    const int migratetype)
+static enum compact_result compact_finished(struct zone *zone,
+			struct compact_control *cc,
+			const int migratetype)
 {
 	int ret;
 
@@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
  *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
  *   COMPACT_CONTINUE - If compaction should run now
  */
-static unsigned long __compaction_suitable(struct zone *zone, int order,
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx)
 {
@@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
 	return COMPACT_CONTINUE;
 }
 
-unsigned long compaction_suitable(struct zone *zone, int order,
+enum compact_result compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx)
 {
-	unsigned long ret;
+	enum compact_result ret;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
 	trace_mm_compaction_suitable(zone, order, ret);
@@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order,
 	return ret;
 }
 
-static int compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
 {
-	int ret;
+	enum compact_result ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
 	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
@@ -1530,11 +1531,11 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone, int order,
+static enum compact_result compact_zone_order(struct zone *zone, int order,
 		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
 		unsigned int alloc_flags, int classzone_idx)
 {
-	unsigned long ret;
+	enum compact_result ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
@@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500;
  *
  * This is the main entry point for direct page compaction.
  */
-unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended)
 {
@@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	int may_perform_io = gfp_mask & __GFP_IO;
 	struct zoneref *z;
 	struct zone *zone;
-	int rc = COMPACT_DEFERRED;
+	enum compact_result rc = COMPACT_DEFERRED;
 	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
 
 	*contended = COMPACT_CONTENDED_NONE;
@@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
-		int status;
+		enum compact_result status;
 		int zone_contended;
 
 		if (compaction_deferred(zone, order))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index edbdf56b3c9b..ed62c4b90598 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
-	unsigned long compact_result;
+	enum compact_result compact_result;
 	struct page *page;
 
 	if (!order)
-- 
cgit v1.2.3


From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:44 -0700
Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED

try_to_compact_pages() can currently return COMPACT_SKIPPED even when
the compaction is defered for some zone just because zone DMA is skipped
in 99% of cases due to watermark checks.  This makes COMPACT_DEFERRED
basically unusable for the page allocator as a feedback mechanism.

Make sure we distinguish those two states properly and switch their
ordering in the enum.  This would mean that the COMPACT_SKIPPED will be
returned only when all eligible zones are skipped.

As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath
will be more precise and we would bail out rather than reclaim.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h        | 7 +++++--
 include/trace/events/compaction.h | 2 +-
 mm/compaction.c                   | 8 +++++---
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 706cbf00e919..11f228712ed5 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -4,13 +4,16 @@
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
-	/* compaction didn't start as it was deferred due to past failures */
-	COMPACT_DEFERRED,
 	/*
 	 * compaction didn't start as it was not possible or direct reclaim
 	 * was more suitable
 	 */
 	COMPACT_SKIPPED,
+	/* compaction didn't start as it was deferred due to past failures */
+	COMPACT_DEFERRED,
+	/* compaction not active last round */
+	COMPACT_INACTIVE = COMPACT_DEFERRED,
+
 	/* compaction should continue to another pageblock */
 	COMPACT_CONTINUE,
 	/*
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index e215bf68f521..6ba16c86d7db 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -10,8 +10,8 @@
 #include <trace/events/mmflags.h>
 
 #define COMPACTION_STATUS					\
-	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_SKIPPED,		"skipped")		\
+	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_CONTINUE,		"continue")		\
 	EM( COMPACT_PARTIAL,		"partial")		\
 	EM( COMPACT_COMPLETE,		"complete")		\
diff --git a/mm/compaction.c b/mm/compaction.c
index 455ecd87f48d..b2b94474dd28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	int may_perform_io = gfp_mask & __GFP_IO;
 	struct zoneref *z;
 	struct zone *zone;
-	enum compact_result rc = COMPACT_DEFERRED;
+	enum compact_result rc = COMPACT_SKIPPED;
 	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
 
 	*contended = COMPACT_CONTENDED_NONE;
@@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 		enum compact_result status;
 		int zone_contended;
 
-		if (compaction_deferred(zone, order))
+		if (compaction_deferred(zone, order)) {
+			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
 			continue;
+		}
 
 		status = compact_zone_order(zone, order, gfp_mask, mode,
 				&zone_contended, alloc_flags,
@@ -1667,7 +1669,7 @@ break_loop:
 	 * If at least one zone wasn't deferred or skipped, we report if all
 	 * zones that were tried were lock contended.
 	 */
-	if (rc > COMPACT_SKIPPED && all_zones_contended)
+	if (rc > COMPACT_INACTIVE && all_zones_contended)
 		*contended = COMPACT_CONTENDED_LOCK;
 
 	return rc;
-- 
cgit v1.2.3


From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:47 -0700
Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE

COMPACT_COMPLETE now means that compaction and free scanner met.  This
is not very useful information if somebody just wants to use this
feedback and make any decisions based on that.  The current caller might
be a poor guy who just happened to scan tiny portion of the zone and
that could be the reason no suitable pages were compacted.  Make sure we
distinguish the full and partial zone walks.

Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success
and be optimistic in retrying.

The existing users of COMPACT_COMPLETE are conservatively changed to use
COMPACT_PARTIAL_SKIPPED as well but some of them should be probably
reconsidered and only defer the compaction only for COMPACT_COMPLETE
with the new semantic.

This patch shouldn't introduce any functional changes.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h        | 10 +++++++++-
 include/trace/events/compaction.h |  1 +
 mm/compaction.c                   | 14 +++++++++++---
 mm/internal.h                     |  1 +
 4 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 11f228712ed5..9b37f9d3f7a8 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -21,7 +21,15 @@ enum compact_result {
 	 * pages
 	 */
 	COMPACT_PARTIAL,
-	/* The full zone was compacted */
+	/*
+	 * direct compaction has scanned part of the zone but wasn't successfull
+	 * to compact suitable pages.
+	 */
+	COMPACT_PARTIAL_SKIPPED,
+	/*
+	 * The full zone was compacted scanned but wasn't successfull to compact
+	 * suitable pages.
+	 */
 	COMPACT_COMPLETE,
 	/* For more detailed tracepoint output */
 	COMPACT_NO_SUITABLE_PAGE,
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 6ba16c86d7db..36e2d6fb1360 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -14,6 +14,7 @@
 	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_CONTINUE,		"continue")		\
 	EM( COMPACT_PARTIAL,		"partial")		\
+	EM( COMPACT_PARTIAL_SKIPPED,	"partial_skipped")	\
 	EM( COMPACT_COMPLETE,		"complete")		\
 	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
 	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
diff --git a/mm/compaction.c b/mm/compaction.c
index b2b94474dd28..4af1577adb5c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		if (cc->direct_compaction)
 			zone->compact_blockskip_flush = true;
 
-		return COMPACT_COMPLETE;
+		if (cc->whole_zone)
+			return COMPACT_COMPLETE;
+		else
+			return COMPACT_PARTIAL_SKIPPED;
 	}
 
 	if (is_via_compact_memory(cc->order))
@@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
+
+	if (cc->migrate_pfn == start_pfn)
+		cc->whole_zone = true;
+
 	cc->last_migrated_pfn = 0;
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
@@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 			goto break_loop;
 		}
 
-		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
+		if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
+					status == COMPACT_PARTIAL_SKIPPED)) {
 			/*
 			 * We think that allocation won't succeed in this zone
 			 * so we defer compaction there. If it ends up
@@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 						cc.classzone_idx, 0)) {
 			success = true;
 			compaction_defer_reset(zone, cc.order, false);
-		} else if (status == COMPACT_COMPLETE) {
+		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
 			/*
 			 * We use sync migration mode here, so we defer like
 			 * sync direct compaction does.
diff --git a/mm/internal.h b/mm/internal.h
index 3ac544f1963f..f6f3353b0868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -174,6 +174,7 @@ struct compact_control {
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
+	bool whole_zone;		/* Whole zone has been scanned */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
-- 
cgit v1.2.3


From 4f9a358c36fcdad3ea1db263ec4d484a70ad543e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:50 -0700
Subject: mm, compaction: update compaction_result ordering

compaction_result will be used as the primary feedback channel for
compaction users.  At the same time try_to_compact_pages (and
potentially others) assume a certain ordering where a more specific
feedback takes precendence.

This gets a bit awkward when we have conflicting feedback from different
zones.  E.g one returing COMPACT_COMPLETE meaning the full zone has been
scanned without any outcome while other returns with COMPACT_PARTIAL aka
made some progress.  The caller should get COMPACT_PARTIAL because that
means that the compaction still can make some progress.  The same
applies for COMPACT_PARTIAL vs COMPACT_PARTIAL_SKIPPED.

Reorder PARTIAL to be the largest one so the larger the value is the
more progress we have done.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9b37f9d3f7a8..ff39fa0a1ede 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -4,6 +4,8 @@
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
+	/* For more detailed tracepoint output - internal to compaction */
+	COMPACT_NOT_SUITABLE_ZONE,
 	/*
 	 * compaction didn't start as it was not possible or direct reclaim
 	 * was more suitable
@@ -11,30 +13,34 @@ enum compact_result {
 	COMPACT_SKIPPED,
 	/* compaction didn't start as it was deferred due to past failures */
 	COMPACT_DEFERRED,
+
 	/* compaction not active last round */
 	COMPACT_INACTIVE = COMPACT_DEFERRED,
 
+	/* For more detailed tracepoint output - internal to compaction */
+	COMPACT_NO_SUITABLE_PAGE,
 	/* compaction should continue to another pageblock */
 	COMPACT_CONTINUE,
+
 	/*
-	 * direct compaction partially compacted a zone and there are suitable
-	 * pages
+	 * The full zone was compacted scanned but wasn't successfull to compact
+	 * suitable pages.
 	 */
-	COMPACT_PARTIAL,
+	COMPACT_COMPLETE,
 	/*
 	 * direct compaction has scanned part of the zone but wasn't successfull
 	 * to compact suitable pages.
 	 */
 	COMPACT_PARTIAL_SKIPPED,
+
+	/* compaction terminated prematurely due to lock contentions */
+	COMPACT_CONTENDED,
+
 	/*
-	 * The full zone was compacted scanned but wasn't successfull to compact
-	 * suitable pages.
+	 * direct compaction partially compacted a zone and there might be
+	 * suitable pages
 	 */
-	COMPACT_COMPLETE,
-	/* For more detailed tracepoint output */
-	COMPACT_NO_SUITABLE_PAGE,
-	COMPACT_NOT_SUITABLE_ZONE,
-	COMPACT_CONTENDED,
+	COMPACT_PARTIAL,
 };
 
 /* Used to signal whether compaction detected need_sched() or lock contention */
-- 
cgit v1.2.3


From cab1802b5f0dddea30547a7451fda8c7e4c593f0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:56 -0700
Subject: mm, compaction: abstract compaction feedback to helpers

Compaction can provide a wild variation of feedback to the caller.  Many
of them are implementation specific and the caller of the compaction
(especially the page allocator) shouldn't be bound to specifics of the
current implementation.

This patch abstracts the feedback into three basic types:
	- compaction_made_progress - compaction was active and made some
	  progress.
	- compaction_failed - compaction failed and further attempts to
	  invoke it would most probably fail and therefore it is not
	  worth retrying
	- compaction_withdrawn - compaction wasn't invoked for an
          implementation specific reasons. In the current implementation
          it means that the compaction was deferred, contended or the
          page scanners met too early without any progress. Retrying is
          still worthwhile.

[vbabka@suse.cz: do not change thp back off behavior]
[akpm@linux-foundation.org: fix typo in comment, per Hillf]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index ff39fa0a1ede..8d8c916fe67a 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -78,6 +78,70 @@ extern void compaction_defer_reset(struct zone *zone, int order,
 				bool alloc_success);
 extern bool compaction_restarting(struct zone *zone, int order);
 
+/* Compaction has made some progress and retrying makes sense */
+static inline bool compaction_made_progress(enum compact_result result)
+{
+	/*
+	 * Even though this might sound confusing this in fact tells us
+	 * that the compaction successfully isolated and migrated some
+	 * pageblocks.
+	 */
+	if (result == COMPACT_PARTIAL)
+		return true;
+
+	return false;
+}
+
+/* Compaction has failed and it doesn't make much sense to keep retrying. */
+static inline bool compaction_failed(enum compact_result result)
+{
+	/* All zones were scanned completely and still not result. */
+	if (result == COMPACT_COMPLETE)
+		return true;
+
+	return false;
+}
+
+/*
+ * Compaction  has backed off for some reason. It might be throttling or
+ * lock contention. Retrying is still worthwhile.
+ */
+static inline bool compaction_withdrawn(enum compact_result result)
+{
+	/*
+	 * Compaction backed off due to watermark checks for order-0
+	 * so the regular reclaim has to try harder and reclaim something.
+	 */
+	if (result == COMPACT_SKIPPED)
+		return true;
+
+	/*
+	 * If compaction is deferred for high-order allocations, it is
+	 * because sync compaction recently failed. If this is the case
+	 * and the caller requested a THP allocation, we do not want
+	 * to heavily disrupt the system, so we fail the allocation
+	 * instead of entering direct reclaim.
+	 */
+	if (result == COMPACT_DEFERRED)
+		return true;
+
+	/*
+	 * If compaction in async mode encounters contention or blocks higher
+	 * priority task we back off early rather than cause stalls.
+	 */
+	if (result == COMPACT_CONTENDED)
+		return true;
+
+	/*
+	 * Page scanners have met but we haven't scanned full zones so this
+	 * is a back off in fact.
+	 */
+	if (result == COMPACT_PARTIAL_SKIPPED)
+		return true;
+
+	return false;
+}
+
 extern int kcompactd_run(int nid);
 extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
@@ -114,6 +178,21 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 	return true;
 }
 
+static inline bool compaction_made_progress(enum compact_result result)
+{
+	return false;
+}
+
+static inline bool compaction_failed(enum compact_result result)
+{
+	return false;
+}
+
+static inline bool compaction_withdrawn(enum compact_result result)
+{
+	return true;
+}
+
 static inline int kcompactd_run(int nid)
 {
 	return 0;
-- 
cgit v1.2.3


From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:00 -0700
Subject: mm, oom: rework oom detection

__alloc_pages_slowpath has traditionally relied on the direct reclaim
and did_some_progress as an indicator that it makes sense to retry
allocation rather than declaring OOM.  shrink_zones had to rely on
zone_reclaimable if shrink_zone didn't make any progress to prevent from
a premature OOM killer invocation - the LRU might be full of dirty or
writeback pages and direct reclaim cannot clean those up.

zone_reclaimable allows to rescan the reclaimable lists several times
and restart if a page is freed.  This is really subtle behavior and it
might lead to a livelock when a single freed page keeps allocator
looping but the current task will not be able to allocate that single
page.  OOM killer would be more appropriate than looping without any
progress for unbounded amount of time.

This patch changes OOM detection logic and pulls it out from shrink_zone
which is too low to be appropriate for any high level decisions such as
OOM which is per zonelist property.  It is __alloc_pages_slowpath which
knows how many attempts have been done and what was the progress so far
therefore it is more appropriate to implement this logic.

The new heuristic is implemented in should_reclaim_retry helper called
from __alloc_pages_slowpath.  It tries to be more deterministic and
easier to follow.  It builds on an assumption that retrying makes sense
only if the currently reclaimable memory + free pages would allow the
current allocation request to succeed (as per __zone_watermark_ok) at
least for one zone in the usable zonelist.

This alone wouldn't be sufficient, though, because the writeback might
get stuck and reclaimable pages might be pinned for a really long time
or even depend on the current allocation context.  Therefore there is a
backoff mechanism implemented which reduces the reclaim target after
each reclaim round without any progress.  This means that we should
eventually converge to only NR_FREE_PAGES as the target and fail on the
wmark check and proceed to OOM.  The backoff is simple and linear with
1/16 of the reclaimable pages for each round without any progress.  We
are optimistic and reset counter for successful reclaim rounds.

Costly high order pages mostly preserve their semantic and those without
__GFP_REPEAT fail right away while those which have the flag set will
back off after the amount of reclaimable pages reaches equivalent of the
requested order.  The only difference is that if there was no progress
during the reclaim we rely on zone watermark check.  This is more
logical thing to do than previous 1<<order attempts which were a result
of zone_reclaimable faking the progress.

[vdavydov@virtuozzo.com: check classzone_idx for shrink_zone]
[hannes@cmpxchg.org: separate the heuristic into should_reclaim_retry]
[rientjes@google.com: use zone_page_state_snapshot for NR_FREE_PAGES]
[rientjes@google.com: shrink_zones doesn't need to return anything]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   1 +
 mm/page_alloc.c      | 100 ++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/vmscan.c          |  25 +++----------
 3 files changed, 97 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ad220359f1b0..0af2bb2028fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 						struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8bcc10616fab..fa39efc3a692 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
 	return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round), pages_reclaimed (cumulative number of reclaimed
+ * pages) and no_progress_loops (number of reclaim rounds without any progress
+ * in a row) is considered as well as the reclaimable pages on the applicable
+ * zone list (with a backoff mechanism which is a function of no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+		     struct alloc_context *ac, int alloc_flags,
+		     bool did_some_progress, unsigned long pages_reclaimed,
+		     int no_progress_loops)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure we converge to OOM if we cannot make any progress
+	 * several times in the row.
+	 */
+	if (no_progress_loops > MAX_RECLAIM_RETRIES)
+		return false;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		if (pages_reclaimed >= (1<<order))
+			return false;
+
+		if (did_some_progress)
+			return true;
+	}
+
+	/*
+	 * Keep reclaiming pages while there is a chance this will lead somewhere.
+	 * If none of the target zones can satisfy our allocation request even
+	 * if all reclaimable pages are considered then we are screwed and have
+	 * to go OOM.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		unsigned long available;
+
+		available = zone_reclaimable_pages(zone);
+		available -= DIV_ROUND_UP(no_progress_loops * available,
+					  MAX_RECLAIM_RETRIES);
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+		/*
+		 * Would the allocation succeed if we reclaimed the whole
+		 * available?
+		 */
+		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+				ac->high_zoneidx, alloc_flags, available)) {
+			/* Wait for some write requests to complete then retry */
+			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
@@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	enum compact_result compact_result;
+	int no_progress_loops = 0;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3525,23 +3597,35 @@ retry:
 	if (gfp_mask & __GFP_NORETRY)
 		goto noretry;
 
-	/* Keep reclaiming pages as long as there is reasonable progress */
-	pages_reclaimed += did_some_progress;
-	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
-	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
-		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
-		goto retry;
+	/*
+	 * Do not retry costly high order allocations unless they are
+	 * __GFP_REPEAT
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+		goto noretry;
+
+	if (did_some_progress) {
+		no_progress_loops = 0;
+		pages_reclaimed += did_some_progress;
+	} else {
+		no_progress_loops++;
 	}
 
+	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+				 did_some_progress > 0, pages_reclaimed,
+				 no_progress_loops))
+		goto retry;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
 		goto got_pg;
 
 	/* Retry as long as the OOM killer is making progress */
-	if (did_some_progress)
+	if (did_some_progress) {
+		no_progress_loops = 0;
 		goto retry;
+	}
 
 noretry:
 	/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a386454c015a..c4a2f4512fca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
-static unsigned long zone_reclaimable_pages(struct zone *zone)
+unsigned long zone_reclaimable_pages(struct zone *zone)
 {
 	unsigned long nr;
 
@@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
- *
- * Returns true if a zone was reclaimable.
  */
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
@@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
-	bool reclaimable = false;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						&nr_soft_scanned);
 			sc->nr_reclaimed += nr_soft_reclaimed;
 			sc->nr_scanned += nr_soft_scanned;
-			if (nr_soft_reclaimed)
-				reclaimable = true;
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
-			reclaimable = true;
-
-		if (global_reclaim(sc) &&
-		    !reclaimable && zone_reclaimable(zone))
-			reclaimable = true;
+		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 	}
 
 	/*
@@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	 * promoted it to __GFP_HIGHMEM.
 	 */
 	sc->gfp_mask = orig_mask;
-
-	return reclaimable;
 }
 
 /*
@@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
-	bool zones_reclaimable;
 retry:
 	delayacct_freepages_start();
 
@@ -2638,7 +2625,7 @@ retry:
 		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
 				sc->priority);
 		sc->nr_scanned = 0;
-		zones_reclaimable = shrink_zones(zonelist, sc);
+		shrink_zones(zonelist, sc);
 
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2685,10 +2672,6 @@ retry:
 		goto retry;
 	}
 
-	/* Any of the zones still reclaimable?  Don't OOM. */
-	if (zones_reclaimable)
-		return 1;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:12 -0700
Subject: mm, oom, compaction: prevent from should_compact_retry looping for
 ever for costly orders

"mm: consider compaction feedback also for costly allocation" has
removed the upper bound for the reclaim/compaction retries based on the
number of reclaimed pages for costly orders.  While this is desirable
the patch did miss a mis interaction between reclaim, compaction and the
retry logic.  The direct reclaim tries to get zones over min watermark
while compaction backs off and returns COMPACT_SKIPPED when all zones
are below low watermark + 1<<order gap.  If we are getting really close
to OOM then __compaction_suitable can keep returning COMPACT_SKIPPED a
high order request (e.g.  hugetlb order-9) while the reclaim is not able
to release enough pages to get us over low watermark.  The reclaim is
still able to make some progress (usually trashing over few remaining
pages) so we are not able to break out from the loop.

I have seen this happening with the same test described in "mm: consider
compaction feedback also for costly allocation" on a swapless system.
The original problem got resolved by "vmscan: consider classzone_idx in
compaction_ready" but it shows how things might go wrong when we
approach the oom event horizont.

The reason why compaction requires being over low rather than min
watermark is not clear to me.  This check was there essentially since
56de7263fcf3 ("mm: compaction: direct compact when a high-order
allocation fails").  It is clearly an implementation detail though and
we shouldn't pull it into the generic retry logic while we should be
able to cope with such eventuality.  The only place in
should_compact_retry where we retry without any upper bound is for
compaction_withdrawn() case.

Introduce compaction_zonelist_suitable function which checks the given
zonelist and returns true only if there is at least one zone which would
would unblock __compaction_suitable if more memory got reclaimed.  In
this implementation it checks __compaction_suitable with NR_FREE_PAGES
plus part of the reclaimable memory as the target for the watermark
check.  The reclaimable memory is reduced linearly by the allocation
order.  The idea is that we do not want to reclaim all the remaining
memory for a single allocation request just unblock
__compaction_suitable which doesn't guarantee we will make a further
progress.

The new helper is then used if compaction_withdrawn() feedback was
provided so we do not retry if there is no outlook for a further
progress.  !costly requests shouldn't be affected much - e.g.  order-2
pages would require to have at least 64kB on the reclaimable LRUs while
order-9 would need at least 32M which should be enough to not lock up.

[vbabka@suse.cz: fix classzone_idx vs. high_zoneidx usage in compaction_zonelist_suitable]
[akpm@linux-foundation.org: fix it for Mel's mm-page_alloc-remove-field-from-alloc_context.patch]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  4 ++++
 include/linux/mmzone.h     |  3 +++
 mm/compaction.c            | 42 +++++++++++++++++++++++++++++++++++++++---
 mm/page_alloc.c            | 23 +++++++++++++----------
 4 files changed, 59 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 8d8c916fe67a..a58c852a268f 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result)
 	return false;
 }
 
+
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+					int alloc_flags);
+
 extern int kcompactd_run(int nid);
 extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60db2096fd8..8dd0333b01dc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone)
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+			 int classzone_idx, unsigned int alloc_flags,
+			 long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx,
 		unsigned int alloc_flags);
diff --git a/mm/compaction.c b/mm/compaction.c
index 4af1577adb5c..d8a20fcf8678 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone,
  */
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
-					int classzone_idx)
+					int classzone_idx,
+					unsigned long wmark_target)
 {
 	int fragindex;
 	unsigned long watermark;
@@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 	 * allocated and for a short time, the footprint is higher
 	 */
 	watermark += (2UL << order);
-	if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
+	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+				 alloc_flags, wmark_target))
 		return COMPACT_SKIPPED;
 
 	/*
@@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 {
 	enum compact_result ret;
 
-	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+				    zone_page_state(zone, NR_FREE_PAGES));
 	trace_mm_compaction_suitable(zone, order, ret);
 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
 		ret = COMPACT_SKIPPED;
@@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 	return ret;
 }
 
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+		int alloc_flags)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure at least one zone would pass __compaction_suitable if we continue
+	 * retrying the reclaim.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		unsigned long available;
+		enum compact_result compact_result;
+
+		/*
+		 * Do not consider all the reclaimable memory because we do not
+		 * want to trash just for a single high order allocation which
+		 * is even not guaranteed to appear even if __compaction_suitable
+		 * is happy about the watermark check.
+		 */
+		available = zone_reclaimable_pages(zone) / order;
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+		compact_result = __compaction_suitable(zone, order, alloc_flags,
+				ac_classzone_idx(ac), available);
+		if (compact_result != COMPACT_SKIPPED &&
+				compact_result != COMPACT_NOT_SUITABLE_ZONE)
+			return true;
+	}
+
+	return false;
+}
+
 static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	enum compact_result ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dea406a62e3d..089f760ce64a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * one free page of a suitable size. Checking now avoids taking the zone lock
  * to check in the allocation paths if no pages are free.
  */
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-			unsigned long mark, int classzone_idx,
-			unsigned int alloc_flags,
-			long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+			 int classzone_idx, unsigned int alloc_flags,
+			 long free_pages)
 {
 	long min = mark;
 	int o;
@@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 
 static inline bool
-should_compact_retry(unsigned int order, enum compact_result compact_result,
-		     enum migrate_mode *migrate_mode,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+		     enum compact_result compact_result, enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
 	int max_retries = MAX_COMPACT_RETRIES;
@@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result,
 	/*
 	 * make sure the compaction wasn't deferred or didn't bail out early
 	 * due to locks contention before we declare that we should give up.
+	 * But do not retry if the given zonelist is not suitable for
+	 * compaction.
 	 */
 	if (compaction_withdrawn(compact_result))
-		return true;
+		return compaction_zonelist_suitable(ac, order, alloc_flags);
 
 	/*
 	 * !costly requests are much more important than __GFP_REPEAT
@@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 
 static inline bool
-should_compact_retry(unsigned int order, enum compact_result compact_result,
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+		     enum compact_result compact_result,
 		     enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
@@ -3706,8 +3708,9 @@ retry:
 	 * of free memory (see __compaction_suitable)
 	 */
 	if (did_some_progress > 0 &&
-			should_compact_retry(order, compact_result,
-				&migration_mode, compaction_retries))
+			should_compact_retry(ac, order, alloc_flags,
+				compact_result, &migration_mode,
+				compaction_retries))
 		goto retry;
 
 	/* Reclaim has failed us, start killing things */
-- 
cgit v1.2.3


From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:18 -0700
Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully

Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to
unmap the address space") not only clears TIF_MEMDIE for oom reaped task
but also set OOM_SCORE_ADJ_MIN for the target task to hide it from the
oom killer.  This works in simple cases but it is not sufficient for
(unlikely) cases where the mm is shared between independent processes
(as they do not share signal struct).  If the mm had only small amount
of memory which could be reaped then another task sharing the mm could
be selected and that wouldn't help to move out from the oom situation.

Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same
as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set.  Set the
flag after __oom_reap_task is done with a task.  This will force the
select_bad_process() to ignore all already oom reaped tasks as well as
no such task is sacrificed for its parent.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 1 +
 mm/oom_kill.c         | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31bd0d97d178..40eabf176ce2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_REAPED		21	/* mm has been already reaped */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 415f7eb913fa..c0376efa79ec 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
+	/*
+	 * Do not even consider tasks which are explicitly marked oom
+	 * unkillable or have been already oom reaped.
+	 */
 	adj = (long)p->signal->oom_score_adj;
-	if (adj == OOM_SCORE_ADJ_MIN) {
+	if (adj == OOM_SCORE_ADJ_MIN ||
+			test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
 		task_unlock(p);
 		return 0;
 	}
@@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * This task can be safely ignored because we cannot do much more
 	 * to release its memory.
 	 */
-	tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+	set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
 	mmput(mm);
 	return ret;
-- 
cgit v1.2.3


From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:21 -0700
Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper
 context

Tetsuo has properly noted that mmput slow path might get blocked waiting
for another party (e.g.  exit_aio waits for an IO).  If that happens the
oom_reaper would be put out of the way and will not be able to process
next oom victim.  We should strive for making this context as reliable
and independent on other subsystems as much as possible.

Introduce mmput_async which will perform the slow path from an async
(WQ) context.  This will delay the operation but that shouldn't be a
problem because the oom_reaper has reclaimed the victim's address space
for most cases as much as possible and the remaining context shouldn't
bind too much memory anymore.  The only exception is when mmap_sem
trylock has failed which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  5 +++++
 kernel/fork.c            | 50 +++++++++++++++++++++++++++++++++---------------
 mm/oom_kill.c            |  8 ++++++--
 4 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1fda9c99ef95..d553855503e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -513,6 +514,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct work_struct async_put_work;
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40eabf176ce2..479e3cade7e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from the async kontext. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e8451527cbe..8fbed7194af1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
@@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
 	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
 	}
 }
-EXPORT_SYMBOL_GPL(mmput);
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c0376efa79ec..c0e37dd1422f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-
 static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
@@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
-	mmput(mm);
+	/*
+	 * Drop our reference but make sure the mmput slow path is called from a
+	 * different context because we shouldn't risk we get stuck there and
+	 * put the oom_reaper out of the way.
+	 */
+	mmput_async(mm);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 98748bd722005be9de2662bd4f7e41ad8148bdbd Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:24 -0700
Subject: oom: consider multi-threaded tasks in task_will_free_mem

task_will_free_mem is a misnomer for a more complex PF_EXITING test for
early break out from the oom killer because it is believed that such a
task would release its memory shortly and so we do not have to select an
oom victim and perform a disruptive action.

Currently we make sure that the given task is not participating in the
core dumping because it might get blocked for a long time - see commit
d003f371b270 ("oom: don't assume that a coredumping thread will exit
soon").

The check can still do better though.  We shouldn't consider the task
unless the whole thread group is going down.  This is rather unlikely
but not impossible.  A single exiting thread would surely leave all the
address space behind.  If we are really unlucky it might get stuck on
the exit path and keep its TIF_MEMDIE and so block the oom killer.

Link: http://lkml.kernel.org/r/1460452756-15491-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 83b9c39bd8b7..d3f533f2f481 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -110,13 +110,24 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
 static inline bool task_will_free_mem(struct task_struct *task)
 {
+	struct signal_struct *sig = task->signal;
+
 	/*
 	 * A coredumping process may sleep for an extended period in exit_mm(),
 	 * so the oom killer cannot assume that the process will promptly exit
 	 * and release memory.
 	 */
-	return (task->flags & PF_EXITING) &&
-		!(task->signal->flags & SIGNAL_GROUP_COREDUMP);
+	if (sig->flags & SIGNAL_GROUP_COREDUMP)
+		return false;
+
+	if (!(task->flags & PF_EXITING))
+		return false;
+
+	/* Make sure that the whole thread group is going down */
+	if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
+		return false;
+
+	return true;
 }
 
 /* sysctls */
-- 
cgit v1.2.3


From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 20 May 2016 16:57:27 -0700
Subject: mm,oom: speed up select_bad_process() loop

Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel
panics"), select_bad_process() is using for_each_process_thread().

Since oom_unkillable_task() scans all threads in the caller's thread
group and oom_task_origin() scans signal_struct of the caller's thread
group, we don't need to call oom_unkillable_task() and oom_task_origin()
on each thread.  Also, since !mm test will be done later at
oom_badness(), we don't need to do !mm test on each thread.  Therefore,
we only need to do TIF_MEMDIE test on each thread.

Although the original code was correct it was quite inefficient because
each thread group was scanned num_threads times which can be a lot
especially with processes with many threads.  Even though the OOM is
extremely cold path it is always good to be as effective as possible
when we are inside rcu_read_lock() - aka unpreemptible context.

If we track number of TIF_MEMDIE threads inside signal_struct, we don't
need to do TIF_MEMDIE test on each thread.  This will allow
select_bad_process() to use for_each_process().

This patch adds a counter to signal_struct for tracking how many
TIF_MEMDIE threads are in a given thread group, and check it at
oom_scan_process_thread() so that select_bad_process() can use
for_each_process() rather than for_each_process_thread().

[mhocko@suse.com: do not blow the signal_struct size]
  Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 17 ++++++-----------
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 479e3cade7e9..01fe1bb68754 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -669,6 +669,7 @@ struct signal_struct {
 	atomic_t		sigcnt;
 	atomic_t		live;
 	int			nr_threads;
+	atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */
 	struct list_head	thread_head;
 
 	wait_queue_head_t	wait_chldexit;	/* for wait4() */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c0e37dd1422f..5bb2f7698ad7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * This task already has access to memory reserves and is being killed.
 	 * Don't allow any other task to have access to the reserves.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (!is_sysrq_oom(oc))
-			return OOM_SCAN_ABORT;
-	}
-	if (!task->mm)
-		return OOM_SCAN_CONTINUE;
+	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
+		return OOM_SCAN_ABORT;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 static struct task_struct *select_bad_process(struct oom_control *oc,
 		unsigned int *ppoints, unsigned long totalpages)
 {
-	struct task_struct *g, *p;
+	struct task_struct *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
 
 	rcu_read_lock();
-	for_each_process_thread(g, p) {
+	for_each_process(p) {
 		unsigned int points;
 
 		switch (oom_scan_process_thread(oc, p, totalpages)) {
@@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
 		points = oom_badness(p, NULL, oc->nodemask, totalpages);
 		if (!points || points < chosen_points)
 			continue;
-		/* Prefer thread group leaders for display purposes */
-		if (points == chosen_points && thread_group_leader(chosen))
-			continue;
 
 		chosen = p;
 		chosen_points = points;
@@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk)
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_inc(&tsk->signal->oom_victims);
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk)
 {
 	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_dec(&tsk->signal->oom_victims);
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
-- 
cgit v1.2.3


From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 20 May 2016 16:57:38 -0700
Subject: mm/vmalloc: keep a separate lazy-free list

When mixing lots of vmallocs and set_memory_*() (which calls
vm_unmap_aliases()) I encountered situations where the performance
degraded severely due to the walking of the entire vmap_area list each
invocation.

One simple improvement is to add the lazily freed vmap_area to a
separate lockless free list, such that we then avoid having to walk the
full list on each purge.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Roman Pen <r.peniaev@gmail.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Roman Pen <r.peniaev@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  3 ++-
 mm/vmalloc.c            | 39 +++++++++++++++++++--------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index d1f1d338af20..957adb741b6f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <asm/page.h>		/* pgprot_t */
 #include <linux/rbtree.h>
 
@@ -44,7 +45,7 @@ struct vmap_area {
 	unsigned long flags;
 	struct rb_node rb_node;         /* address sorted rbtree */
 	struct list_head list;          /* address sorted list */
-	struct list_head purge_list;    /* "lazy purge" list */
+	struct llist_node purge_list;    /* "lazy purge" list */
 	struct vm_struct *vm;
 	struct rcu_head rcu_head;
 };
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae7d20b447ff..6e3291882739 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 /*** Global kva allocator ***/
 
-#define VM_LAZY_FREE	0x01
-#define VM_LAZY_FREEING	0x02
 #define VM_VM_AREA	0x04
 
 static DEFINE_SPINLOCK(vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 
 /* The vmap cache globals are protected by vmap_area_lock */
@@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 					int sync, int force_flush)
 {
 	static DEFINE_SPINLOCK(purge_lock);
-	LIST_HEAD(valist);
+	struct llist_node *valist;
 	struct vmap_area *va;
 	struct vmap_area *n_va;
 	int nr = 0;
@@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 	if (sync)
 		purge_fragmented_blocks_allcpus();
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(va, &vmap_area_list, list) {
-		if (va->flags & VM_LAZY_FREE) {
-			if (va->va_start < *start)
-				*start = va->va_start;
-			if (va->va_end > *end)
-				*end = va->va_end;
-			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
-			list_add_tail(&va->purge_list, &valist);
-			va->flags |= VM_LAZY_FREEING;
-			va->flags &= ~VM_LAZY_FREE;
-		}
+	valist = llist_del_all(&vmap_purge_list);
+	llist_for_each_entry(va, valist, purge_list) {
+		if (va->va_start < *start)
+			*start = va->va_start;
+		if (va->va_end > *end)
+			*end = va->va_end;
+		nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
 	}
-	rcu_read_unlock();
 
 	if (nr)
 		atomic_sub(nr, &vmap_lazy_nr);
@@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 
 	if (nr) {
 		spin_lock(&vmap_area_lock);
-		list_for_each_entry_safe(va, n_va, &valist, purge_list)
+		llist_for_each_entry_safe(va, n_va, valist, purge_list)
 			__free_vmap_area(va);
 		spin_unlock(&vmap_area_lock);
 	}
@@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void)
  */
 static void free_vmap_area_noflush(struct vmap_area *va)
 {
-	va->flags |= VM_LAZY_FREE;
-	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
-	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+	int nr_lazy;
+
+	nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+				    &vmap_lazy_nr);
+
+	/* After this point, we may free va at any time */
+	llist_add(&va->purge_list, &vmap_purge_list);
+
+	if (unlikely(nr_lazy > lazy_max_pages()))
 		try_purge_vmap_area_lazy();
 }
 
-- 
cgit v1.2.3


From b8ca9e3a612eaf3e54c6fa136c62246a1a9aece7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 May 2016 16:57:53 -0700
Subject: mm: tighten fault_in_pages_writeable()

copy_page_to_iter_iovec() is currently the only user of
fault_in_pages_writeable(), and it definitely can use fragments from
high order pages.

Make sure fault_in_pages_writeable() is only touching two adjacent pages
at most, as claimed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fe1513ffb7bf..97354102794d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -518,33 +518,27 @@ void page_endio(struct page *page, int rw, int err);
 extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
 
 /*
- * Fault a userspace page into pagetables.  Return non-zero on a fault.
- *
- * This assumes that two userspace pages are always sufficient.
+ * Fault one or two userspace pages into pagetables.
+ * Return -EINVAL if more than two pages would be needed.
+ * Return non-zero on a fault.
  */
 static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 {
-	int ret;
+	int span, ret;
 
 	if (unlikely(size == 0))
 		return 0;
 
+	span = offset_in_page(uaddr) + size;
+	if (span > 2 * PAGE_SIZE)
+		return -EINVAL;
 	/*
 	 * Writing zeroes into userspace here is OK, because we know that if
 	 * the zero gets there, we'll be overwriting it.
 	 */
 	ret = __put_user(0, uaddr);
-	if (ret == 0) {
-		char __user *end = uaddr + size - 1;
-
-		/*
-		 * If the page was already mapped, this will get a cache miss
-		 * for sure, so try to avoid doing it.
-		 */
-		if (((unsigned long)uaddr & PAGE_MASK) !=
-				((unsigned long)end & PAGE_MASK))
-			ret = __put_user(0, end);
-	}
+	if (ret == 0 && span > PAGE_SIZE)
+		ret = __put_user(0, uaddr + size - 1);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 7fab358d90e6ba9d9cb702bee0c8a5f5c13bb6df Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen.5i5j@gmail.com>
Date: Fri, 20 May 2016 16:57:59 -0700
Subject: include/linux/hugetlb*.h: clean up code

Macro HUGETLBFS_SB is clear enough, so one statement is clearer than 3
lines statements.

Remove redundant return statements for non-return functions, which can
save lines, at least.

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h        | 4 +---
 include/linux/hugetlb_cgroup.h | 4 ----
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e44c57876e89..7ef4b635015d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -353,9 +353,7 @@ extern unsigned int default_hstate_idx;
 
 static inline struct hstate *hstate_inode(struct inode *i)
 {
-	struct hugetlbfs_sb_info *hsb;
-	hsb = HUGETLBFS_SB(i->i_sb);
-	return hsb->hstate;
+	return HUGETLBFS_SB(i->i_sb)->hstate;
 }
 
 static inline struct hstate *hstate_file(struct file *f)
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 24154c26d469..063962f6dfc6 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -93,20 +93,17 @@ hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
 			     struct hugetlb_cgroup *h_cg,
 			     struct page *page)
 {
-	return;
 }
 
 static inline void
 hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
 {
-	return;
 }
 
 static inline void
 hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 			       struct hugetlb_cgroup *h_cg)
 {
-	return;
 }
 
 static inline void hugetlb_cgroup_file_init(void)
@@ -116,7 +113,6 @@ static inline void hugetlb_cgroup_file_init(void)
 static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
 					  struct page *newhpage)
 {
-	return;
 }
 
 #endif  /* CONFIG_MEM_RES_CTLR_HUGETLB */
-- 
cgit v1.2.3


From d70c17d436b3fb9dbdae8c93bf908a6110b0cb4f Mon Sep 17 00:00:00 2001
From: Chen Gang <chengang@emindsoft.com.cn>
Date: Fri, 20 May 2016 16:58:01 -0700
Subject: include/linux/hugetlb.h: use bool instead of int for
 hugepage_migration_supported()

It is used as a pure bool function within kernel source wide.

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7ef4b635015d..c26d4638f665 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -452,12 +452,12 @@ static inline pgoff_t basepage_index(struct page *page)
 
 extern void dissolve_free_huge_pages(unsigned long start_pfn,
 				     unsigned long end_pfn);
-static inline int hugepage_migration_supported(struct hstate *h)
+static inline bool hugepage_migration_supported(struct hstate *h)
 {
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 	return huge_page_shift(h) == PMD_SHIFT;
 #else
-	return 0;
+	return false;
 #endif
 }
 
@@ -519,7 +519,7 @@ static inline pgoff_t basepage_index(struct page *page)
 	return page->index;
 }
 #define dissolve_free_huge_pages(s, e)	do {} while (0)
-#define hugepage_migration_supported(h)	0
+#define hugepage_migration_supported(h)	false
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 					   struct mm_struct *mm, pte_t *pte)
-- 
cgit v1.2.3


From 0c9ad804f178eae02a34045bb0916fa0e31623d5 Mon Sep 17 00:00:00 2001
From: Weijie Yang <weijie.yang@samsung.com>
Date: Fri, 20 May 2016 16:58:04 -0700
Subject: mm fix commmets: if SPARSEMEM, pgdata doesn't have page_ext

If SPARSEMEM, use page_ext in mem_section
if !SPARSEMEM, use page_ext in pgdata

Signed-off-by: Weijie Yang <weijie.yang@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8dd0333b01dc..02069c23486d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1063,7 +1063,7 @@ struct mem_section {
 	unsigned long *pageblock_flags;
 #ifdef CONFIG_PAGE_EXTENSION
 	/*
-	 * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use
+	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
 	 * section. (see page_ext.h about this.)
 	 */
 	struct page_ext *page_ext;
-- 
cgit v1.2.3


From d2a1a1f0a97a77d25cbada37161dc2ecdf01f93d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 20 May 2016 16:58:16 -0700
Subject: mm: use unsigned long constant for page flags

struct page->flags is unsigned long, so when shifting bits we should use
UL suffix to match it.

Found this problem after I added 64-bit CPU specific page flags and
failed to compile the kernel:

  mm/page_alloc.c: In function '__free_one_page':
  mm/page_alloc.c:672:2: error: integer overflow in expression [-Werror=overflow]

Link: http://lkml.kernel.org/r/1461971723-16187-1-git-send-email-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a61e06e5fbce..e5a32445f930 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -479,7 +479,7 @@ static inline void ClearPageCompound(struct page *page)
 }
 #endif
 
-#define PG_head_mask ((1L << PG_head))
+#define PG_head_mask ((1UL << PG_head))
 
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
@@ -670,7 +670,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 }
 
 #ifdef CONFIG_MMU
-#define __PG_MLOCKED		(1 << PG_mlocked)
+#define __PG_MLOCKED		(1UL << PG_mlocked)
 #else
 #define __PG_MLOCKED		0
 #endif
@@ -680,11 +680,11 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
  * these flags set.  It they are, there is a problem.
  */
 #define PAGE_FLAGS_CHECK_AT_FREE \
-	(1 << PG_lru	 | 1 << PG_locked    | \
-	 1 << PG_private | 1 << PG_private_2 | \
-	 1 << PG_writeback | 1 << PG_reserved | \
-	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED)
+	(1UL << PG_lru	 | 1UL << PG_locked    | \
+	 1UL << PG_private | 1UL << PG_private_2 | \
+	 1UL << PG_writeback | 1UL << PG_reserved | \
+	 1UL << PG_slab	 | 1UL << PG_swapcache | 1UL << PG_active | \
+	 1UL << PG_unevictable | __PG_MLOCKED)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -695,10 +695,10 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
  * alloc-free cycle to prevent from reusing the page.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	\
-	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+	(((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
 
 #define PAGE_FLAGS_PRIVATE				\
-	(1 << PG_private | 1 << PG_private_2)
+	(1UL << PG_private | 1UL << PG_private_2)
 /**
  * page_has_private - Determine if page has private stuff
  * @page: The page to be checked
-- 
cgit v1.2.3


From 5f527c2b3ea261bfccb7d12f9feade924cc4987c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 20 May 2016 16:58:24 -0700
Subject: mm: thp: microoptimize compound_mapcount()

compound_mapcount() is only called after PageCompound() has already been
checked by the caller, so there's no point to check it again.  Gcc may
optimize it away too because it's inline but this will remove the
runtime check for sure and add it'll add an assert instead.

Link: http://lkml.kernel.org/r/1462547040-1737-3-git-send-email-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2b97be1147ec..65d18a45b8e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -475,8 +475,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)
 
 static inline int compound_mapcount(struct page *page)
 {
-	if (!PageCompound(page))
-		return 0;
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
 	page = compound_head(page);
 	return atomic_read(compound_mapcount_ptr(page)) + 1;
 }
-- 
cgit v1.2.3


From d2005e3f41d4f9299e2df6a967c8beb5086967a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 20 May 2016 16:58:36 -0700
Subject: userfaultfd: don't pin the user memory in userfaultfd_file_create()

userfaultfd_file_create() increments mm->mm_users; this means that the
memory won't be unmapped/freed if mm owner exits/execs, and UFFDIO_COPY
after that can populate the orphaned mm more.

Change userfaultfd_file_create() and userfaultfd_ctx_put() to use
mm->mm_count to pin mm_struct.  This means that
atomic_inc_not_zero(mm->mm_users) is needed when we are going to
actually play with this memory.  Except handle_userfault() path doesn't
need this, the caller must already have a reference.

The patch adds the new trivial helper, mmget_not_zero(), it can have
more users.

Link: http://lkml.kernel.org/r/20160516172254.GA8595@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c      | 41 ++++++++++++++++++++++++++++-------------
 include/linux/sched.h |  7 ++++++-
 2 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 66cdb44616d5..2d97952e341a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
-		mmput(ctx->mm);
+		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 	}
 }
@@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 
 	ACCESS_ONCE(ctx->released) = true;
 
+	if (!mmget_not_zero(mm))
+		goto wakeup;
+
 	/*
 	 * Flush page faults out of all CPUs. NOTE: all page faults
 	 * must be retried without returning VM_FAULT_SIGBUS if
@@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
 	up_write(&mm->mmap_sem);
-
+	mmput(mm);
+wakeup:
 	/*
 	 * After no new page faults can wait on this fault_*wqh, flush
 	 * the last page faults that may have been already waiting on
@@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	start = uffdio_register.range.start;
 	end = start + uffdio_register.range.len;
 
+	ret = -ENOMEM;
+	if (!mmget_not_zero(mm))
+		goto out;
+
 	down_write(&mm->mmap_sem);
 	vma = find_vma_prev(mm, start, &prev);
-
-	ret = -ENOMEM;
 	if (!vma)
 		goto out_unlock;
 
@@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	} while (vma && vma->vm_start < end);
 out_unlock:
 	up_write(&mm->mmap_sem);
+	mmput(mm);
 	if (!ret) {
 		/*
 		 * Now that we scanned all vmas we can already tell
@@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	start = uffdio_unregister.start;
 	end = start + uffdio_unregister.len;
 
+	ret = -ENOMEM;
+	if (!mmget_not_zero(mm))
+		goto out;
+
 	down_write(&mm->mmap_sem);
 	vma = find_vma_prev(mm, start, &prev);
-
-	ret = -ENOMEM;
 	if (!vma)
 		goto out_unlock;
 
@@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	} while (vma && vma->vm_start < end);
 out_unlock:
 	up_write(&mm->mmap_sem);
+	mmput(mm);
 out:
 	return ret;
 }
@@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
 		goto out;
-
-	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-			   uffdio_copy.len);
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+				   uffdio_copy.len);
+		mmput(ctx->mm);
+	}
 	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
 		return -EFAULT;
 	if (ret < 0)
@@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
 		goto out;
 
-	ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-			     uffdio_zeropage.range.len);
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+				     uffdio_zeropage.range.len);
+		mmput(ctx->mm);
+	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;
 	if (ret < 0)
@@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags)
 	ctx->released = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
-	atomic_inc(&ctx->mm->mm_users);
+	atomic_inc(&ctx->mm->mm_count);
 
 	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
 				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
 	if (IS_ERR(file)) {
-		mmput(ctx->mm);
+		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 	}
 out:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 01fe1bb68754..6b3213d96da6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2723,12 +2723,17 @@ extern struct mm_struct * mm_alloc(void);
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct * mm)
+static inline void mmdrop(struct mm_struct *mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
 
+static inline bool mmget_not_zero(struct mm_struct *mm)
+{
+	return atomic_inc_not_zero(&mm->mm_users);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* same as above but performs the slow path from the async kontext. Can
-- 
cgit v1.2.3


From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Fri, 20 May 2016 16:58:38 -0700
Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments

Since commit 92923ca3aace ("mm: meminit: only set page reserved in the
memblock region") the reserved bit is set on reserved memblock regions.
However start and end address are passed as unsigned long.  This is only
32bit on i386, so it can end up marking the wrong pages reserved for
ranges at 4GB and above.

This was observed on a 32bit Xen dom0 which was booted with initial
memory set to a value below 4G but allowing to balloon in memory
(dom0_mem=1024M for example).  This would define a reserved bootmem
region for the additional memory (for example on a 8GB system there was
a reverved region covering the 4GB-8GB range).  But since the addresses
were passed on as unsigned long, this was actually marking all pages
from 0 to 4GB as reserved.

Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region")
Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Cc: <stable@vger.kernel.org>	[4.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/page_alloc.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 65d18a45b8e8..fbdb9d40847f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page);
 extern void adjust_managed_page_count(struct page *page, long count);
 extern void mem_init_print_info(const char *str);
 
-extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void __free_reserved_page(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3f4b69aaa23a..2dd1ba4e70cc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn)
  * marks the pages PageReserved. The remaining valid pages are later
  * sent to the buddy page allocator.
  */
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long end_pfn = PFN_UP(end);
-- 
cgit v1.2.3


From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 20 May 2016 16:58:41 -0700
Subject: mm: make faultaround produce old ptes

Currently, faultaround code produces young pte.  This can screw up
vmscan behaviour[1], as it makes vmscan think that these pages are hot
and not push them out on first round.

During sparse file access faultaround gets more pages mapped and all of
them are young.  Under memory pressure, this makes vmscan swap out anon
pages instead, or to drop other page cache pages which otherwise stay
resident.

Modify faultaround to produce old ptes, so they can easily be reclaimed
under memory pressure.

This can to some extend defeat the purpose of faultaround on machines
without hardware accessed bit as it will not help us with reducing the
number of minor page faults.

We may want to disable faultaround on such machines altogether, but
that's subject for separate patchset.

Minchan:
 "I tested 512M mmap sequential word read test on non-HW access bit
  system (i.e., ARM) and confirmed it doesn't increase minor fault any
  more.

  old: 4096 fault_around
  minor fault: 131291
  elapsed time: 6747645 usec

  new: 65536 fault_around
  minor fault: 131291
  elapsed time: 6709263 usec

  0.56% benefit"

[1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org

Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/filemap.c       |  2 +-
 mm/memory.c        | 23 ++++++++++++++++++-----
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fbdb9d40847f..f223ac26b5d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 }
 
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-		struct page *page, pte_t *pte, bool write, bool anon);
+		struct page *page, pte_t *pte, bool write, bool anon, bool old);
 #endif
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 8f4859989f1b..b418405903bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2191,7 +2191,7 @@ repeat:
 		if (file->f_ra.mmap_miss > 0)
 			file->f_ra.mmap_miss--;
 		addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
-		do_set_pte(vma, addr, page, pte, false, false);
+		do_set_pte(vma, addr, page, pte, false, false, true);
 		unlock_page(page);
 		goto next;
 unlock:
diff --git a/mm/memory.c b/mm/memory.c
index 007c72ad03f6..f29e5ab0342d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
  * vm_ops->map_pages.
  */
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-		struct page *page, pte_t *pte, bool write, bool anon)
+		struct page *page, pte_t *pte, bool write, bool anon, bool old)
 {
 	pte_t entry;
 
@@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (old)
+		entry = pte_mkold(entry);
 	if (anon) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address, false);
@@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
 			goto unlock_out;
+		do_fault_around(vma, address, pte, pgoff, flags);
+		/* Check if the fault is handled by faultaround */
+		if (!pte_same(*pte, orig_pte)) {
+			/*
+			 * Faultaround produce old pte, but the pte we've
+			 * handler fault for should be young.
+			 */
+			pte_t entry = pte_mkyoung(*pte);
+			if (ptep_set_access_flags(vma, address, pte, entry, 0))
+				update_mmu_cache(vma, address, pte);
+			goto unlock_out;
+		}
 		pte_unmap_unlock(pte, ptl);
 	}
 
@@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(vma, address, fault_page, pte, false, false);
+	do_set_pte(vma, address, fault_page, pte, false, false, false);
 	unlock_page(fault_page);
 unlock_out:
 	pte_unmap_unlock(pte, ptl);
@@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		goto uncharge_out;
 	}
-	do_set_pte(vma, address, new_page, pte, true, true);
+	do_set_pte(vma, address, new_page, pte, true, true, false);
 	mem_cgroup_commit_charge(new_page, memcg, false, false);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
@@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(vma, address, fault_page, pte, true, false);
+	do_set_pte(vma, address, fault_page, pte, true, false, false);
 	pte_unmap_unlock(pte, ptl);
 
 	if (set_page_dirty(fault_page))
-- 
cgit v1.2.3


From 0bb2fd13b69abfd88880f356903b5c7ca36d5eea Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 20 May 2016 16:58:59 -0700
Subject: mm: page_is_guard(): return false when page_ext arrays are not
 allocated yet

When enabling the below kernel configs:

CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM

kernel bootup may fail due to the following oops:

  BUG: unable to handle kernel NULL pointer dereference at           (null)
  IP: [<ffffffff8118d982>] free_pcppages_bulk+0x2d2/0x8d0
  PGD 0
  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in:
  CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26
  Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009
  task: ffff88017c080040 ti: ffff88017c084000 task.ti: ffff88017c084000
  RIP: 0010:[<ffffffff8118d982>]  [<ffffffff8118d982>] free_pcppages_bulk+0x2d2/0x8d0
  RSP: 0000:ffff88017c087c48  EFLAGS: 00010046
  RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
  RDX: 0000000000000980 RSI: 0000000000000080 RDI: 0000000000660401
  RBP: ffff88017c087cd0 R08: 0000000000000401 R09: 0000000000000009
  R10: ffff88017c080040 R11: 000000000000000a R12: 0000000000000400
  R13: ffffea0019810000 R14: ffffea0019810040 R15: ffff88066cfe6080
  FS:  0000000000000000(0000) GS:ffff88066cd40000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000000 CR3: 0000000002406000 CR4: 00000000000006e0
  Call Trace:
    free_hot_cold_page+0x192/0x1d0
    __free_pages+0x5c/0x90
    __free_pages_boot_core+0x11a/0x14e
    deferred_free_range+0x50/0x62
    deferred_init_memmap+0x220/0x3c3
    kthread+0xf8/0x110
    ret_from_fork+0x22/0x40
  Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff
  RIP  [<ffffffff8118d982>] free_pcppages_bulk+0x2d2/0x8d0
   RSP <ffff88017c087c48>
  CR2: 0000000000000000

The problem is lookup_page_ext() returns NULL then page_is_guard() tried
to access it in page freeing.

page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension
flag, but freeing page might reach here before the page_ext arrays are
allocated when feeding a range of pages to the allocator for the first
time during bootup or memory hotplug.

When it returns NULL, page_is_guard() should just return false instead
of checking PAGE_EXT_DEBUG_GUARD unconditionally.

Link: http://lkml.kernel.org/r/1463610225-29060-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f223ac26b5d9..b530c99e8e81 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2386,6 +2386,9 @@ static inline bool page_is_guard(struct page *page)
 		return false;
 
 	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext))
+		return false;
+
 	return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 }
 #else
-- 
cgit v1.2.3


From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 20 May 2016 16:59:11 -0700
Subject: mm: kasan: initial memory quarantine implementation

Quarantine isolates freed objects in a separate queue.  The objects are
returned to the allocator later, which helps to detect use-after-free
errors.

When the object is freed, its state changes from KASAN_STATE_ALLOC to
KASAN_STATE_QUARANTINE.  The object is poisoned and put into quarantine
instead of being returned to the allocator, therefore every subsequent
access to that object triggers a KASAN error, and the error handler is
able to say where the object has been allocated and deallocated.

When it's time for the object to leave quarantine, its state becomes
KASAN_STATE_FREE and it's returned to the allocator.  From now on the
allocator may reuse it for another allocation.  Before that happens,
it's still possible to detect a use-after free on that object (it
retains the allocation/deallocation stacks).

When the allocator reuses this object, the shadow is unpoisoned and old
allocation/deallocation stacks are wiped.  Therefore a use of this
object, even an incorrect one, won't trigger ASan warning.

Without the quarantine, it's not guaranteed that the objects aren't
reused immediately, that's why the probability of catching a
use-after-free is lower than with quarantine in place.

Quarantine isolates freed objects in a separate queue.  The objects are
returned to the allocator later, which helps to detect use-after-free
errors.

Freed objects are first added to per-cpu quarantine queues.  When a
cache is destroyed or memory shrinking is requested, the objects are
moved into the global quarantine queue.  Whenever a kmalloc call allows
memory reclaiming, the oldest objects are popped out of the global queue
until the total size of objects in quarantine is less than 3/4 of the
maximum quarantine size (which is a fraction of installed physical
memory).

As long as an object remains in the quarantine, KASAN is able to report
accesses to it, so the chance of reporting a use-after-free is
increased.  Once the object leaves quarantine, the allocator may reuse
it, in which case the object is unpoisoned and KASAN can't detect
incorrect accesses to it.

Right now quarantine support is only enabled in SLAB allocator.
Unification of KASAN features in SLAB and SLUB will be done later.

This patch is based on the "mm: kasan: quarantine" patch originally
prepared by Dmitry Chernenkov.  A number of improvements have been
suggested by Andrey Ryabinin.

[glider@google.com: v9]
  Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  13 ++-
 mm/kasan/Makefile     |   1 +
 mm/kasan/kasan.c      |  57 ++++++++--
 mm/kasan/kasan.h      |  21 +++-
 mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/kasan/report.c     |   1 +
 mm/mempool.c          |   2 +-
 mm/slab.c             |  12 ++-
 mm/slab.h             |   2 +
 mm/slab_common.c      |   2 +
 10 files changed, 387 insertions(+), 15 deletions(-)
 create mode 100644 mm/kasan/quarantine.c

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 737371b56044..611927f5870d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order);
 
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
 			unsigned long *flags);
+void kasan_cache_shrink(struct kmem_cache *cache);
+void kasan_cache_destroy(struct kmem_cache *cache);
 
 void kasan_poison_slab(struct page *page);
 void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
 void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
 
 void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
-void kasan_slab_free(struct kmem_cache *s, void *object);
+bool kasan_slab_free(struct kmem_cache *s, void *object);
+void kasan_poison_slab_free(struct kmem_cache *s, void *object);
 
 struct kasan_cache {
 	int alloc_meta_offset;
@@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      size_t *size,
 				      unsigned long *flags) {}
+static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
+static inline void kasan_cache_destroy(struct kmem_cache *cache) {}
 
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
@@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size,
 
 static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags) {}
-static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {}
 
 static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 131daadf40e4..1548749a3d45 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg
 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
 obj-y := kasan.o report.o kasan_init.o
+obj-$(CONFIG_SLAB) += quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 38f1dd79acdb..8df666bb23be 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
 }
 #endif
 
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
+void kasan_cache_destroy(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
 void kasan_poison_slab(struct page *page)
 {
 	kasan_poison_shadow(page_address(page),
@@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
 	kasan_kmalloc(cache, object, cache->object_size, flags);
 }
 
-void kasan_slab_free(struct kmem_cache *cache, void *object)
+void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
 {
 	unsigned long size = cache->object_size;
 	unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
@@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
 	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
 		return;
 
+	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object)
+{
 #ifdef CONFIG_SLAB
-	if (cache->flags & SLAB_KASAN) {
-		struct kasan_free_meta *free_info =
-			get_free_info(cache, object);
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return false;
+
+	if (likely(cache->flags & SLAB_KASAN)) {
 		struct kasan_alloc_meta *alloc_info =
 			get_alloc_info(cache, object);
-		alloc_info->state = KASAN_STATE_FREE;
-		set_track(&free_info->track, GFP_NOWAIT);
+		struct kasan_free_meta *free_info =
+			get_free_info(cache, object);
+
+		switch (alloc_info->state) {
+		case KASAN_STATE_ALLOC:
+			alloc_info->state = KASAN_STATE_QUARANTINE;
+			quarantine_put(free_info, cache);
+			set_track(&free_info->track, GFP_NOWAIT);
+			kasan_poison_slab_free(cache, object);
+			return true;
+		case KASAN_STATE_QUARANTINE:
+		case KASAN_STATE_FREE:
+			pr_err("Double free");
+			dump_stack();
+			break;
+		default:
+			break;
+		}
 	}
+	return false;
+#else
+	kasan_poison_slab_free(cache, object);
+	return false;
 #endif
-
-	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
 }
 
 void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
 	unsigned long redzone_start;
 	unsigned long redzone_end;
 
+	if (flags & __GFP_RECLAIM)
+		quarantine_reduce();
+
 	if (unlikely(object == NULL))
 		return;
 
@@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
 	unsigned long redzone_start;
 	unsigned long redzone_end;
 
+	if (flags & __GFP_RECLAIM)
+		quarantine_reduce();
+
 	if (unlikely(ptr == NULL))
 		return;
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 30a2f0ba0e09..7f7ac51d7faf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -62,6 +62,7 @@ struct kasan_global {
 enum kasan_state {
 	KASAN_STATE_INIT,
 	KASAN_STATE_ALLOC,
+	KASAN_STATE_QUARANTINE,
 	KASAN_STATE_FREE
 };
 
@@ -79,9 +80,14 @@ struct kasan_alloc_meta {
 	u32 reserved;
 };
 
+struct qlist_node {
+	struct qlist_node *next;
+};
 struct kasan_free_meta {
-	/* Allocator freelist pointer, unused by KASAN. */
-	void **freelist;
+	/* This field is used while the object is in the quarantine.
+	 * Otherwise it might be used for the allocator freelist.
+	 */
+	struct qlist_node quarantine_link;
 	struct kasan_track track;
 };
 
@@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void)
 void kasan_report(unsigned long addr, size_t size,
 		bool is_write, unsigned long ip);
 
+#ifdef CONFIG_SLAB
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+				struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
 #endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
new file mode 100644
index 000000000000..4973505a9bdd
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,291 @@
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a signle-linked list, which also stores the total size of
+ * objects inside of it.
+ */
+struct qlist_head {
+	struct qlist_node *head;
+	struct qlist_node *tail;
+	size_t bytes;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+	return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+	q->head = q->tail = NULL;
+	q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+		size_t size)
+{
+	if (unlikely(qlist_empty(q)))
+		q->head = qlink;
+	else
+		q->tail->next = qlink;
+	q->tail = qlink;
+	qlink->next = NULL;
+	q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	if (qlist_empty(to)) {
+		*to = *from;
+		qlist_init(from);
+		return;
+	}
+
+	to->tail->next = from->head;
+	to->tail = from->tail;
+	to->bytes += from->bytes;
+
+	qlist_init(from);
+}
+
+static void qlist_move(struct qlist_head *from, struct qlist_node *last,
+		struct qlist_head *to, size_t size)
+{
+	if (unlikely(last == from->tail)) {
+		qlist_move_all(from, to);
+		return;
+	}
+	if (qlist_empty(to))
+		to->head = from->head;
+	else
+		to->tail->next = from->head;
+	to->tail = last;
+	from->head = last->next;
+	last->next = NULL;
+	from->bytes -= size;
+	to->bytes += size;
+}
+
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+static struct qlist_head global_quarantine;
+static DEFINE_SPINLOCK(quarantine_lock);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep
+ * the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+	return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	struct kasan_free_meta *free_info =
+		container_of(qlink, struct kasan_free_meta,
+			     quarantine_link);
+
+	return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	void *object = qlink_to_object(qlink, cache);
+	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	alloc_info->state = KASAN_STATE_FREE;
+	___cache_free(cache, object, _THIS_IP_);
+	local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+	struct qlist_node *qlink;
+
+	if (unlikely(qlist_empty(q)))
+		return;
+
+	qlink = q->head;
+	while (qlink) {
+		struct kmem_cache *obj_cache =
+			cache ? cache :	qlink_to_cache(qlink);
+		struct qlist_node *next = qlink->next;
+
+		qlink_free(qlink, obj_cache);
+		qlink = next;
+	}
+	qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+	unsigned long flags;
+	struct qlist_head *q;
+	struct qlist_head temp = QLIST_INIT;
+
+	local_irq_save(flags);
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_put(q, &info->quarantine_link, cache->size);
+	if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+		qlist_move_all(q, &temp);
+
+	local_irq_restore(flags);
+
+	if (unlikely(!qlist_empty(&temp))) {
+		spin_lock_irqsave(&quarantine_lock, flags);
+		qlist_move_all(&temp, &global_quarantine);
+		spin_unlock_irqrestore(&quarantine_lock, flags);
+	}
+}
+
+void quarantine_reduce(void)
+{
+	size_t new_quarantine_size;
+	unsigned long flags;
+	struct qlist_head to_free = QLIST_INIT;
+	size_t size_to_free = 0;
+	struct qlist_node *last;
+
+	if (likely(READ_ONCE(global_quarantine.bytes) <=
+		   READ_ONCE(quarantine_size)))
+		return;
+
+	spin_lock_irqsave(&quarantine_lock, flags);
+
+	/*
+	 * Update quarantine size in case of hotplug. Allocate a fraction of
+	 * the installed memory to quarantine minus per-cpu queue limits.
+	 */
+	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+		QUARANTINE_FRACTION;
+	new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus();
+	WRITE_ONCE(quarantine_size, new_quarantine_size);
+
+	last = global_quarantine.head;
+	while (last) {
+		struct kmem_cache *cache = qlink_to_cache(last);
+
+		size_to_free += cache->size;
+		if (!last->next || size_to_free >
+		    global_quarantine.bytes - QUARANTINE_LOW_SIZE)
+			break;
+		last = last->next;
+	}
+	qlist_move(&global_quarantine, last, &to_free, size_to_free);
+
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, NULL);
+}
+
+static void qlist_move_cache(struct qlist_head *from,
+				   struct qlist_head *to,
+				   struct kmem_cache *cache)
+{
+	struct qlist_node *prev = NULL, *curr;
+
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	curr = from->head;
+	while (curr) {
+		struct qlist_node *qlink = curr;
+		struct kmem_cache *obj_cache = qlink_to_cache(qlink);
+
+		if (obj_cache == cache) {
+			if (unlikely(from->head == qlink)) {
+				from->head = curr->next;
+				prev = curr;
+			} else
+				prev->next = curr->next;
+			if (unlikely(from->tail == qlink))
+				from->tail = curr->next;
+			from->bytes -= cache->size;
+			qlist_put(to, qlink, cache->size);
+		} else {
+			prev = curr;
+		}
+		curr = curr->next;
+	}
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+	struct kmem_cache *cache = arg;
+	struct qlist_head to_free = QLIST_INIT;
+	struct qlist_head *q;
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_move_cache(q, &to_free, cache);
+	qlist_free_all(&to_free, cache);
+}
+
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+	unsigned long flags;
+	struct qlist_head to_free = QLIST_INIT;
+
+	on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+	spin_lock_irqsave(&quarantine_lock, flags);
+	qlist_move_cache(&global_quarantine, &to_free, cache);
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, cache);
+}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 60869a5a0124..b3c122ddd454 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page,
 		print_track(&alloc_info->track);
 		break;
 	case KASAN_STATE_FREE:
+	case KASAN_STATE_QUARANTINE:
 		pr_err("Object freed, allocated with size %u bytes\n",
 		       alloc_info->alloc_size);
 		free_info = get_free_info(cache, object);
diff --git a/mm/mempool.c b/mm/mempool.c
index 9b7a14a791cc..9e075f829d0d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element)
 static void kasan_poison_element(mempool_t *pool, void *element)
 {
 	if (pool->alloc == mempool_alloc_slab)
-		kasan_slab_free(pool->pool_data, element);
+		kasan_poison_slab_free(pool->pool_data, element);
 	if (pool->alloc == mempool_kmalloc)
 		kasan_kfree(element);
 	if (pool->alloc == mempool_alloc_pages)
diff --git a/mm/slab.c b/mm/slab.c
index c11bf5007952..28864c022430 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3547,9 +3547,17 @@ free_done:
 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 				unsigned long caller)
 {
-	struct array_cache *ac = cpu_cache_get(cachep);
+	/* Put the object into the quarantine, don't touch it for now. */
+	if (kasan_slab_free(cachep, objp))
+		return;
+
+	___cache_free(cachep, objp, caller);
+}
 
-	kasan_slab_free(cachep, objp);
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+		unsigned long caller)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
diff --git a/mm/slab.h b/mm/slab.h
index 5969769fbee6..dedb1a920fb8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos);
 void slab_stop(struct seq_file *m, void *p);
 int memcg_slab_show(struct seq_file *m, void *p);
 
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+
 #endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3239bfd758e6..a65dad7fdcd1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	get_online_cpus();
 	get_online_mems();
 
+	kasan_cache_destroy(s);
 	mutex_lock(&slab_mutex);
 
 	s->refcount--;
@@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 
 	get_online_cpus();
 	get_online_mems();
+	kasan_cache_shrink(cachep);
 	ret = __kmem_cache_shrink(cachep, false);
 	put_online_mems();
 	put_online_cpus();
-- 
cgit v1.2.3


From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 20 May 2016 16:59:28 -0700
Subject: mm/kasan: add API to check memory regions

Memory access coded in an assembly won't be seen by KASAN as a compiler
can instrument only C code.  Add kasan_check_[read,write]() API which is
going to be used to check a certain memory range.

Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                  |  2 +-
 include/linux/kasan-checks.h | 12 ++++++++++++
 mm/kasan/kasan.c             | 12 ++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kasan-checks.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 374ffa2d81b7..8b92445561b6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6242,7 +6242,7 @@ S:	Maintained
 F:	arch/*/include/asm/kasan.h
 F:	arch/*/mm/kasan_init*
 F:	Documentation/kasan.txt
-F:	include/linux/kasan.h
+F:	include/linux/kasan*.h
 F:	lib/test_kasan.c
 F:	mm/kasan/
 F:	scripts/Makefile.kasan
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
new file mode 100644
index 000000000000..b7f8aced7870
--- /dev/null
+++ b/include/linux/kasan-checks.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_KASAN_CHECKS_H
+#define _LINUX_KASAN_CHECKS_H
+
+#ifdef CONFIG_KASAN
+void kasan_check_read(const void *p, unsigned int size);
+void kasan_check_write(const void *p, unsigned int size);
+#else
+static inline void kasan_check_read(const void *p, unsigned int size) { }
+static inline void kasan_check_write(const void *p, unsigned int size) { }
+#endif
+
+#endif
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e5beb40d97b1..18b6a2b8d183 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr,
 	check_memory_region_inline(addr, size, write, ret_ip);
 }
 
+void kasan_check_read(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
 #undef memset
 void *memset(void *addr, int c, size_t len)
 {
-- 
cgit v1.2.3


From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Fri, 20 May 2016 16:59:48 -0700
Subject: zsmalloc: require GFP in zs_malloc()

Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to
zs_create_pool(), so we can be more flexible, but, more importantly, we
need this to switch zram to per-cpu compression streams -- zram will try
to allocate handle with preemption disabled in a fast path and switch to
a slow path (using different gfp mask) if the fast one has failed.

Apart from that, this also align zs_malloc() interface with zspool/zbud.

[sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask]
  Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish
Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c |  4 ++--
 include/linux/zsmalloc.h      |  4 ++--
 mm/zsmalloc.c                 | 24 +++++++++++++-----------
 3 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f76016d..b09acdb753ee 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
 		goto out_error;
 	}
 
-	meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+	meta->mem_pool = zs_create_pool(pool_name);
 	if (!meta->mem_pool) {
 		pr_err("Error creating memory pool\n");
 		goto out_error;
@@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen);
+	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
 	if (!handle) {
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 34eb16098a33..57a8e98f2708 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -41,10 +41,10 @@ struct zs_pool_stats {
 
 struct zs_pool;
 
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
+struct zs_pool *zs_create_pool(const char *name);
 void zs_destroy_pool(struct zs_pool *pool);
 
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
 void zs_free(struct zs_pool *pool, unsigned long obj);
 
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index ae288c9f7156..aba39a291523 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +247,6 @@ struct zs_pool {
 	struct size_class **size_class;
 	struct kmem_cache *handle_cachep;
 
-	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
 
 	struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
 	kmem_cache_destroy(pool->handle_cachep);
 }
 
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
 	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-		pool->flags & ~__GFP_HIGHMEM);
+			gfp & ~__GFP_HIGHMEM);
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
 			     const struct zpool_ops *zpool_ops,
 			     struct zpool *zpool)
 {
-	return zs_create_pool(name, gfp);
+	/*
+	 * Ignore global gfp flags: zs_malloc() may be invoked from
+	 * different contexts and its caller must provide a valid
+	 * gfp mask.
+	 */
+	return zs_create_pool(name);
 }
 
 static void zs_zpool_destroy(void *pool)
@@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool)
 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
 			unsigned long *handle)
 {
-	*handle = zs_malloc(pool, size);
+	*handle = zs_malloc(pool, size, gfp);
 	return *handle ? 0 : -1;
 }
 static void zs_zpool_free(void *pool, unsigned long handle)
@@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class,
  * otherwise 0.
  * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
  */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
 	unsigned long handle, obj;
 	struct size_class *class;
@@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
 		return 0;
 
-	handle = alloc_handle(pool);
+	handle = alloc_handle(pool, gfp);
 	if (!handle)
 		return 0;
 
@@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 
 	if (!first_page) {
 		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
+		first_page = alloc_zspage(class, gfp);
 		if (unlikely(!first_page)) {
 			free_handle(pool, handle);
 			return 0;
@@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
  * On success, a pointer to the newly created pool is returned,
  * otherwise NULL.
  */
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name)
 {
 	int i;
 	struct zs_pool *pool;
@@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 		prev_class = class;
 	}
 
-	pool->flags = flags;
-
 	if (zs_pool_stat_create(pool, name))
 		goto err;
 
-- 
cgit v1.2.3


From 5f56a5dfdb9bcb3bca03df59980d4d2f012cbb53 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:16 -0700
Subject: exit_thread: remove empty bodies

Define HAVE_EXIT_THREAD for archs which want to do something in
exit_thread. For others, let's define exit_thread as an empty inline.

This is a cleanup before we change the prototype of exit_thread to
accept a task parameter.

[akpm@linux-foundation.org: fix mips]
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                            |  5 +++++
 arch/alpha/kernel/process.c             |  8 --------
 arch/arc/kernel/process.c               |  7 -------
 arch/arm/Kconfig                        |  1 +
 arch/arm64/kernel/process.c             |  7 -------
 arch/avr32/Kconfig                      |  1 +
 arch/blackfin/include/asm/processor.h   |  7 -------
 arch/c6x/kernel/process.c               |  4 ----
 arch/cris/Kconfig                       |  1 +
 arch/cris/arch-v10/kernel/process.c     |  9 ---------
 arch/frv/include/asm/processor.h        |  7 -------
 arch/h8300/include/asm/processor.h      |  7 -------
 arch/hexagon/kernel/process.c           |  7 -------
 arch/ia64/Kconfig                       |  1 +
 arch/m32r/kernel/process.c              |  9 ---------
 arch/m68k/include/asm/processor.h       |  7 -------
 arch/metag/Kconfig                      |  1 +
 arch/metag/include/asm/processor.h      |  2 --
 arch/microblaze/include/asm/processor.h | 10 ----------
 arch/mips/kernel/process.c              |  4 ----
 arch/mn10300/Kconfig                    |  1 +
 arch/nios2/include/asm/processor.h      |  5 -----
 arch/openrisc/include/asm/processor.h   |  9 ---------
 arch/parisc/kernel/process.c            |  7 -------
 arch/powerpc/kernel/process.c           |  4 ----
 arch/s390/Kconfig                       |  1 +
 arch/score/kernel/process.c             |  2 --
 arch/sh/Kconfig                         |  1 +
 arch/sh/kernel/process_32.c             |  7 -------
 arch/sparc/Kconfig                      |  1 +
 arch/tile/Kconfig                       |  1 +
 arch/um/kernel/process.c                |  4 ----
 arch/unicore32/kernel/process.c         |  7 -------
 arch/x86/Kconfig                        |  1 +
 arch/xtensa/Kconfig                     |  1 +
 include/linux/sched.h                   |  7 +++++++
 36 files changed, 24 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5e7e17..0f298f9123dc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -517,6 +517,11 @@ config HAVE_ARCH_MMAP_RND_BITS
 	  - ARCH_MMAP_RND_BITS_MIN
 	  - ARCH_MMAP_RND_BITS_MAX
 
+config HAVE_EXIT_THREAD
+	bool
+	help
+	  An architecture implements exit_thread.
+
 config ARCH_MMAP_RND_BITS_MIN
 	int
 
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 84d13263ce46..b483156698d5 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -210,14 +210,6 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
 }
 EXPORT_SYMBOL(start_thread);
 
-/*
- * Free current thread data structures etc..
- */
-void
-exit_thread(void)
-{
-}
-
 void
 flush_thread(void)
 {
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index a3f750e76b68..b5db9e7fd649 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -183,13 +183,6 @@ void flush_thread(void)
 {
 }
 
-/*
- * Free any architecture-specific thread data structures, etc.
- */
-void exit_thread(void)
-{
-}
-
 int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 {
 	return 0;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b99d25b4133e..956d3575426c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -50,6 +50,7 @@ config ARM
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
+	select HAVE_EXIT_THREAD
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
 	select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 48eea6866c67..6cd2612236dc 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -200,13 +200,6 @@ void show_regs(struct pt_regs * regs)
 	__show_regs(regs);
 }
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
 static void tls_thread_flush(void)
 {
 	asm ("msr tpidr_el0, xzr");
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 18b88779e701..e43519a2ca89 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -4,6 +4,7 @@ config AVR32
 	# that we usually don't need on AVR32.
 	select EXPERT
 	select HAVE_CLK
+	select HAVE_EXIT_THREAD
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select VIRT_TO_BUS
diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h
index 7acd46653df3..0c265aba94ad 100644
--- a/arch/blackfin/include/asm/processor.h
+++ b/arch/blackfin/include/asm/processor.h
@@ -75,13 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c
index 3ae9f5a166a0..0ee7686a78f3 100644
--- a/arch/c6x/kernel/process.c
+++ b/arch/c6x/kernel/process.c
@@ -82,10 +82,6 @@ void flush_thread(void)
 {
 }
 
-void exit_thread(void)
-{
-}
-
 /*
  * Do necessary setup to start up a newly executed thread.
  */
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 99bda1ba3d2f..5c0ca8ae9293 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -59,6 +59,7 @@ config CRIS
 	select GENERIC_IOMAP
 	select MODULES_USE_ELF_RELA
 	select CLONE_BACKWARDS2
+	select HAVE_EXIT_THREAD if ETRAX_ARCH_V32
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select GPIOLIB
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 02b783457be0..96e5afef6b47 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -35,15 +35,6 @@ void default_idle(void)
 	local_irq_enable();
 }
 
-/*
- * Free current thread data structures etc..
- */
-
-void exit_thread(void)
-{
-	/* Nothing needs to be done.  */
-}
-
 /* if the watchdog is enabled, we can simply disable interrupts and go
  * into an eternal loop, and the watchdog will reset the CPU after 0.1s
  * if on the other hand the watchdog wasn't enabled, we just enable it and wait
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index ae8d423e79d9..73f0a79ad8e6 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -96,13 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ...
 #define release_segments(mm)		do { } while (0)
 #define forget_segments()		do { } while (0)
 
-/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h
index 54e3fd83c336..111df7397ac7 100644
--- a/arch/h8300/include/asm/processor.h
+++ b/arch/h8300/include/asm/processor.h
@@ -110,13 +110,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index a9ebd471823a..d9edfd3fc52a 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -136,13 +136,6 @@ void release_thread(struct task_struct *dead_task)
 {
 }
 
-/*
- * Free any architecture-specific thread data structures, etc.
- */
-void exit_thread(void)
-{
-}
-
 /*
  * Some archs flush debug and FPU info here
  */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index b534ebab36ea..f80758cb7157 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -18,6 +18,7 @@ config IA64
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
 	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select HAVE_EXIT_THREAD
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index e69221d581d5..a88b1f01e91f 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -101,15 +101,6 @@ void show_regs(struct pt_regs * regs)
 #endif
 }
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-	/* Nothing to do. */
-	DPRINTK("pid = %d\n", current->pid);
-}
-
 void flush_thread(void)
 {
 	DPRINTK("pid = %d\n", current->pid);
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index 20dda1d4b860..a6ce2ec8d693 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -153,13 +153,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88da3e31..e47a08d72819 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -11,6 +11,7 @@ config METAG
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DYNAMIC_FTRACE
+	select HAVE_EXIT_THREAD
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_TRACER
 	select HAVE_KERNEL_BZIP2
diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h
index 0838ca699764..a0333ebcac35 100644
--- a/arch/metag/include/asm/processor.h
+++ b/arch/metag/include/asm/processor.h
@@ -134,8 +134,6 @@ static inline void release_thread(struct task_struct *dead_task)
 #define copy_segments(tsk, mm)		do { } while (0)
 #define release_segments(mm)		do { } while (0)
 
-extern void exit_thread(void);
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 497a988d79c2..c38d0dd91134 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -70,11 +70,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/* Free all resources held by a thread. */
-static inline void exit_thread(void)
-{
-}
-
 extern unsigned long thread_saved_pc(struct task_struct *t);
 
 extern unsigned long get_wchan(struct task_struct *p);
@@ -127,11 +122,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/* Free current thread data structures etc.  */
-static inline void exit_thread(void)
-{
-}
-
 /* Return saved (kernel) PC of a blocked thread.  */
 #  define thread_saved_pc(tsk)	\
 	((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0)
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index a6b3dc54260a..411c971e3417 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -73,10 +73,6 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
 	regs->regs[29] = sp;
 }
 
-void exit_thread(void)
-{
-}
-
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	/*
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 06ddb5501ab1..9627e81a6cbb 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -1,5 +1,6 @@
 config MN10300
 	def_bool y
+	select HAVE_EXIT_THREAD
 	select HAVE_OPROFILE
 	select HAVE_UID16
 	select GENERIC_IRQ_SHOW
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index c2ba45c159c7..1c953f0cadbf 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/* Free current thread data structures etc.. */
-static inline void exit_thread(void)
-{
-}
-
 /* Return saved PC of a blocked thread. */
 #define thread_saved_pc(tsk)	((tsk)->thread.kregs->ea)
 
diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h
index 4d235e3d2534..70334c9f7d24 100644
--- a/arch/openrisc/include/asm/processor.h
+++ b/arch/openrisc/include/asm/processor.h
@@ -84,15 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp);
 void release_thread(struct task_struct *);
 unsigned long get_wchan(struct task_struct *p);
 
-/*
- * Free current thread data structures etc..
- */
-
-extern inline void exit_thread(void)
-{
-	/* Nothing needs to be done.  */
-}
-
 /*
  * Return saved PC of a blocked thread. For now, this is the "user" PC
  */
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 809905a811ed..40639439d8b3 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -144,13 +144,6 @@ void machine_power_off(void)
 void (*pm_power_off)(void) = machine_power_off;
 EXPORT_SYMBOL(pm_power_off);
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
 void flush_thread(void)
 {
 	/* Only needs to handle fpu stuff or perf monitors.
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ea8a28fd6f31..e2f12cbcade9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1329,10 +1329,6 @@ void show_regs(struct pt_regs * regs)
 		show_instructions(regs);
 }
 
-void exit_thread(void)
-{
-}
-
 void flush_thread(void)
 {
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index de0fcc08dff5..e2c9aaaf64b2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -134,6 +134,7 @@ config S390
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select HAVE_EXIT_THREAD
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index a1519ad3d49d..aae9480706c2 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -56,8 +56,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp)
 	regs->regs[0] = sp;
 }
 
-void exit_thread(void) {}
-
 /*
  * When a process does an "exec", machine state like FPU and debug
  * registers need to be reset.  This is a hook function for that.
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc3fc81..cb93af8f8017 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -71,6 +71,7 @@ config SUPERH32
 
 config SUPERH64
 	def_bool ARCH = "sh64"
+	select HAVE_EXIT_THREAD
 	select KALLSYMS
 
 config ARCH_DEFCONFIG
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 2885fc9d9dcd..ee12e9451874 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -76,13 +76,6 @@ void start_thread(struct pt_regs *regs, unsigned long new_pc,
 }
 EXPORT_SYMBOL(start_thread);
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index db0a26cffa97..27b3a0ad40a0 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -20,6 +20,7 @@ config SPARC
 	select HAVE_OPROFILE
 	select HAVE_ARCH_KGDB if !SMP || SPARC64
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_EXIT_THREAD
 	select SYSCTL_EXCEPTION_TRACE
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select RTC_CLASS
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 81719302b056..174746225577 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -3,6 +3,7 @@
 
 config TILE
 	def_bool y
+	select HAVE_EXIT_THREAD
 	select HAVE_PERF_EVENTS
 	select USE_PMC if PERF_EVENTS
 	select HAVE_DMA_API_DEBUG
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 48af59aae129..0b04711f1f18 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -103,10 +103,6 @@ void interrupt_end(void)
 		tracehook_notify_resume(regs);
 }
 
-void exit_thread(void)
-{
-}
-
 int get_current_pid(void)
 {
 	return task_pid_nr(current);
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index b008e9961465..00299c927852 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -201,13 +201,6 @@ void show_regs(struct pt_regs *regs)
 	__backtrace();
 }
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
 void flush_thread(void)
 {
 	struct thread_info *thread = current_thread_info();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ace79d2da2c3..8ff5b3be95d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select HAVE_EXIT_THREAD
 	select HAVE_FENTRY			if X86_64
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_FP_TEST
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 85257afe71c3..64336f666fb6 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -14,6 +14,7 @@ config XTENSA
 	select GENERIC_PCI_IOMAP
 	select GENERIC_SCHED_CLOCK
 	select HAVE_DMA_API_DEBUG
+	select HAVE_EXIT_THREAD
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUTEX_CMPXCHG if !MMU
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6b3213d96da6..167c0d4bf3fa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2769,7 +2769,14 @@ static inline int copy_thread_tls(
 }
 #endif
 extern void flush_thread(void);
+
+#ifdef CONFIG_HAVE_EXIT_THREAD
 extern void exit_thread(void);
+#else
+static inline void exit_thread(void)
+{
+}
+#endif
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_sighand(struct sighand_struct *);
-- 
cgit v1.2.3


From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:20 -0700
Subject: exit_thread: accept a task parameter to be exited

We need to call exit_thread from copy_process in a fail path.  So make it
accept task_struct as a parameter.

[v2]
* s390: exit_thread_runtime_instr doesn't make sense to be called for
  non-current tasks.
* arm: fix the comment in vfp_thread_copy
* change 'me' to 'tsk' for task_struct
* now we can change only archs that actually have exit_thread

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/process.c           |  4 ++--
 arch/arm/vfp/vfpmodule.c            |  4 ----
 arch/avr32/kernel/process.c         |  4 ++--
 arch/cris/arch-v32/kernel/process.c |  4 ++--
 arch/ia64/kernel/perfmon.c          |  4 ++--
 arch/ia64/kernel/process.c          | 14 +++++++-------
 arch/metag/kernel/process.c         |  6 +++---
 arch/mn10300/kernel/process.c       |  4 ++--
 arch/s390/kernel/process.c          |  5 +++--
 arch/sh/kernel/process_64.c         |  5 ++---
 arch/sparc/kernel/process_32.c      | 12 ++++++------
 arch/sparc/kernel/process_64.c      |  4 ++--
 arch/tile/kernel/process.c          |  4 ++--
 arch/x86/kernel/process.c           |  5 ++---
 arch/xtensa/kernel/process.c        |  4 ++--
 include/linux/sched.h               |  4 ++--
 kernel/exit.c                       |  2 +-
 17 files changed, 42 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 4adfb46e3ee9..a647d6642f3e 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head);
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	thread_notify(THREAD_NOTIFY_EXIT, current_thread_info());
+	thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk));
 }
 
 void flush_thread(void)
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 2a61e4b04600..73085d3482ed 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread)
  *   - we could be preempted if tree preempt rcu is enabled, so
  *	it is unsafe to use thread->cpu.
  *  THREAD_NOTIFY_EXIT
- *   - the thread (v) will be running on the local CPU, so
- *	v === current_thread_info()
- *   - thread->cpu is the local CPU number at the time it is accessed,
- *	but may change at any time.
  *   - we could be preempted if tree preempt rcu is enabled, so
  *	it is unsafe to use thread->cpu.
  */
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 42a53e740a7e..68e5b9dac059 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -62,9 +62,9 @@ void machine_restart(char *cmd)
 /*
  * Free current thread data structures etc
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	ocd_disable(current);
+	ocd_disable(tsk);
 }
 
 void flush_thread(void)
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index c7ce784a393c..4d1afa9f9fd3 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -33,9 +33,9 @@ void default_idle(void)
  */
 
 extern void deconfigure_bp(long pid);
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	deconfigure_bp(current->pid);
+	deconfigure_bp(tsk->pid);
 }
 
 /*
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 9cd607b06964..2436ad5f92c1 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 
 
 /*
- * called only from exit_thread(): task == current
- * we come here only if current has a context attached (loaded or masked)
+ * called only from exit_thread()
+ * we come here only if the task has a context attached (loaded or masked)
  */
 void
 pfm_exit_thread(struct task_struct *task)
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index b51514957620..aae6c4dc7ae7 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -570,22 +570,22 @@ flush_thread (void)
 }
 
 /*
- * Clean up state associated with current thread.  This is called when
+ * Clean up state associated with a thread.  This is called when
  * the thread calls exit().
  */
 void
-exit_thread (void)
+exit_thread (struct task_struct *tsk)
 {
 
-	ia64_drop_fpu(current);
+	ia64_drop_fpu(tsk);
 #ifdef CONFIG_PERFMON
        /* if needed, stop monitoring and flush state to perfmon context */
-	if (current->thread.pfm_context)
-		pfm_exit_thread(current);
+	if (tsk->thread.pfm_context)
+		pfm_exit_thread(tsk);
 
 	/* free debug register resources */
-	if (current->thread.flags & IA64_THREAD_DBG_VALID)
-		pfm_release_debug_registers(current);
+	if (tsk->thread.flags & IA64_THREAD_DBG_VALID)
+		pfm_release_debug_registers(tsk);
 #endif
 }
 
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 7f546183a0f0..35062796edf2 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -345,10 +345,10 @@ void flush_thread(void)
 /*
  * Free current thread data structures etc.
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	clear_fpu(&current->thread);
-	clear_dsp(&current->thread);
+	clear_fpu(&tsk->thread);
+	clear_dsp(&tsk->thread);
 }
 
 /* TODO: figure out how to unwind the kernel stack here to figure out
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 74a96ccf7451..cbede4e88dee 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs)
 /*
  * free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	exit_fpu(current);
+	exit_fpu(tsk);
 }
 
 void flush_thread(void)
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 481d7a83efc6..bba4fa74b321 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -68,9 +68,10 @@ extern void kernel_thread_starter(void);
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	exit_thread_runtime_instr();
+	if (tsk == current)
+		exit_thread_runtime_instr();
 }
 
 void flush_thread(void)
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index e2062e643341..9d3e9916555d 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 	/*
 	 * See arch/sparc/kernel/process.c for the precedent for doing
@@ -307,9 +307,8 @@ void exit_thread(void)
 	 * which it would get safely nulled.
 	 */
 #ifdef CONFIG_SH_FPU
-	if (last_task_used_math == current) {
+	if (last_task_used_math == tsk)
 		last_task_used_math = NULL;
-	}
 #endif
 }
 
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index c5113c7ce2fd..b7780a5bef11 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #ifndef CONFIG_SMP
-	if(last_task_used_math == current) {
+	if (last_task_used_math == tsk) {
 #else
-	if (test_thread_flag(TIF_USEDFPU)) {
+	if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) {
 #endif
 		/* Keep process from leaving FPU in a bogon state. */
 		put_psr(get_psr() | PSR_EF);
-		fpsave(&current->thread.float_regs[0], &current->thread.fsr,
-		       &current->thread.fpqueue[0], &current->thread.fpqdepth);
+		fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr,
+		       &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth);
 #ifndef CONFIG_SMP
 		last_task_used_math = NULL;
 #else
-		clear_thread_flag(TIF_USEDFPU);
+		clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU);
 #endif
 	}
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index c16ef1af1843..fa14402b33f9 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 }
 
 /* Free current thread data structures etc.. */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	struct thread_info *t = current_thread_info();
+	struct thread_info *t = task_thread_info(tsk);
 
 	if (t->utraps) {
 		if (t->utraps[0] < 2)
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index b5f30d376ce1..6b705ccc9cc1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -541,7 +541,7 @@ void flush_thread(void)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #ifdef CONFIG_HARDWALL
 	/*
@@ -550,7 +550,7 @@ void exit_thread(void)
 	 * the last reference to a hardwall fd, it would already have
 	 * been released and deactivated at this point.)
 	 */
-	hardwall_deactivate_all(current);
+	hardwall_deactivate_all(tsk);
 #endif
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2915d54e9dd5..96becbbb52e0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	struct task_struct *me = current;
-	struct thread_struct *t = &me->thread;
+	struct thread_struct *t = &tsk->thread;
 	unsigned long *bp = t->io_bitmap_ptr;
 	struct fpu *fpu = &t->fpu;
 
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 5bbfed81c97b..e0ded48561db 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -115,10 +115,10 @@ void arch_cpu_idle(void)
 /*
  * This is called when the thread calls exit().
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #if XTENSA_HAVE_COPROCESSORS
-	coprocessor_release_all(current_thread_info());
+	coprocessor_release_all(task_thread_info(tsk));
 #endif
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 167c0d4bf3fa..02bdab4d6db7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2771,9 +2771,9 @@ static inline int copy_thread_tls(
 extern void flush_thread(void);
 
 #ifdef CONFIG_HAVE_EXIT_THREAD
-extern void exit_thread(void);
+extern void exit_thread(struct task_struct *tsk);
 #else
-static inline void exit_thread(void)
+static inline void exit_thread(struct task_struct *tsk)
 {
 }
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..75b34fe835b2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -746,7 +746,7 @@ void do_exit(long code)
 		disassociate_ctty(1);
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
-	exit_thread();
+	exit_thread(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
-- 
cgit v1.2.3


From 2eeed7e98d6a1341b1574893a95ce5b8379140f2 Mon Sep 17 00:00:00 2001
From: René Nyffenegger <mail@renenyffenegger.ch>
Date: Fri, 20 May 2016 17:00:30 -0700
Subject: include/linux/syscalls.h: use pid_t instead of int
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In include/linux/syscalls.h, the four functions sys_kill, sys_tgkill,
sys_tkill and sys_rt_sigqueueinfo are declared with "int pid" and "int
tgid".

However, in kernel/signal.c, the corresponding definitions use the more
appropriate "pid_t" (which is a typedef'd int).

This patch changes "int" to "pid_t" in the declarations of sys_kill,
sys_tgkill, sys_tkill and sys_rt_sigqueueinfo in <linux/syscalls.h> in
order to harmonize the function declarations with their respective
definitions.

Link: http://lkml.kernel.org/r/57302FDA.7020205@renenyffenegger.ch
Signed-off-by: René Nyffenegger <mail@renenyffenegger.ch>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Cc: Zach Brown <zab@redhat.com>
Cc: Milosz Tanski <milosz@adfin.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d795472c54d8..d02239022bd0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -371,10 +371,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
 				size_t sigsetsize);
 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t  pid, int sig,
 		siginfo_t __user *uinfo);
-asmlinkage long sys_kill(int pid, int sig);
-asmlinkage long sys_tgkill(int tgid, int pid, int sig);
-asmlinkage long sys_tkill(int pid, int sig);
-asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
+asmlinkage long sys_kill(pid_t pid, int sig);
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig);
+asmlinkage long sys_tkill(pid_t pid, int sig);
+asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo);
 asmlinkage long sys_sgetmask(void);
 asmlinkage long sys_ssetmask(int newmask);
 asmlinkage long sys_signal(int sig, __sighandler_t handler);
-- 
cgit v1.2.3


From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:33 -0700
Subject: printk/nmi: generic solution for safe printk in NMI

printk() takes some locks and could not be used a safe way in NMI
context.

The chance of a deadlock is real especially when printing stacks from
all CPUs.  This particular problem has been addressed on x86 by the
commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all
CPUs").

The patchset brings two big advantages.  First, it makes the NMI
backtraces safe on all architectures for free.  Second, it makes all NMI
messages almost safe on all architectures (the temporary buffer is
limited.  We still should keep the number of messages in NMI context at
minimum).

Note that there already are several messages printed in NMI context:
WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE
handlers.  These are not easy to avoid.

This patch reuses most of the code and makes it generic.  It is useful
for all messages and architectures that support NMI.

The alternative printk_func is set when entering and is reseted when
leaving NMI context.  It queues IRQ work to copy the messages into the
main ring buffer in a safe context.

__printk_nmi_flush() copies all available messages and reset the buffer.
Then we could use a simple cmpxchg operations to get synchronized with
writers.  There is also used a spinlock to get synchronized with other
flushers.

We do not longer use seq_buf because it depends on external lock.  It
would be hard to make all supported operations safe for a lockless use.
It would be confusing and error prone to make only some operations safe.

The code is put into separate printk/nmi.c as suggested by Steven
Rostedt.  It needs a per-CPU buffer and is compiled only on
architectures that call nmi_enter().  This is achieved by the new
HAVE_NMI Kconfig flag.

The are MN10300 and Xtensa architectures.  We need to clean up NMI
handling there first.  Let's do it separately.

The patch is heavily based on the draft from Peter Zijlstra, see

  https://lkml.org/lkml/2015/6/10/327

[arnd@arndb.de: printk-nmi: use %zu format string for size_t]
[akpm@linux-foundation.org: min_t->min - all types are size_t here]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jan Kara <jack@suse.cz>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>	[arm part]
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  |   4 +
 arch/arm/Kconfig              |   1 +
 arch/arm/kernel/smp.c         |   2 +
 arch/avr32/Kconfig            |   1 +
 arch/blackfin/Kconfig         |   1 +
 arch/cris/Kconfig             |   1 +
 arch/mips/Kconfig             |   1 +
 arch/powerpc/Kconfig          |   1 +
 arch/s390/Kconfig             |   1 +
 arch/sh/Kconfig               |   1 +
 arch/sparc/Kconfig            |   1 +
 arch/tile/Kconfig             |   1 +
 arch/x86/Kconfig              |   1 +
 arch/x86/kernel/apic/hw_nmi.c |   1 -
 include/linux/hardirq.h       |   2 +
 include/linux/percpu.h        |   3 -
 include/linux/printk.h        |  12 ++-
 init/Kconfig                  |   5 +
 init/main.c                   |   1 +
 kernel/printk/Makefile        |   1 +
 kernel/printk/internal.h      |  44 +++++++++
 kernel/printk/nmi.c           | 219 ++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c        |  19 +---
 lib/nmi_backtrace.c           |  89 +----------------
 24 files changed, 306 insertions(+), 107 deletions(-)
 create mode 100644 kernel/printk/internal.h
 create mode 100644 kernel/printk/nmi.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 0f298f9123dc..8f84fd268dee 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -187,7 +187,11 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_NMI
+	bool
+
 config HAVE_NMI_WATCHDOG
+	depends on HAVE_NMI
 	bool
 #
 # An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 956d3575426c..90542db1220d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -67,6 +67,7 @@ config ARM
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index baee70267f29..df90bc59bfce 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 		break;
 
 	case IPI_CPU_BACKTRACE:
+		printk_nmi_enter();
 		irq_enter();
 		nmi_cpu_backtrace(regs);
 		irq_exit();
+		printk_nmi_exit();
 		break;
 
 	default:
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index e43519a2ca89..7e75d45e20cd 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -18,6 +18,7 @@ config AVR32
 	select GENERIC_CLOCKEVENTS
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select HAVE_NMI
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index a63c12259e77..28c63fea786d 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_NMI
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 5c0ca8ae9293..deba2662b9f3 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -70,6 +70,7 @@ config CRIS
 	select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
 	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
 	select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
+	select HAVE_NMI
 
 config HZ
 	int
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 5663f411c225..8040fb1845b4 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -48,6 +48,7 @@ config MIPS
 	select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC
 	select GENERIC_CMOS_UPDATE
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL if MODULES
 	select MODULES_USE_ELF_RELA if MODULES && 64BIT
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f0403b58ae8b..01f7464d9fea 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,6 +155,7 @@ config PPC
 	select NO_BOOTMEM
 	select HAVE_GENERIC_RCU_GUP
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select HAVE_NMI if PERF_EVENTS
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e2c9aaaf64b2..1c3c43d9d1b5 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -166,6 +166,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
+	select HAVE_NMI
 
 
 config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index cb93af8f8017..f6254341c065 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_NMI
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 27b3a0ad40a0..1012f7ffcdf5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -79,6 +79,7 @@ config SPARC64
 	select NO_BOOTMEM
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select HAVE_NMI
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 174746225577..76989b878f3c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -30,6 +30,7 @@ config TILE
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_CONTEXT_TRACKING
+	select HAVE_NMI if USE_PMC
 	select EDAC_SUPPORT
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ff5b3be95d4..0a7b885964ba 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -131,6 +131,7 @@ config X86
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
+	select HAVE_NMI
 	select HAVE_OPROFILE
 	select HAVE_OPTPROBES
 	select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 045e424fb368..7788ce643bf4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/seq_buf.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..c683996110b1 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -61,6 +61,7 @@ extern void irq_exit(void);
 
 #define nmi_enter()						\
 	do {							\
+		printk_nmi_enter();				\
 		lockdep_off();					\
 		ftrace_nmi_enter();				\
 		BUG_ON(in_nmi());				\
@@ -77,6 +78,7 @@ extern void irq_exit(void);
 		preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
 		ftrace_nmi_exit();				\
 		lockdep_on();					\
+		printk_nmi_exit();				\
 	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4bc6dafb703e..56939d3f6e53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -129,7 +129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9ccbdf2c1453..51dd6b824fe2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_nmi_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_nmi_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index 79a91a2c0444..bccc1d607be5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1454,6 +1454,11 @@ config PRINTK
 	  very difficult to diagnose system problems, saying N here is
 	  strongly discouraged.
 
+config PRINTK_NMI
+	def_bool y
+	depends on PRINTK
+	depends on HAVE_NMI
+
 config BUG
 	bool "BUG() support" if EXPERT
 	default y
diff --git a/init/main.c b/init/main.c
index 2075fafaad59..fa9b2bdde183 100644
--- a/init/main.c
+++ b/init/main.c
@@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	sched_clock_postinit();
+	printk_nmi_init();
 	perf_event_init();
 	profile_init();
 	call_function_init();
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
 obj-y	= printk.o
+obj-$(CONFIG_PRINTK_NMI)		+= nmi.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..2de99faedfc1
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,44 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it temporary stores the strings into a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return vprintk_default(fmt, args);
+}
+
+#endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..303cf0d15e57
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,219 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporary stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation allows to flush the strings also from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+
+#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+	atomic_t		len;	/* length of written data */
+	struct irq_work		work;	/* IRQ work that flushes the buffer */
+	unsigned char		buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = 0;
+	size_t len;
+
+again:
+	len = atomic_read(&s->len);
+
+	if (len >= sizeof(s->buffer))
+		return 0;
+
+	/*
+	 * Make sure that all old data have been read before the buffer was
+	 * reseted. This is not needed when we just append data.
+	 */
+	if (!len)
+		smp_rmb();
+
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+	/*
+	 * Do it once again if the buffer has been flushed in the meantime.
+	 * Note that atomic_cmpxchg() is an implicit memory barrier that
+	 * makes sure that the data were written before updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, len + add) != len)
+		goto again;
+
+	/* Get flushed in a more safe context. */
+	if (add && printk_nmi_irq_ready) {
+		/* Make sure that IRQ work is really initialized. */
+		smp_rmb();
+		irq_work_queue(&s->work);
+	}
+
+	return add;
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+	static raw_spinlock_t read_lock =
+		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
+	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+	unsigned long flags;
+	size_t len, size;
+	int i, last_i;
+
+	/*
+	 * The lock has two functions. First, one reader has to flush all
+	 * available message to make the lockless synchronization with
+	 * writers easier. Second, we do not want to mix messages from
+	 * different CPUs. This is especially important when printing
+	 * a backtrace.
+	 */
+	raw_spin_lock_irqsave(&read_lock, flags);
+
+	i = 0;
+more:
+	len = atomic_read(&s->len);
+
+	/*
+	 * This is just a paranoid check that nobody has manipulated
+	 * the buffer an unexpected way. If we printed something then
+	 * @len must only increase.
+	 */
+	if (i && i >= len)
+		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
+		       i, len);
+
+	if (!len)
+		goto out; /* Someone else has already flushed the buffer. */
+
+	/* Make sure that data has been written up to the @len */
+	smp_rmb();
+
+	size = min(len, sizeof(s->buffer));
+	last_i = i;
+
+	/* Print line by line. */
+	for (; i < size; i++) {
+		if (s->buffer[i] == '\n') {
+			print_nmi_seq_line(s, last_i, i);
+			last_i = i + 1;
+		}
+	}
+	/* Check if there was a partial line. */
+	if (last_i < size) {
+		print_nmi_seq_line(s, last_i, size - 1);
+		pr_cont("\n");
+	}
+
+	/*
+	 * Check that nothing has got added in the meantime and truncate
+	 * the buffer. Note that atomic_cmpxchg() is an implicit memory
+	 * barrier that makes sure that the data were copied before
+	 * updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, 0) != len)
+		goto more;
+
+out:
+	raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void __init printk_nmi_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_nmi_flush);
+	}
+
+	/* Make sure that IRQ works are initialized before enabling. */
+	smp_wmb();
+	printk_nmi_irq_ready = 1;
+
+	/* Flush pending messages that did not have scheduled IRQ works. */
+	printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bfbf284e4218..71eba0607034 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
 
 #include "console_cmdline.h"
 #include "braille.h"
+#include "internal.h"
 
 int console_printk[4] = {
 	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */
@@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
  */
 asmlinkage __visible int printk(const char *fmt, ...)
 {
-	printk_func_t vprintk_func;
 	va_list args;
 	int r;
 
 	va_start(args, fmt);
-
-	/*
-	 * If a caller overrides the per_cpu printk_func, then it needs
-	 * to disable preemption when calling printk(). Otherwise
-	 * the printk_func should be set to the default. No need to
-	 * disable preemption here.
-	 */
-	vprintk_func = this_cpu_read(printk_func);
 	r = vprintk_func(fmt, args);
-
 	va_end(args);
 
 	return r;
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c669e..26caf51cc238 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
-#include <linux/seq_buf.h>
 
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
 
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
-}
-
 /*
  * When raise() is called it will be is passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
 void nmi_trigger_all_cpu_backtrace(bool include_self,
 				   void (*raise)(cpumask_t *mask))
 {
-	struct nmi_seq_buf *s;
-	int i, cpu, this_cpu = get_cpu();
+	int i, this_cpu = get_cpu();
 
 	if (test_and_set_bit(0, &backtrace_flag)) {
 		/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	if (!include_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
 		pr_info("Sending NMI to %s CPUs:\n",
 			(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	}
 
 	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
+	 * Force flush any remote buffers that might be stuck in IRQ context
+	 * and therefore could not run their irq_work.
 	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int len, last_i = 0;
+	printk_nmi_flush();
 
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
-
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
+	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
 }
 
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
-}
-
 bool nmi_cpu_backtrace(struct pt_regs *regs)
 {
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
 		pr_warn("NMI backtrace for cpu %d\n", cpu);
 		if (regs)
 			show_regs(regs);
 		else
 			dump_stack();
-		this_cpu_write(printk_func, printk_func_save);
-
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
-- 
cgit v1.2.3


From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:42 -0700
Subject: printk/nmi: flush NMI messages on the system panic

In NMI context, printk() messages are stored into per-CPU buffers to
avoid a possible deadlock.  They are normally flushed to the main ring
buffer via an IRQ work.  But the work is never called when the system
calls panic() in the very same NMI handler.

This patch tries to flush NMI buffers before the crash dump is
generated.  In this case it does not risk a double release and bails out
when the logbuf_lock is already taken.  The aim is to get the messages
into the main ring buffer when possible.  It makes them better
accessible in the vmcore.

Then the patch tries to flush the buffers second time when other CPUs
are down.  It might be more aggressive and reset logbuf_lock.  The aim
is to get the messages available for the consequent kmsg_dump() and
console_flush_on_panic() calls.

The patch causes vprintk_emit() to be called even in NMI context again.
But it is done via printk_deferred() so that the console handling is
skipped.  Consoles use internal locks and we could not prevent a
deadlock easily.  They are explicitly called later when the crash dump
is not generated, see console_flush_on_panic().

Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   |  2 ++
 kernel/kexec_core.c      |  1 +
 kernel/panic.c           |  6 +++++-
 kernel/printk/internal.h |  2 ++
 kernel/printk/nmi.c      | 39 ++++++++++++++++++++++++++++++++++++++-
 kernel/printk/printk.c   |  2 +-
 6 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 51dd6b824fe2..f4da695fd615 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -127,11 +127,13 @@ extern void printk_nmi_init(void);
 extern void printk_nmi_enter(void);
 extern void printk_nmi_exit(void);
 extern void printk_nmi_flush(void);
+extern void printk_nmi_flush_on_panic(void);
 #else
 static inline void printk_nmi_init(void) { }
 static inline void printk_nmi_enter(void) { }
 static inline void printk_nmi_exit(void) { }
 static inline void printk_nmi_flush(void) { }
+static inline void printk_nmi_flush_on_panic(void) { }
 #endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1c03dfb4abfd..d5d408252992 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs)
 	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
 	if (old_cpu == PANIC_CPU_INVALID) {
 		/* This is the 1st CPU which comes here, so go ahead. */
+		printk_nmi_flush_on_panic();
 		__crash_kexec(regs);
 
 		/*
diff --git a/kernel/panic.c b/kernel/panic.c
index 535c96510a44..8aa74497cc5a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -160,8 +160,10 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (!crash_kexec_post_notifiers)
+	if (!crash_kexec_post_notifiers) {
+		printk_nmi_flush_on_panic();
 		__crash_kexec(NULL);
+	}
 
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
@@ -176,6 +178,8 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
+	/* Call flush even twice. It tries harder with a single online CPU */
+	printk_nmi_flush_on_panic();
 	kmsg_dump(KMSG_DUMP_PANIC);
 
 	/*
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 341bedccc065..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
+extern raw_spinlock_t logbuf_lock;
+
 /*
  * printk() could not take logbuf_lock in NMI context. Instead,
  * it temporary stores the strings into a per-CPU buffer.
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bf08557d7e3d..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -17,6 +17,7 @@
 
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/debug_locks.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/irq_work.h>
@@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
 {
 	const char *buf = s->buffer + start;
 
-	printk("%.*s", (end - start) + 1, buf);
+	/*
+	 * The buffers are flushed in NMI only on panic.  The messages must
+	 * go only into the ring buffer at this stage.  Consoles will get
+	 * explicitly called later when a crashdump is not generated.
+	 */
+	if (in_nmi())
+		printk_deferred("%.*s", (end - start) + 1, buf);
+	else
+		printk("%.*s", (end - start) + 1, buf);
+
 }
 
 /*
@@ -194,6 +204,33 @@ void printk_nmi_flush(void)
 		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
 }
 
+/**
+ * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ *	goes down.
+ *
+ * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * the system goes down. It does the best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_nmi_flush_on_panic(void)
+{
+	/*
+	 * Make sure that we could access the main ring buffer.
+	 * Do not risk a double release when more CPUs are up.
+	 */
+	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+		if (num_online_cpus() > 1)
+			return;
+
+		debug_locks_off();
+		raw_spin_lock_init(&logbuf_lock);
+	}
+
+	printk_nmi_flush();
+}
+
 void __init printk_nmi_init(void)
 {
 	int cpu;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e38579d730f4..60cdf6386763 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -245,7 +245,7 @@ __packed __aligned(4)
  * within the scheduler's rq lock. It must be released before calling
  * console_unlock() or anything else that might wake up a process.
  */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
-- 
cgit v1.2.3


From 8da4b8c48e7b43cb16d05e1dbb34ad9f73ab7efd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:00 -0700
Subject: lib/uuid.c: move generate_random_uuid() to uuid.c

Let's gather the UUID related functions under one hood.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/random.c  | 21 +--------------------
 fs/btrfs/volumes.c     |  2 +-
 fs/ext4/ioctl.c        |  2 +-
 fs/f2fs/file.c         |  2 +-
 fs/reiserfs/objectid.c |  2 +-
 fs/ubifs/sb.c          |  2 +-
 include/linux/random.h |  1 -
 include/linux/uuid.h   |  2 ++
 lib/uuid.c             | 20 ++++++++++++++++++++
 9 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index b583e5336630..0158d3bff7e5 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -260,6 +260,7 @@
 #include <linux/irq.h>
 #include <linux/syscalls.h>
 #include <linux/completion.h>
+#include <linux/uuid.h>
 
 #include <asm/processor.h>
 #include <asm/uaccess.h>
@@ -1621,26 +1622,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	return urandom_read(NULL, buf, count, NULL);
 }
 
-/***************************************************************
- * Random UUID interface
- *
- * Used here for a Boot ID, but can be useful for other kernel
- * drivers.
- ***************************************************************/
-
-/*
- * Generate random UUID
- */
-void generate_random_uuid(unsigned char uuid_out[16])
-{
-	get_random_bytes(uuid_out, 16);
-	/* Set UUID version to 4 --- truly random generation */
-	uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40;
-	/* Set the UUID variant to DCE */
-	uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80;
-}
-EXPORT_SYMBOL(generate_random_uuid);
-
 /********************************************************************
  *
  * Sysctl interface
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bd0f45fb38c4..bfb80da3e6eb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,13 +20,13 @@
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
-#include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
+#include <linux/uuid.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eae5917c534e..7497f50cb293 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -13,8 +13,8 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/file.h>
-#include <linux/random.h>
 #include <linux/quotaops.h>
+#include <linux/uuid.h>
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index eb9d027e5981..c6b14951bef3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,7 +20,7 @@
 #include <linux/uaccess.h>
 #include <linux/mount.h>
 #include <linux/pagevec.h>
-#include <linux/random.h>
+#include <linux/uuid.h>
 
 #include "f2fs.h"
 #include "node.h"
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 99a5d5dae46a..415d66ca87d1 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -3,8 +3,8 @@
  */
 
 #include <linux/string.h>
-#include <linux/random.h>
 #include <linux/time.h>
+#include <linux/uuid.h>
 #include "reiserfs.h"
 
 /* find where objectid map starts */
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index f4fbc7b6b794..3cbb904a6d7d 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,8 +28,8 @@
 
 #include "ubifs.h"
 #include <linux/slab.h>
-#include <linux/random.h>
 #include <linux/math64.h>
+#include <linux/uuid.h>
 
 /*
  * Default journal size in logical eraseblocks as a percent of total
diff --git a/include/linux/random.h b/include/linux/random.h
index 9c29122037f9..e47e533742b5 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -26,7 +26,6 @@ extern void get_random_bytes(void *buf, int nbytes);
 extern int add_random_ready_callback(struct random_ready_callback *rdy);
 extern void del_random_ready_callback(struct random_ready_callback *rdy);
 extern void get_random_bytes_arch(void *buf, int nbytes);
-void generate_random_uuid(unsigned char uuid_out[16]);
 extern int random_int_secret_init(void);
 
 #ifndef MODULE
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 6df2509033d7..91c2b6d9cbb7 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -33,6 +33,8 @@ static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2)
 	return memcmp(&u1, &u2, sizeof(uuid_be));
 }
 
+void generate_random_uuid(unsigned char uuid[16]);
+
 extern void uuid_le_gen(uuid_le *u);
 extern void uuid_be_gen(uuid_be *u);
 
diff --git a/lib/uuid.c b/lib/uuid.c
index 398821e4dce1..6c81c0b0467e 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -23,6 +23,26 @@
 #include <linux/uuid.h>
 #include <linux/random.h>
 
+/***************************************************************
+ * Random UUID interface
+ *
+ * Used here for a Boot ID, but can be useful for other kernel
+ * drivers.
+ ***************************************************************/
+
+/*
+ * Generate random UUID
+ */
+void generate_random_uuid(unsigned char uuid[16])
+{
+	get_random_bytes(uuid, 16);
+	/* Set UUID version to 4 --- truly random generation */
+	uuid[6] = (uuid[6] & 0x0F) | 0x40;
+	/* Set the UUID variant to DCE */
+	uuid[8] = (uuid[8] & 0x3F) | 0x80;
+}
+EXPORT_SYMBOL(generate_random_uuid);
+
 static void __uuid_gen_common(__u8 b[16])
 {
 	prandom_bytes(b, 16);
-- 
cgit v1.2.3


From 2b1b0d66704a8cafe83be7114ec4c15ab3a314ad Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:04 -0700
Subject: lib/uuid.c: introduce a few more generic helpers

There are new helpers in this patch:

  uuid_is_valid		checks if a UUID is valid
  uuid_be_to_bin	converts from string to binary (big endian)
  uuid_le_to_bin	converts from string to binary (little endian)

They will be used in future, i.e. in the following patches in the series.

This also moves the indices arrays to lib/uuid.c to be shared accross
modules.

[andriy.shevchenko@linux.intel.com: fix typo]
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uuid.h | 13 +++++++++++
 lib/uuid.c           | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/vsprintf.c       |  9 ++++----
 3 files changed, 82 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 91c2b6d9cbb7..e0b95e728a77 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -22,6 +22,11 @@
 
 #include <uapi/linux/uuid.h>
 
+/*
+ * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
+ * not including trailing NUL.
+ */
+#define	UUID_STRING_LEN		36
 
 static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2)
 {
@@ -38,4 +43,12 @@ void generate_random_uuid(unsigned char uuid[16]);
 extern void uuid_le_gen(uuid_le *u);
 extern void uuid_be_gen(uuid_be *u);
 
+bool __must_check uuid_is_valid(const char *uuid);
+
+extern const u8 uuid_le_index[16];
+extern const u8 uuid_be_index[16];
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u);
+int uuid_be_to_bin(const char *uuid, uuid_be *u);
+
 #endif
diff --git a/lib/uuid.c b/lib/uuid.c
index 6c81c0b0467e..82787f652fbc 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -19,10 +19,17 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/uuid.h>
 #include <linux/random.h>
 
+const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_le_index);
+const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_be_index);
+
 /***************************************************************
  * Random UUID interface
  *
@@ -65,3 +72,61 @@ void uuid_be_gen(uuid_be *bu)
 	bu->b[6] = (bu->b[6] & 0x0F) | 0x40;
 }
 EXPORT_SYMBOL_GPL(uuid_be_gen);
+
+/**
+  * uuid_is_valid - checks if UUID string valid
+  * @uuid:	UUID string to check
+  *
+  * Description:
+  * It checks if the UUID string is following the format:
+  *	xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  * where x is a hex digit.
+  *
+  * Return: true if input is valid UUID string.
+  */
+bool uuid_is_valid(const char *uuid)
+{
+	unsigned int i;
+
+	for (i = 0; i < UUID_STRING_LEN; i++) {
+		if (i == 8 || i == 13 || i == 18 || i == 23) {
+			if (uuid[i] != '-')
+				return false;
+		} else if (!isxdigit(uuid[i])) {
+			return false;
+		}
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(uuid_is_valid);
+
+static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16])
+{
+	static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
+	unsigned int i;
+
+	if (!uuid_is_valid(uuid))
+		return -EINVAL;
+
+	for (i = 0; i < 16; i++) {
+		int hi = hex_to_bin(uuid[si[i]] + 0);
+		int lo = hex_to_bin(uuid[si[i]] + 1);
+
+		b[ei[i]] = (hi << 4) | lo;
+	}
+
+	return 0;
+}
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u)
+{
+	return __uuid_to_bin(uuid, u->b, uuid_le_index);
+}
+EXPORT_SYMBOL(uuid_le_to_bin);
+
+int uuid_be_to_bin(const char *uuid, uuid_be *u)
+{
+	return __uuid_to_bin(uuid, u->b, uuid_be_index);
+}
+EXPORT_SYMBOL(uuid_be_to_bin);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index be0e7cf11e48..0967771d8f7f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -30,6 +30,7 @@
 #include <linux/ioport.h>
 #include <linux/dcache.h>
 #include <linux/cred.h>
+#include <linux/uuid.h>
 #include <net/addrconf.h>
 #ifdef CONFIG_BLOCK
 #include <linux/blkdev.h>
@@ -1304,19 +1305,17 @@ static noinline_for_stack
 char *uuid_string(char *buf, char *end, const u8 *addr,
 		  struct printf_spec spec, const char *fmt)
 {
-	char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
+	char uuid[UUID_STRING_LEN + 1];
 	char *p = uuid;
 	int i;
-	static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-	static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
-	const u8 *index = be;
+	const u8 *index = uuid_be_index;
 	bool uc = false;
 
 	switch (*(++fmt)) {
 	case 'L':
 		uc = true;		/* fall-through */
 	case 'l':
-		index = le;
+		index = uuid_le_index;
 		break;
 	case 'B':
 		uc = true;
-- 
cgit v1.2.3


From e3a93bce69ad3e2c38927abe311b8cb4f17abbaf Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:07 -0700
Subject: lib/uuid.c: remove FSF address

There is no point in keeping an address in the file since it's subject
to change.

While here, update Intel Copyright years.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uuid.h      | 6 +-----
 include/uapi/linux/uuid.h | 4 ----
 lib/uuid.c                | 6 +-----
 3 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index e0b95e728a77..2d095fc60204 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -1,7 +1,7 @@
 /*
  * UUID/GUID definition
  *
- * Copyright (C) 2010, Intel Corp.
+ * Copyright (C) 2010, 2016 Intel Corp.
  *	Huang Ying <ying.huang@intel.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -12,10 +12,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #ifndef _LINUX_UUID_H_
 #define _LINUX_UUID_H_
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
index 786f0773cc33..3738e5fb6a4d 100644
--- a/include/uapi/linux/uuid.h
+++ b/include/uapi/linux/uuid.h
@@ -12,10 +12,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _UAPI_LINUX_UUID_H_
diff --git a/lib/uuid.c b/lib/uuid.c
index 82787f652fbc..e116ae5fa00f 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -1,7 +1,7 @@
 /*
  * Unified UUID/GUID definition
  *
- * Copyright (C) 2009, Intel Corp.
+ * Copyright (C) 2009, 2016 Intel Corp.
  *	Huang Ying <ying.huang@intel.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -12,10 +12,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/kernel.h>
-- 
cgit v1.2.3


From ba7e34b1bbd2722685bbc75d168672d5154d8614 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:18 -0700
Subject: include/linux/efi.h: redefine type, constant, macro from generic code

Generic UUID library defines structure type, macro to define UUID, and
the length of the UUID string.  This patch removes duplicate data
structure definition, UUID string length constant as well as macro for
UUID handling.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/efi.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index df7acb51f3cc..c2db3ca22217 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -21,6 +21,7 @@
 #include <linux/pfn.h>
 #include <linux/pstore.h>
 #include <linux/reboot.h>
+#include <linux/uuid.h>
 #include <linux/screen_info.h>
 
 #include <asm/page.h>
@@ -44,17 +45,10 @@ typedef u16 efi_char16_t;		/* UNICODE character */
 typedef u64 efi_physical_addr_t;
 typedef void *efi_handle_t;
 
-
-typedef struct {
-	u8 b[16];
-} efi_guid_t;
+typedef uuid_le efi_guid_t;
 
 #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \
-((efi_guid_t) \
-{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
-  (b) & 0xff, ((b) >> 8) & 0xff, \
-  (c) & 0xff, ((c) >> 8) & 0xff, \
-  (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+	UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
 
 /*
  * Generic EFI table header
@@ -1117,7 +1111,7 @@ extern int efi_status_to_err(efi_status_t status);
  * Length of a GUID string (strlen("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"))
  * not including trailing NUL
  */
-#define EFI_VARIABLE_GUID_LEN 36
+#define EFI_VARIABLE_GUID_LEN	UUID_STRING_LEN
 
 /*
  * The type of search to perform when calling boottime->locate_handle
-- 
cgit v1.2.3


From 63579785752ba7d0e842078ec6b2875367046f06 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:24 -0700
Subject: include/linux/genhd.h: move to use generic UUID library

UUID library provides uuid_be type and uuid_be_to_bin() function.  This
substitutes open coded variant by generic library calls.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/genhd.h | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5c706765404a..359a8e4bd44d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -14,6 +14,7 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
+#include <linux/uuid.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -93,7 +94,7 @@ struct disk_stats {
  * Enough for the string representation of any kind of UUID plus NULL.
  * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
  */
-#define PARTITION_META_INFO_UUIDLTH	37
+#define PARTITION_META_INFO_UUIDLTH	(UUID_STRING_LEN + 1)
 
 struct partition_meta_info {
 	char uuid[PARTITION_META_INFO_UUIDLTH];
@@ -228,27 +229,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 	return NULL;
 }
 
-static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
-{
-	int i;
-	for (i = 0; i < 16; ++i) {
-		*to++ = (hex_to_bin(*uuid_str) << 4) |
-			(hex_to_bin(*(uuid_str + 1)));
-		uuid_str += 2;
-		switch (i) {
-		case 3:
-		case 5:
-		case 7:
-		case 9:
-			uuid_str++;
-			continue;
-		}
-	}
-}
-
 static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to)
 {
-	part_pack_uuid(uuid_str, to);
+	uuid_be_to_bin(uuid_str, (uuid_be *)to);
 	return 0;
 }
 
-- 
cgit v1.2.3


From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:01:33 -0700
Subject: radix-tree: introduce radix_tree_empty

Commit e61452365372 ("radix_tree: add support for multi-order entries")
left the impression that the support for multiorder radix tree entries
was functional.  As soon as Ross tried to use it, it became apparent
that my testing was completely inadequate, and it didn't even work a
little bit for orders that were not a multiple of shift.

This series of patches is the result of about 6 weeks of redesign,
reimplementation, testing, arguing and hair-pulling.  The great news is
that the test-suite is now far better than it was.  That's reflected in
the diffstat for the test-suite alone:

 12 files changed, 436 insertions(+), 28 deletions(-)

The highlight for users of the tree is that the restriction on the order
of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix
tree now supports any order between 0 and 64.

For those who are interested in how the tree works, patch 9 is probably
the most interesting one as it introduces the new machinery for handling
sibling entries.

I've tried to be fair in attributing authorship to the person who
contributed the majority of the code in each patch; Ross has been an
invaluable partner in the development of this support and it's fair to
say that each of us has code in every commit.

I should also express my appreciation of the 0day testing.  It prompted
me that I was bloating the tinyconfig in an unacceptable way, and it
bisected to a commit which contained a rather nasty memory-corruption
bug.

This patch (of 29):

The irqdomain code was checking for 0 or 1 entries, not 0 entries like
the comment said they were.  Introduce a new helper that will actually
check for an empty tree.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 5 +++++
 kernel/irq/irqdomain.c     | 7 +------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 51a97ac8bfbf..83f708e5db59 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -136,6 +136,11 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
+static inline bool radix_tree_empty(struct radix_tree_root *root)
+{
+	return root->rnode == NULL;
+}
+
 /**
  * Radix-tree synchronization
  *
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d65f6f31a5b3..8798b6c9e945 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain)
 {
 	mutex_lock(&irq_domain_mutex);
 
-	/*
-	 * radix_tree_delete() takes care of destroying the root
-	 * node when all entries are removed. Shout if there are
-	 * any mappings left.
-	 */
-	WARN_ON(domain->revmap_tree.height);
+	WARN_ON(!radix_tree_empty(&domain->revmap_tree));
 
 	list_del(&domain->link);
 
-- 
cgit v1.2.3


From 97d778b2de9213c7a7483dad0f533c1af9f0810f Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 20 May 2016 17:01:42 -0700
Subject: radix tree test suite: allow testing other fan-out values

The defines in regression2.c are already in radix-tree.h and duplicating
them in the test case makes experimenting with other values for the
fan-out harder than necessary.  Allow the user of the radix tree to
decide what the fan-out should be rather than fixing it to 8 for
non-kernel uses.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h              | 4 +---
 tools/testing/radix-tree/linux/kernel.h | 2 ++
 tools/testing/radix-tree/regression2.c  | 7 -------
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 83f708e5db59..5ce5a1e0ecc5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -70,10 +70,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 
 #define RADIX_TREE_MAX_TAGS 3
 
-#ifdef __KERNEL__
+#ifndef RADIX_TREE_MAP_SHIFT
 #define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
 #endif
 
 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 76a88f35fdc4..31fe2c77d7ae 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -12,6 +12,8 @@
 #define CONFIG_SHMEM
 #define CONFIG_SWAP
 
+#define RADIX_TREE_MAP_SHIFT	3
+
 #ifndef NULL
 #define NULL	0
 #endif
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 5d2fa28cdca3..63bf347aaf33 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -51,13 +51,6 @@
 
 #include "regression.h"
 
-#ifdef __KERNEL__
-#define RADIX_TREE_MAP_SHIFT    (CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT    3       /* For more stressful testing */
-#endif
-
-#define RADIX_TREE_MAP_SIZE     (1UL << RADIX_TREE_MAP_SHIFT)
 #define PAGECACHE_TAG_DIRTY     0
 #define PAGECACHE_TAG_WRITEBACK 1
 #define PAGECACHE_TAG_TOWRITE   2
-- 
cgit v1.2.3


From 6c4bd68a2962c03423a226d949caf64216d013cc Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 20 May 2016 17:01:51 -0700
Subject: radix-tree: remove unused looping macros

radix_tree_for_each_chunk() and radix_tree_for_each_chunk_slot() have
never been used in the kernel since their introduction in 2012, so
remove them.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 5ce5a1e0ecc5..e1512a607709 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -478,34 +478,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 	return NULL;
 }
 
-/**
- * radix_tree_for_each_chunk - iterate over chunks
- *
- * @slot:	the void** variable for pointer to chunk first slot
- * @root:	the struct radix_tree_root pointer
- * @iter:	the struct radix_tree_iter pointer
- * @start:	iteration starting index
- * @flags:	RADIX_TREE_ITER_* and tag index
- *
- * Locks can be released and reacquired between iterations.
- */
-#define radix_tree_for_each_chunk(slot, root, iter, start, flags)	\
-	for (slot = radix_tree_iter_init(iter, start) ;			\
-	      (slot = radix_tree_next_chunk(root, iter, flags)) ;)
-
-/**
- * radix_tree_for_each_chunk_slot - iterate over slots in one chunk
- *
- * @slot:	the void** variable, at the beginning points to chunk first slot
- * @iter:	the struct radix_tree_iter pointer
- * @flags:	RADIX_TREE_ITER_*, should be constant
- *
- * This macro is designed to be nested inside radix_tree_for_each_chunk().
- * @slot points to the radix tree slot, @iter->index contains its index.
- */
-#define radix_tree_for_each_chunk_slot(slot, iter, flags)		\
-	for (; slot ; slot = radix_tree_next_slot(slot, iter, flags))
-
 /**
  * radix_tree_for_each_slot - iterate over non-empty slots
  *
-- 
cgit v1.2.3


From 21ef533931f73a8e963a6107aa5ec51b192f28be Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 20 May 2016 17:02:26 -0700
Subject: radix-tree: add support for multi-order iterating

This enables the macros radix_tree_for_each_slot() and friends to be
used with multi-order entries.

The way that this works is that we treat all entries in a given slots[]
array as a single chunk.  If the index given to radix_tree_next_chunk()
happens to point us to a sibling entry, we will back up iter->index so
that it points to the canonical entry, and that will be the place where
we start our iteration.

As we're processing a chunk in radix_tree_next_slot(), we process
canonical entries, skip over sibling entries, and restart the chunk
lookup if we find a non-sibling indirect pointer.  This drops back to
the radix_tree_next_chunk() code, which will re-walk the tree and look
for another chunk.

This allows us to properly handle multi-order entries mixed with other
entries that are at various heights in the radix tree.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h                    | 69 +++++++++++++++++++++++----
 lib/radix-tree.c                              | 66 ++++++++++++++-----------
 tools/testing/radix-tree/generated/autoconf.h |  3 ++
 tools/testing/radix-tree/linux/kernel.h       |  5 +-
 4 files changed, 102 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/radix-tree/generated/autoconf.h

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index e1512a607709..8558d52e1c7b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -330,8 +330,9 @@ static inline void radix_tree_preload_end(void)
  * struct radix_tree_iter - radix tree iterator state
  *
  * @index:	index of current slot
- * @next_index:	next-to-last index for this chunk
+ * @next_index:	one beyond the last index for this chunk
  * @tags:	bit-mask for tag-iterating
+ * @shift:	shift for the node that holds our slots
  *
  * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
  * subinterval of slots contained within one radix tree leaf node.  It is
@@ -344,8 +345,20 @@ struct radix_tree_iter {
 	unsigned long	index;
 	unsigned long	next_index;
 	unsigned long	tags;
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	unsigned int	shift;
+#endif
 };
 
+static inline unsigned int iter_shift(struct radix_tree_iter *iter)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	return iter->shift;
+#else
+	return 0;
+#endif
+}
+
 #define RADIX_TREE_ITER_TAG_MASK	0x00FF	/* tag index in lower byte */
 #define RADIX_TREE_ITER_TAGGED		0x0100	/* lookup tagged slots */
 #define RADIX_TREE_ITER_CONTIG		0x0200	/* stop at first hole */
@@ -405,6 +418,12 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
 	return NULL;
 }
 
+static inline unsigned long
+__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
+{
+	return iter->index + (slots << iter_shift(iter));
+}
+
 /**
  * radix_tree_iter_next - resume iterating when the chunk may be invalid
  * @iter:	iterator state
@@ -416,7 +435,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
 static inline __must_check
 void **radix_tree_iter_next(struct radix_tree_iter *iter)
 {
-	iter->next_index = iter->index + 1;
+	iter->next_index = __radix_tree_iter_add(iter, 1);
 	iter->tags = 0;
 	return NULL;
 }
@@ -430,7 +449,12 @@ void **radix_tree_iter_next(struct radix_tree_iter *iter)
 static __always_inline long
 radix_tree_chunk_size(struct radix_tree_iter *iter)
 {
-	return iter->next_index - iter->index;
+	return (iter->next_index - iter->index) >> iter_shift(iter);
+}
+
+static inline void *indirect_to_ptr(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
 }
 
 /**
@@ -448,24 +472,51 @@ static __always_inline void **
 radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 {
 	if (flags & RADIX_TREE_ITER_TAGGED) {
+		void *canon = slot;
+
 		iter->tags >>= 1;
+		if (unlikely(!iter->tags))
+			return NULL;
+		while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
+					radix_tree_is_indirect_ptr(slot[1])) {
+			if (indirect_to_ptr(slot[1]) == canon) {
+				iter->tags >>= 1;
+				iter->index = __radix_tree_iter_add(iter, 1);
+				slot++;
+				continue;
+			}
+			iter->next_index = __radix_tree_iter_add(iter, 1);
+			return NULL;
+		}
 		if (likely(iter->tags & 1ul)) {
-			iter->index++;
+			iter->index = __radix_tree_iter_add(iter, 1);
 			return slot + 1;
 		}
-		if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) {
+		if (!(flags & RADIX_TREE_ITER_CONTIG)) {
 			unsigned offset = __ffs(iter->tags);
 
 			iter->tags >>= offset;
-			iter->index += offset + 1;
+			iter->index = __radix_tree_iter_add(iter, offset + 1);
 			return slot + offset + 1;
 		}
 	} else {
-		long size = radix_tree_chunk_size(iter);
+		long count = radix_tree_chunk_size(iter);
+		void *canon = slot;
 
-		while (--size > 0) {
+		while (--count > 0) {
 			slot++;
-			iter->index++;
+			iter->index = __radix_tree_iter_add(iter, 1);
+
+			if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
+			    radix_tree_is_indirect_ptr(*slot)) {
+				if (indirect_to_ptr(*slot) == canon)
+					continue;
+				else {
+					iter->next_index = iter->index;
+					break;
+				}
+			}
+
 			if (likely(*slot))
 				return slot;
 			if (flags & RADIX_TREE_ITER_CONTIG) {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index ff460423ff4b..a4da86e40def 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -75,11 +75,6 @@ static inline void *ptr_to_indirect(void *ptr)
 	return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
 }
 
-static inline void *indirect_to_ptr(void *ptr)
-{
-	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
-}
-
 #define RADIX_TREE_RETRY	ptr_to_indirect(NULL)
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
@@ -885,6 +880,14 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_get);
 
+static inline void __set_iter_shift(struct radix_tree_iter *iter,
+					unsigned int shift)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	iter->shift = shift;
+#endif
+}
+
 /**
  * radix_tree_next_chunk - find next chunk of slots for iteration
  *
@@ -898,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 {
 	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
 	struct radix_tree_node *rnode, *node;
-	unsigned long index, offset, height;
+	unsigned long index, offset, maxindex;
 
 	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
 		return NULL;
@@ -916,33 +919,39 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 	if (!index && iter->index)
 		return NULL;
 
-	rnode = rcu_dereference_raw(root->rnode);
+ restart:
+	shift = radix_tree_load_root(root, &rnode, &maxindex);
+	if (index > maxindex)
+		return NULL;
+
 	if (radix_tree_is_indirect_ptr(rnode)) {
 		rnode = indirect_to_ptr(rnode);
-	} else if (rnode && !index) {
+	} else if (rnode) {
 		/* Single-slot tree */
-		iter->index = 0;
-		iter->next_index = 1;
+		iter->index = index;
+		iter->next_index = maxindex + 1;
 		iter->tags = 1;
+		__set_iter_shift(iter, shift);
 		return (void **)&root->rnode;
 	} else
 		return NULL;
 
-restart:
-	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	shift -= RADIX_TREE_MAP_SHIFT;
 	offset = index >> shift;
 
-	/* Index outside of the tree */
-	if (offset >= RADIX_TREE_MAP_SIZE)
-		return NULL;
-
 	node = rnode;
 	while (1) {
 		struct radix_tree_node *slot;
+		unsigned new_off = radix_tree_descend(node, &slot, offset);
+
+		if (new_off < offset) {
+			offset = new_off;
+			index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
+			index |= offset << shift;
+		}
+
 		if ((flags & RADIX_TREE_ITER_TAGGED) ?
-				!test_bit(offset, node->tags[tag]) :
-				!node->slots[offset]) {
+				!tag_get(node, tag, offset) : !slot) {
 			/* Hole detected */
 			if (flags & RADIX_TREE_ITER_CONTIG)
 				return NULL;
@@ -954,7 +963,10 @@ restart:
 						offset + 1);
 			else
 				while (++offset	< RADIX_TREE_MAP_SIZE) {
-					if (node->slots[offset])
+					void *slot = node->slots[offset];
+					if (is_sibling_entry(node, slot))
+						continue;
+					if (slot)
 						break;
 				}
 			index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
@@ -964,25 +976,23 @@ restart:
 				return NULL;
 			if (offset == RADIX_TREE_MAP_SIZE)
 				goto restart;
+			slot = rcu_dereference_raw(node->slots[offset]);
 		}
 
-		/* This is leaf-node */
-		if (!shift)
-			break;
-
-		slot = rcu_dereference_raw(node->slots[offset]);
-		if (slot == NULL)
+		if ((slot == NULL) || (slot == RADIX_TREE_RETRY))
 			goto restart;
 		if (!radix_tree_is_indirect_ptr(slot))
 			break;
+
 		node = indirect_to_ptr(slot);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 	}
 
 	/* Update the iterator state */
-	iter->index = index;
-	iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;
+	iter->index = index & ~((1 << shift) - 1);
+	iter->next_index = (index | ((RADIX_TREE_MAP_SIZE << shift) - 1)) + 1;
+	__set_iter_shift(iter, shift);
 
 	/* Construct iter->tags bit-mask from node->tags[tag] array */
 	if (flags & RADIX_TREE_ITER_TAGGED) {
diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h
new file mode 100644
index 000000000000..ad18cf5a2a3a
--- /dev/null
+++ b/tools/testing/radix-tree/generated/autoconf.h
@@ -0,0 +1,3 @@
+#define CONFIG_RADIX_TREE_MULTIORDER 1
+#define CONFIG_SHMEM 1
+#define CONFIG_SWAP 1
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 8ea0ed450810..be98a47b4e1b 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -8,10 +8,7 @@
 #include <limits.h>
 
 #include "../../include/linux/compiler.h"
-
-#define CONFIG_RADIX_TREE_MULTIORDER
-#define CONFIG_SHMEM
-#define CONFIG_SWAP
+#include "../../../include/linux/kconfig.h"
 
 #define RADIX_TREE_MAP_SHIFT	3
 
-- 
cgit v1.2.3


From 0c7fa0a8418cbe0e8963fe36db9575d03b8589f7 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:07 -0700
Subject: radix-tree: split node->path into offset and height

Neither piece of information we're storing in node->path can be larger
than 64, so store each in its own unsigned char instead of shifting and
masking to store them both in an unsigned int.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  7 ++-----
 lib/radix-tree.c           | 38 +++++++++++++++++---------------------
 2 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 8558d52e1c7b..2d2ad9d685a3 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -84,16 +84,13 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
 					  RADIX_TREE_MAP_SHIFT))
 
-/* Height component in node->path */
-#define RADIX_TREE_HEIGHT_SHIFT	(RADIX_TREE_MAX_PATH + 1)
-#define RADIX_TREE_HEIGHT_MASK	((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
-
 /* Internally used bits of node->count */
 #define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
 #define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
 
 struct radix_tree_node {
-	unsigned int	path;	/* Offset in parent & height from the bottom */
+	unsigned char	height;	/* From the bottom */
+	unsigned char	offset;	/* Slot offset in parent */
 	unsigned int	count;
 	union {
 		struct {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 75944e42e4a0..dd04b51e5fbb 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -218,15 +218,15 @@ radix_tree_find_next_bit(const unsigned long *addr,
 }
 
 #ifndef __KERNEL__
-static void dump_node(struct radix_tree_node *node, unsigned offset,
+static void dump_node(struct radix_tree_node *node,
 				unsigned shift, unsigned long index)
 {
 	unsigned long i;
 
-	pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
-		node, offset,
+	pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n",
+		node, node->offset,
 		node->tags[0][0], node->tags[1][0], node->tags[2][0],
-		node->path, node->count, node->parent);
+		node->height, node->count, node->parent);
 
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
 		unsigned long first = index | (i << shift);
@@ -243,7 +243,7 @@ static void dump_node(struct radix_tree_node *node, unsigned offset,
 			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
 					entry, i, first, last);
 		} else {
-			dump_node(indirect_to_ptr(entry), i,
+			dump_node(indirect_to_ptr(entry),
 					shift - RADIX_TREE_MAP_SHIFT, first);
 		}
 	}
@@ -257,7 +257,7 @@ static void radix_tree_dump(struct radix_tree_root *root)
 			root->gfp_mask >> __GFP_BITS_SHIFT);
 	if (!radix_tree_is_indirect_ptr(root->rnode))
 		return;
-	dump_node(indirect_to_ptr(root->rnode), 0,
+	dump_node(indirect_to_ptr(root->rnode),
 				(root->height - 1) * RADIX_TREE_MAP_SHIFT, 0);
 }
 #endif
@@ -421,7 +421,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height)
 
 static inline unsigned long node_maxindex(struct radix_tree_node *node)
 {
-	return radix_tree_maxindex(node->path & RADIX_TREE_HEIGHT_MASK);
+	return radix_tree_maxindex(node->height);
 }
 
 static unsigned radix_tree_load_root(struct radix_tree_root *root,
@@ -434,8 +434,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
 	if (likely(radix_tree_is_indirect_ptr(node))) {
 		node = indirect_to_ptr(node);
 		*maxindex = node_maxindex(node);
-		return (node->path & RADIX_TREE_HEIGHT_MASK) *
-			RADIX_TREE_MAP_SHIFT;
+		return node->height * RADIX_TREE_MAP_SHIFT;
 	}
 
 	*maxindex = 0;
@@ -476,9 +475,10 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		}
 
 		/* Increase the height.  */
-		newheight = root->height+1;
-		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
-		node->path = newheight;
+		newheight = root->height + 1;
+		BUG_ON(newheight > BITS_PER_LONG);
+		node->height = newheight;
+		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
 		slot = root->rnode;
@@ -546,13 +546,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			slot = radix_tree_node_alloc(root);
 			if (!slot)
 				return -ENOMEM;
-			slot->path = height;
+			slot->height = height;
+			slot->offset = offset;
 			slot->parent = node;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset],
 							ptr_to_indirect(slot));
 				node->count++;
-				slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
 			} else
 				rcu_assign_pointer(root->rnode,
 							ptr_to_indirect(slot));
@@ -1319,11 +1319,10 @@ struct locate_info {
 static unsigned long __locate(struct radix_tree_node *slot, void *item,
 			      unsigned long index, struct locate_info *info)
 {
-	unsigned int shift, height;
+	unsigned int shift;
 	unsigned long i;
 
-	height = slot->path & RADIX_TREE_HEIGHT_MASK;
-	shift = height * RADIX_TREE_MAP_SHIFT;
+	shift = slot->height * RADIX_TREE_MAP_SHIFT;
 
 	do {
 		shift -= RADIX_TREE_MAP_SHIFT;
@@ -1508,10 +1507,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 
 		parent = node->parent;
 		if (parent) {
-			unsigned int offset;
-
-			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
-			parent->slots[offset] = NULL;
+			parent->slots[node->offset] = NULL;
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
-- 
cgit v1.2.3


From c12e51b07b3ac4c188fd91a82f96840fdb9cca6f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:10 -0700
Subject: radix-tree: replace node->height with node->shift

node->shift represents the shift necessary for looking in the slots
array at this level.  It is equal to the old (node->height - 1) *
RADIX_TREE_MAP_SHIFT.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  2 +-
 lib/radix-tree.c           | 30 ++++++++++++++++--------------
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 2d2ad9d685a3..037458257e12 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -89,7 +89,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
 
 struct radix_tree_node {
-	unsigned char	height;	/* From the bottom */
+	unsigned char	shift;	/* Bits remaining in each slot */
 	unsigned char	offset;	/* Slot offset in parent */
 	unsigned int	count;
 	union {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index dd04b51e5fbb..648da9080418 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -223,10 +223,10 @@ static void dump_node(struct radix_tree_node *node,
 {
 	unsigned long i;
 
-	pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n",
+	pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
 		node, node->offset,
 		node->tags[0][0], node->tags[1][0], node->tags[2][0],
-		node->height, node->count, node->parent);
+		node->shift, node->count, node->parent);
 
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
 		unsigned long first = index | (i << shift);
@@ -419,9 +419,14 @@ static inline unsigned long radix_tree_maxindex(unsigned int height)
 	return height_to_maxindex[height];
 }
 
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+	return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
 static inline unsigned long node_maxindex(struct radix_tree_node *node)
 {
-	return radix_tree_maxindex(node->height);
+	return shift_maxindex(node->shift);
 }
 
 static unsigned radix_tree_load_root(struct radix_tree_root *root,
@@ -434,7 +439,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
 	if (likely(radix_tree_is_indirect_ptr(node))) {
 		node = indirect_to_ptr(node);
 		*maxindex = node_maxindex(node);
-		return node->height * RADIX_TREE_MAP_SHIFT;
+		return node->shift + RADIX_TREE_MAP_SHIFT;
 	}
 
 	*maxindex = 0;
@@ -475,9 +480,9 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		}
 
 		/* Increase the height.  */
-		newheight = root->height + 1;
+		newheight = root->height;
 		BUG_ON(newheight > BITS_PER_LONG);
-		node->height = newheight;
+		node->shift = newheight * RADIX_TREE_MAP_SHIFT;
 		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
@@ -490,7 +495,7 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		node->slots[0] = slot;
 		node = ptr_to_indirect(node);
 		rcu_assign_pointer(root->rnode, node);
-		root->height = newheight;
+		root->height = ++newheight;
 	} while (height > root->height);
 out:
 	return height * RADIX_TREE_MAP_SHIFT;
@@ -519,7 +524,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 {
 	struct radix_tree_node *node = NULL, *slot;
 	unsigned long maxindex;
-	unsigned int height, shift, offset;
+	unsigned int shift, offset;
 	unsigned long max = index | ((1UL << order) - 1);
 
 	shift = radix_tree_load_root(root, &slot, &maxindex);
@@ -537,16 +542,15 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		}
 	}
 
-	height = root->height;
-
 	offset = 0;			/* uninitialised var warning */
 	while (shift > order) {
+		shift -= RADIX_TREE_MAP_SHIFT;
 		if (slot == NULL) {
 			/* Have to add a child node.  */
 			slot = radix_tree_node_alloc(root);
 			if (!slot)
 				return -ENOMEM;
-			slot->height = height;
+			slot->shift = shift;
 			slot->offset = offset;
 			slot->parent = node;
 			if (node) {
@@ -560,8 +564,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			break;
 
 		/* Go a level down */
-		height--;
-		shift -= RADIX_TREE_MAP_SHIFT;
 		node = indirect_to_ptr(slot);
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		offset = radix_tree_descend(node, &slot, offset);
@@ -1322,7 +1324,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 	unsigned int shift;
 	unsigned long i;
 
-	shift = slot->height * RADIX_TREE_MAP_SHIFT;
+	shift = slot->shift + RADIX_TREE_MAP_SHIFT;
 
 	do {
 		shift -= RADIX_TREE_MAP_SHIFT;
-- 
cgit v1.2.3


From d0891265bbc988dc91ed8580b38eb3dac128581b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:19 -0700
Subject: radix-tree: remove root->height

The only remaining references to root->height were in extend and shrink,
where it was updated.  Now we can remove it entirely.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |   3 --
 lib/radix-tree.c           | 106 +++++++++++++--------------------------------
 2 files changed, 31 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 037458257e12..c0d223cfac00 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -110,13 +110,11 @@ struct radix_tree_node {
 
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
-	unsigned int		height;
 	gfp_t			gfp_mask;
 	struct radix_tree_node	__rcu *rnode;
 };
 
 #define RADIX_TREE_INIT(mask)	{					\
-	.height = 0,							\
 	.gfp_mask = (mask),						\
 	.rnode = NULL,							\
 }
@@ -126,7 +124,6 @@ struct radix_tree_root {
 
 #define INIT_RADIX_TREE(root, mask)					\
 do {									\
-	(root)->height = 0;						\
 	(root)->gfp_mask = (mask);					\
 	(root)->rnode = NULL;						\
 } while (0)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 75c9e6197b5b..58f79fee8c71 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,12 +38,6 @@
 #include <linux/preempt.h>		/* in_interrupt() */
 
 
-/*
- * The height_to_maxindex array needs to be one deeper than the maximum
- * path as height 0 holds only 1 entry.
- */
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
 /*
  * Radix tree node cache.
  */
@@ -218,8 +212,7 @@ radix_tree_find_next_bit(const unsigned long *addr,
 }
 
 #ifndef __KERNEL__
-static void dump_node(struct radix_tree_node *node,
-				unsigned shift, unsigned long index)
+static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
 	unsigned long i;
 
@@ -229,8 +222,8 @@ static void dump_node(struct radix_tree_node *node,
 		node->shift, node->count, node->parent);
 
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-		unsigned long first = index | (i << shift);
-		unsigned long last = first | ((1UL << shift) - 1);
+		unsigned long first = index | (i << node->shift);
+		unsigned long last = first | ((1UL << node->shift) - 1);
 		void *entry = node->slots[i];
 		if (!entry)
 			continue;
@@ -243,8 +236,7 @@ static void dump_node(struct radix_tree_node *node,
 			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
 					entry, i, first, last);
 		} else {
-			dump_node(indirect_to_ptr(entry),
-					shift - RADIX_TREE_MAP_SHIFT, first);
+			dump_node(indirect_to_ptr(entry), first);
 		}
 	}
 }
@@ -252,13 +244,12 @@ static void dump_node(struct radix_tree_node *node,
 /* For debug */
 static void radix_tree_dump(struct radix_tree_root *root)
 {
-	pr_debug("radix root: %p height %d rnode %p tags %x\n",
-			root, root->height, root->rnode,
+	pr_debug("radix root: %p rnode %p tags %x\n",
+			root, root->rnode,
 			root->gfp_mask >> __GFP_BITS_SHIFT);
 	if (!radix_tree_is_indirect_ptr(root->rnode))
 		return;
-	dump_node(indirect_to_ptr(root->rnode),
-				(root->height - 1) * RADIX_TREE_MAP_SHIFT, 0);
+	dump_node(indirect_to_ptr(root->rnode), 0);
 }
 #endif
 
@@ -411,14 +402,8 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
 EXPORT_SYMBOL(radix_tree_maybe_preload);
 
 /*
- *	Return the maximum key which can be store into a
- *	radix tree with height HEIGHT.
+ * The maximum index which can be stored in a radix tree
  */
-static inline unsigned long radix_tree_maxindex(unsigned int height)
-{
-	return height_to_maxindex[height];
-}
-
 static inline unsigned long shift_maxindex(unsigned int shift)
 {
 	return (RADIX_TREE_MAP_SIZE << shift) - 1;
@@ -450,24 +435,22 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
  *	Extend a radix tree so it can store key @index.
  */
 static int radix_tree_extend(struct radix_tree_root *root,
-				unsigned long index)
+				unsigned long index, unsigned int shift)
 {
 	struct radix_tree_node *slot;
-	unsigned int height;
+	unsigned int maxshift;
 	int tag;
 
-	/* Figure out what the height should be.  */
-	height = root->height + 1;
-	while (index > radix_tree_maxindex(height))
-		height++;
+	/* Figure out what the shift should be.  */
+	maxshift = shift;
+	while (index > shift_maxindex(maxshift))
+		maxshift += RADIX_TREE_MAP_SHIFT;
 
-	if (root->rnode == NULL) {
-		root->height = height;
+	slot = root->rnode;
+	if (!slot)
 		goto out;
-	}
 
 	do {
-		unsigned int newheight;
 		struct radix_tree_node *node = radix_tree_node_alloc(root);
 
 		if (!node)
@@ -479,14 +462,11 @@ static int radix_tree_extend(struct radix_tree_root *root,
 				tag_set(node, tag, 0);
 		}
 
-		/* Increase the height.  */
-		newheight = root->height;
-		BUG_ON(newheight > BITS_PER_LONG);
-		node->shift = newheight * RADIX_TREE_MAP_SHIFT;
+		BUG_ON(shift > BITS_PER_LONG);
+		node->shift = shift;
 		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
-		slot = root->rnode;
 		if (radix_tree_is_indirect_ptr(slot)) {
 			slot = indirect_to_ptr(slot);
 			slot->parent = node;
@@ -495,10 +475,11 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		node->slots[0] = slot;
 		node = ptr_to_indirect(node);
 		rcu_assign_pointer(root->rnode, node);
-		root->height = ++newheight;
-	} while (height > root->height);
+		shift += RADIX_TREE_MAP_SHIFT;
+		slot = node;
+	} while (shift <= maxshift);
 out:
-	return height * RADIX_TREE_MAP_SHIFT;
+	return maxshift + RADIX_TREE_MAP_SHIFT;
 }
 
 /**
@@ -531,15 +512,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 
 	/* Make sure the tree is high enough.  */
 	if (max > maxindex) {
-		int error = radix_tree_extend(root, max);
+		int error = radix_tree_extend(root, max, shift);
 		if (error < 0)
 			return error;
 		shift = error;
 		slot = root->rnode;
-		if (order == shift) {
+		if (order == shift)
 			shift += RADIX_TREE_MAP_SHIFT;
-			root->height++;
-		}
 	}
 
 	offset = 0;			/* uninitialised var warning */
@@ -1412,32 +1391,32 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 #endif /* CONFIG_SHMEM && CONFIG_SWAP */
 
 /**
- *	radix_tree_shrink    -    shrink height of a radix tree to minimal
+ *	radix_tree_shrink    -    shrink radix tree to minimum height
  *	@root		radix tree root
  */
 static inline bool radix_tree_shrink(struct radix_tree_root *root)
 {
 	bool shrunk = false;
 
-	/* try to shrink tree height */
-	while (root->height > 0) {
+	for (;;) {
 		struct radix_tree_node *to_free = root->rnode;
 		struct radix_tree_node *slot;
 
-		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
+		if (!radix_tree_is_indirect_ptr(to_free))
+			break;
 		to_free = indirect_to_ptr(to_free);
 
 		/*
 		 * The candidate node has more than one child, or its child
-		 * is not at the leftmost slot, or it is a multiorder entry,
-		 * we cannot shrink.
+		 * is not at the leftmost slot, or the child is a multiorder
+		 * entry, we cannot shrink.
 		 */
 		if (to_free->count != 1)
 			break;
 		slot = to_free->slots[0];
 		if (!slot)
 			break;
-		if (!radix_tree_is_indirect_ptr(slot) && (root->height > 1))
+		if (!radix_tree_is_indirect_ptr(slot) && to_free->shift)
 			break;
 
 		if (radix_tree_is_indirect_ptr(slot)) {
@@ -1454,7 +1433,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 		 * one (root->rnode) as far as dependent read barriers go.
 		 */
 		root->rnode = slot;
-		root->height--;
 
 		/*
 		 * We have a dilemma here. The node's slot[0] must not be
@@ -1515,7 +1493,6 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
-			root->height = 0;
 			root->rnode = NULL;
 		}
 
@@ -1631,26 +1608,6 @@ radix_tree_node_ctor(void *arg)
 	INIT_LIST_HEAD(&node->private_list);
 }
 
-static __init unsigned long __maxindex(unsigned int height)
-{
-	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
-	int shift = RADIX_TREE_INDEX_BITS - width;
-
-	if (shift < 0)
-		return ~0UL;
-	if (shift >= BITS_PER_LONG)
-		return 0UL;
-	return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxindex(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
-		height_to_maxindex[i] = __maxindex(i);
-}
-
 static int radix_tree_callback(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
@@ -1677,6 +1634,5 @@ void __init radix_tree_init(void)
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
 			radix_tree_node_ctor);
-	radix_tree_init_maxindex();
 	hotcpu_notifier(radix_tree_callback, 0);
 }
-- 
cgit v1.2.3


From 30ff46ccb303fb6f6c28b9aa9f2cdc4ba900ed3f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:22 -0700
Subject: radix-tree: rename INDIRECT_PTR to INTERNAL_NODE

The name RADIX_TREE_INDIRECT_PTR doesn't really match the meaning.
RADIX_TREE_INTERNAL_NODE is a better name.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 30 +++++++++++++-----------------
 lib/radix-tree.c           |  2 +-
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c0d223cfac00..c8cc879046c7 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -29,20 +29,16 @@
 #include <linux/rcupdate.h>
 
 /*
- * An indirect pointer (root->rnode pointing to a radix_tree_node, rather
- * than a data item) is signalled by the low bit set in the root->rnode
- * pointer.
- *
- * In this case root->height is > 0, but the indirect pointer tests are
- * needed for RCU lookups (because root->height is unreliable). The only
- * time callers need worry about this is when doing a lookup_slot under
- * RCU.
- *
- * Indirect pointer in fact is also used to tag the last pointer of a node
- * when it is shrunk, before we rcu free the node. See shrink code for
- * details.
+ * Entries in the radix tree have the low bit set if they refer to a
+ * radix_tree_node.  If the low bit is clear then the entry is user data.
+ *
+ * We also use the low bit to indicate that the slot will be freed in the
+ * next RCU idle period, and users need to re-walk the tree to find the
+ * new slot for the index that they were looking for.  See the comment in
+ * radix_tree_shrink() for details.
  */
-#define RADIX_TREE_INDIRECT_PTR		1
+#define RADIX_TREE_INTERNAL_NODE	1
+
 /*
  * A common use of the radix tree is to store pointers to struct pages;
  * but shmem/tmpfs needs also to store swap entries in the same tree:
@@ -63,7 +59,7 @@
 
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
-	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
+	return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE);
 }
 
 /*** radix-tree API starts here ***/
@@ -228,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot,
  */
 static inline int radix_tree_deref_retry(void *arg)
 {
-	return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
+	return unlikely(radix_tree_is_indirect_ptr(arg));
 }
 
 /**
@@ -250,7 +246,7 @@ static inline int radix_tree_exceptional_entry(void *arg)
 static inline int radix_tree_exception(void *arg)
 {
 	return unlikely((unsigned long)arg &
-		(RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
+		(RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY));
 }
 
 /**
@@ -448,7 +444,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
 
 static inline void *indirect_to_ptr(void *ptr)
 {
-	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
 }
 
 /**
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 58f79fee8c71..31d5929a625b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -68,7 +68,7 @@ static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
 static inline void *ptr_to_indirect(void *ptr)
 {
-	return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+	return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
 }
 
 #define RADIX_TREE_RETRY	ptr_to_indirect(NULL)
-- 
cgit v1.2.3


From 4dd6c0987ca43d6544f4f0a3f86f6ea3bfc60fc1 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:27 -0700
Subject: radix-tree: rename indirect_to_ptr() to entry_to_node()

Mirrors the earlier commit introducing node_to_entry().

Also change the type returned to be a struct radix_tree_node pointer.
That lets us simplify a couple of places in the radix tree shrink &
extend paths where we could convert an entry into a pointer, modify the
node, then convert the pointer back into an entry.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h      | 12 +++++------
 lib/radix-tree.c                | 48 ++++++++++++++++++-----------------------
 tools/testing/radix-tree/test.c |  4 ++--
 tools/testing/radix-tree/test.h |  1 -
 4 files changed, 28 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c8cc879046c7..b94aa198dd6b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -442,7 +442,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
 	return (iter->next_index - iter->index) >> iter_shift(iter);
 }
 
-static inline void *indirect_to_ptr(void *ptr)
+static inline struct radix_tree_node *entry_to_node(void *ptr)
 {
 	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
 }
@@ -469,7 +469,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 			return NULL;
 		while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
 					radix_tree_is_indirect_ptr(slot[1])) {
-			if (indirect_to_ptr(slot[1]) == canon) {
+			if (entry_to_node(slot[1]) == canon) {
 				iter->tags >>= 1;
 				iter->index = __radix_tree_iter_add(iter, 1);
 				slot++;
@@ -499,12 +499,10 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 
 			if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
 			    radix_tree_is_indirect_ptr(*slot)) {
-				if (indirect_to_ptr(*slot) == canon)
+				if (entry_to_node(*slot) == canon)
 					continue;
-				else {
-					iter->next_index = iter->index;
-					break;
-				}
+				iter->next_index = iter->index;
+				break;
 			}
 
 			if (likely(*slot))
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index f66bb3932452..3c3fdd9c5bb3 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -230,13 +230,13 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 		if (is_sibling_entry(node, entry)) {
 			pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
 					entry, i,
-					*(void **)indirect_to_ptr(entry),
+					*(void **)entry_to_node(entry),
 					first, last);
 		} else if (!radix_tree_is_indirect_ptr(entry)) {
 			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
 					entry, i, first, last);
 		} else {
-			dump_node(indirect_to_ptr(entry), first);
+			dump_node(entry_to_node(entry), first);
 		}
 	}
 }
@@ -249,7 +249,7 @@ static void radix_tree_dump(struct radix_tree_root *root)
 			root->gfp_mask >> __GFP_BITS_SHIFT);
 	if (!radix_tree_is_indirect_ptr(root->rnode))
 		return;
-	dump_node(indirect_to_ptr(root->rnode), 0);
+	dump_node(entry_to_node(root->rnode), 0);
 }
 #endif
 
@@ -422,7 +422,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
 	*nodep = node;
 
 	if (likely(radix_tree_is_indirect_ptr(node))) {
-		node = indirect_to_ptr(node);
+		node = entry_to_node(node);
 		*maxindex = node_maxindex(node);
 		return node->shift + RADIX_TREE_MAP_SHIFT;
 	}
@@ -467,11 +467,8 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
-		if (radix_tree_is_indirect_ptr(slot)) {
-			slot = indirect_to_ptr(slot);
-			slot->parent = node;
-			slot = node_to_entry(slot);
-		}
+		if (radix_tree_is_indirect_ptr(slot))
+			entry_to_node(slot)->parent = node;
 		node->slots[0] = slot;
 		slot = node_to_entry(node);
 		rcu_assign_pointer(root->rnode, slot);
@@ -542,7 +539,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			break;
 
 		/* Go a level down */
-		node = indirect_to_ptr(slot);
+		node = entry_to_node(slot);
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		offset = radix_tree_descend(node, &slot, offset);
 	}
@@ -645,7 +642,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 
 		if (node == RADIX_TREE_RETRY)
 			goto restart;
-		parent = indirect_to_ptr(node);
+		parent = entry_to_node(node);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		offset = radix_tree_descend(parent, &node, offset);
@@ -729,7 +726,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 
-		parent = indirect_to_ptr(node);
+		parent = entry_to_node(node);
 		offset = radix_tree_descend(parent, &node, offset);
 		BUG_ON(!node);
 
@@ -777,7 +774,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 
-		parent = indirect_to_ptr(node);
+		parent = entry_to_node(node);
 		offset = radix_tree_descend(parent, &node, offset);
 	}
 
@@ -844,7 +841,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 
-		parent = indirect_to_ptr(node);
+		parent = entry_to_node(node);
 		offset = radix_tree_descend(parent, &node, offset);
 
 		if (!node)
@@ -904,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		return NULL;
 
 	if (radix_tree_is_indirect_ptr(rnode)) {
-		rnode = indirect_to_ptr(rnode);
+		rnode = entry_to_node(rnode);
 	} else if (rnode) {
 		/* Single-slot tree */
 		iter->index = index;
@@ -963,7 +960,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		if (!radix_tree_is_indirect_ptr(slot))
 			break;
 
-		node = indirect_to_ptr(slot);
+		node = entry_to_node(slot);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 	}
@@ -1048,7 +1045,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 		return 1;
 	}
 
-	node = indirect_to_ptr(slot);
+	node = entry_to_node(slot);
 	shift -= RADIX_TREE_MAP_SHIFT;
 
 	for (;;) {
@@ -1063,7 +1060,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 			goto next;
 		/* Sibling slots never have tags set on them */
 		if (radix_tree_is_indirect_ptr(slot)) {
-			node = indirect_to_ptr(slot);
+			node = entry_to_node(slot);
 			shift -= RADIX_TREE_MAP_SHIFT;
 			continue;
 		}
@@ -1322,7 +1319,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 				}
 				continue;
 			}
-			node = indirect_to_ptr(node);
+			node = entry_to_node(node);
 			if (is_sibling_entry(slot, node))
 				continue;
 			slot = node;
@@ -1367,7 +1364,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 			break;
 		}
 
-		node = indirect_to_ptr(node);
+		node = entry_to_node(node);
 
 		max_index = node_maxindex(node);
 		if (cur_index > max_index) {
@@ -1403,7 +1400,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 
 		if (!radix_tree_is_indirect_ptr(to_free))
 			break;
-		to_free = indirect_to_ptr(to_free);
+		to_free = entry_to_node(to_free);
 
 		/*
 		 * The candidate node has more than one child, or its child
@@ -1418,11 +1415,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 		if (!radix_tree_is_indirect_ptr(slot) && to_free->shift)
 			break;
 
-		if (radix_tree_is_indirect_ptr(slot)) {
-			slot = indirect_to_ptr(slot);
-			slot->parent = NULL;
-			slot = node_to_entry(slot);
-		}
+		if (radix_tree_is_indirect_ptr(slot))
+			entry_to_node(slot)->parent = NULL;
 
 		/*
 		 * We don't need rcu_assign_pointer(), since we are simply
@@ -1481,7 +1475,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 		struct radix_tree_node *parent;
 
 		if (node->count) {
-			if (node == indirect_to_ptr(root->rnode))
+			if (node == entry_to_node(root->rnode))
 				deleted |= radix_tree_shrink(root);
 			return deleted;
 		}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index 3004c58b9021..7b0bc1fa5919 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -149,7 +149,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
 	int i;
 	int j;
 
-	slot = indirect_to_ptr(slot);
+	slot = entry_to_node(slot);
 
 	/* Verify consistency at this level */
 	for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) {
@@ -227,7 +227,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
 		return;
 	}
 
-	node = indirect_to_ptr(node);
+	node = entry_to_node(node);
 	assert(maxindex <= node_maxindex(node));
 
 	shift = node->shift;
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 866c8c676aa4..e85131369723 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -39,7 +39,6 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
 extern int nr_allocated;
 
 /* Normally private parts of lib/radix-tree.c */
-void *indirect_to_ptr(void *ptr);
 void radix_tree_dump(struct radix_tree_root *root);
 int root_tag_get(struct radix_tree_root *root, unsigned int tag);
 unsigned long node_maxindex(struct radix_tree_node *);
-- 
cgit v1.2.3


From b194d16c27af905d6e3552f4851bc7d9fee4e90f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:30 -0700
Subject: radix-tree: rename radix_tree_is_indirect_ptr()

As with indirect_to_ptr(), ptr_to_indirect() and
RADIX_TREE_INDIRECT_PTR, change radix_tree_is_indirect_ptr() to
radix_tree_is_internal_node().

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h      | 10 ++++-----
 lib/radix-tree.c                | 48 ++++++++++++++++++++---------------------
 tools/testing/radix-tree/test.c |  4 ++--
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index b94aa198dd6b..bad63105e37e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -57,7 +57,7 @@
 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
 		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
 
-static inline int radix_tree_is_indirect_ptr(void *ptr)
+static inline int radix_tree_is_internal_node(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE);
 }
@@ -224,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot,
  */
 static inline int radix_tree_deref_retry(void *arg)
 {
-	return unlikely(radix_tree_is_indirect_ptr(arg));
+	return unlikely(radix_tree_is_internal_node(arg));
 }
 
 /**
@@ -259,7 +259,7 @@ static inline int radix_tree_exception(void *arg)
  */
 static inline void radix_tree_replace_slot(void **pslot, void *item)
 {
-	BUG_ON(radix_tree_is_indirect_ptr(item));
+	BUG_ON(radix_tree_is_internal_node(item));
 	rcu_assign_pointer(*pslot, item);
 }
 
@@ -468,7 +468,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 		if (unlikely(!iter->tags))
 			return NULL;
 		while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
-					radix_tree_is_indirect_ptr(slot[1])) {
+					radix_tree_is_internal_node(slot[1])) {
 			if (entry_to_node(slot[1]) == canon) {
 				iter->tags >>= 1;
 				iter->index = __radix_tree_iter_add(iter, 1);
@@ -498,7 +498,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 			iter->index = __radix_tree_iter_add(iter, 1);
 
 			if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
-			    radix_tree_is_indirect_ptr(*slot)) {
+			    radix_tree_is_internal_node(*slot)) {
 				if (entry_to_node(*slot) == canon)
 					continue;
 				iter->next_index = iter->index;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3c3fdd9c5bb3..b65c83036ca4 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -100,7 +100,7 @@ static unsigned radix_tree_descend(struct radix_tree_node *parent,
 	void **entry = rcu_dereference_raw(parent->slots[offset]);
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-	if (radix_tree_is_indirect_ptr(entry)) {
+	if (radix_tree_is_internal_node(entry)) {
 		unsigned long siboff = get_slot_offset(parent, entry);
 		if (siboff < RADIX_TREE_MAP_SIZE) {
 			offset = siboff;
@@ -232,7 +232,7 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 					entry, i,
 					*(void **)entry_to_node(entry),
 					first, last);
-		} else if (!radix_tree_is_indirect_ptr(entry)) {
+		} else if (!radix_tree_is_internal_node(entry)) {
 			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
 					entry, i, first, last);
 		} else {
@@ -247,7 +247,7 @@ static void radix_tree_dump(struct radix_tree_root *root)
 	pr_debug("radix root: %p rnode %p tags %x\n",
 			root, root->rnode,
 			root->gfp_mask >> __GFP_BITS_SHIFT);
-	if (!radix_tree_is_indirect_ptr(root->rnode))
+	if (!radix_tree_is_internal_node(root->rnode))
 		return;
 	dump_node(entry_to_node(root->rnode), 0);
 }
@@ -302,7 +302,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	ret = kmem_cache_alloc(radix_tree_node_cachep,
 			       gfp_mask | __GFP_ACCOUNT);
 out:
-	BUG_ON(radix_tree_is_indirect_ptr(ret));
+	BUG_ON(radix_tree_is_internal_node(ret));
 	return ret;
 }
 
@@ -421,7 +421,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root,
 
 	*nodep = node;
 
-	if (likely(radix_tree_is_indirect_ptr(node))) {
+	if (likely(radix_tree_is_internal_node(node))) {
 		node = entry_to_node(node);
 		*maxindex = node_maxindex(node);
 		return node->shift + RADIX_TREE_MAP_SHIFT;
@@ -467,7 +467,7 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
-		if (radix_tree_is_indirect_ptr(slot))
+		if (radix_tree_is_internal_node(slot))
 			entry_to_node(slot)->parent = node;
 		node->slots[0] = slot;
 		slot = node_to_entry(node);
@@ -535,7 +535,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			} else
 				rcu_assign_pointer(root->rnode,
 							node_to_entry(slot));
-		} else if (!radix_tree_is_indirect_ptr(slot))
+		} else if (!radix_tree_is_internal_node(slot))
 			break;
 
 		/* Go a level down */
@@ -585,7 +585,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	void **slot;
 	int error;
 
-	BUG_ON(radix_tree_is_indirect_ptr(item));
+	BUG_ON(radix_tree_is_internal_node(item));
 
 	error = __radix_tree_create(root, index, order, &node, &slot);
 	if (error)
@@ -637,7 +637,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 	if (index > maxindex)
 		return NULL;
 
-	while (radix_tree_is_indirect_ptr(node)) {
+	while (radix_tree_is_internal_node(node)) {
 		unsigned offset;
 
 		if (node == RADIX_TREE_RETRY)
@@ -720,7 +720,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 	shift = radix_tree_load_root(root, &node, &maxindex);
 	BUG_ON(index > maxindex);
 
-	while (radix_tree_is_indirect_ptr(node)) {
+	while (radix_tree_is_internal_node(node)) {
 		unsigned offset;
 
 		shift -= RADIX_TREE_MAP_SHIFT;
@@ -770,7 +770,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 
 	parent = NULL;
 
-	while (radix_tree_is_indirect_ptr(node)) {
+	while (radix_tree_is_internal_node(node)) {
 		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 
@@ -835,7 +835,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 	if (node == NULL)
 		return 0;
 
-	while (radix_tree_is_indirect_ptr(node)) {
+	while (radix_tree_is_internal_node(node)) {
 		int offset;
 
 		shift -= RADIX_TREE_MAP_SHIFT;
@@ -900,7 +900,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 	if (index > maxindex)
 		return NULL;
 
-	if (radix_tree_is_indirect_ptr(rnode)) {
+	if (radix_tree_is_internal_node(rnode)) {
 		rnode = entry_to_node(rnode);
 	} else if (rnode) {
 		/* Single-slot tree */
@@ -957,7 +957,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 
 		if ((slot == NULL) || (slot == RADIX_TREE_RETRY))
 			goto restart;
-		if (!radix_tree_is_indirect_ptr(slot))
+		if (!radix_tree_is_internal_node(slot))
 			break;
 
 		node = entry_to_node(slot);
@@ -1039,7 +1039,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 		*first_indexp = last_index + 1;
 		return 0;
 	}
-	if (!radix_tree_is_indirect_ptr(slot)) {
+	if (!radix_tree_is_internal_node(slot)) {
 		*first_indexp = last_index + 1;
 		root_tag_set(root, settag);
 		return 1;
@@ -1059,7 +1059,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 		if (!tag_get(node, iftag, offset))
 			goto next;
 		/* Sibling slots never have tags set on them */
-		if (radix_tree_is_indirect_ptr(slot)) {
+		if (radix_tree_is_internal_node(slot)) {
 			node = entry_to_node(slot);
 			shift -= RADIX_TREE_MAP_SHIFT;
 			continue;
@@ -1152,7 +1152,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
-		if (radix_tree_is_indirect_ptr(results[ret])) {
+		if (radix_tree_is_internal_node(results[ret])) {
 			slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
@@ -1235,7 +1235,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
-		if (radix_tree_is_indirect_ptr(results[ret])) {
+		if (radix_tree_is_internal_node(results[ret])) {
 			slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
@@ -1311,7 +1311,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 					rcu_dereference_raw(slot->slots[i]);
 			if (node == RADIX_TREE_RETRY)
 				goto out;
-			if (!radix_tree_is_indirect_ptr(node)) {
+			if (!radix_tree_is_internal_node(node)) {
 				if (node == item) {
 					info->found_index = index;
 					info->stop = true;
@@ -1357,7 +1357,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 	do {
 		rcu_read_lock();
 		node = rcu_dereference_raw(root->rnode);
-		if (!radix_tree_is_indirect_ptr(node)) {
+		if (!radix_tree_is_internal_node(node)) {
 			rcu_read_unlock();
 			if (node == item)
 				info.found_index = 0;
@@ -1398,7 +1398,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 		struct radix_tree_node *to_free = root->rnode;
 		struct radix_tree_node *slot;
 
-		if (!radix_tree_is_indirect_ptr(to_free))
+		if (!radix_tree_is_internal_node(to_free))
 			break;
 		to_free = entry_to_node(to_free);
 
@@ -1412,10 +1412,10 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 		slot = to_free->slots[0];
 		if (!slot)
 			break;
-		if (!radix_tree_is_indirect_ptr(slot) && to_free->shift)
+		if (!radix_tree_is_internal_node(slot) && to_free->shift)
 			break;
 
-		if (radix_tree_is_indirect_ptr(slot))
+		if (radix_tree_is_internal_node(slot))
 			entry_to_node(slot)->parent = NULL;
 
 		/*
@@ -1445,7 +1445,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
 		 * also results in a stale slot). So tag the slot as indirect
 		 * to force callers to retry.
 		 */
-		if (!radix_tree_is_indirect_ptr(slot))
+		if (!radix_tree_is_internal_node(slot))
 			to_free->slots[0] = RADIX_TREE_RETRY;
 
 		radix_tree_node_free(to_free);
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index 7b0bc1fa5919..a6e8099eaf4f 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -193,7 +193,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
 void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
 {
 	struct radix_tree_node *node = root->rnode;
-	if (!radix_tree_is_indirect_ptr(node))
+	if (!radix_tree_is_internal_node(node))
 		return;
 	verify_node(node, tag, !!root_tag_get(root, tag));
 }
@@ -222,7 +222,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
 {
 	unsigned shift;
 	struct radix_tree_node *node = root->rnode;
-	if (!radix_tree_is_indirect_ptr(node)) {
+	if (!radix_tree_is_internal_node(node)) {
 		assert(maxindex == 0);
 		return;
 	}
-- 
cgit v1.2.3


From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:45 -0700
Subject: radix-tree: introduce radix_tree_replace_clear_tags()

In addition to replacing the entry, we also clear all associated tags.
This is really a one-off special for page_cache_tree_delete() which had
far too much detailed knowledge about how the radix tree works.

For efficiency, factor node_tag_clear() out of radix_tree_tag_clear() It
can be used by radix_tree_delete_item() as well as
radix_tree_replace_clear_tags().

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  9 ++++--
 lib/radix-tree.c           | 76 ++++++++++++++++++++++++++++------------------
 mm/filemap.c               | 23 ++------------
 3 files changed, 56 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index bad63105e37e..11c8e7cc3920 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items);
+struct radix_tree_node *radix_tree_replace_clear_tags(
+				struct radix_tree_root *root,
+				unsigned long index, void *entry);
+unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
+			void **results, unsigned long first_index,
+			unsigned int max_items);
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9d9b4b9af4b6..c7114d233b38 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+static void node_tag_clear(struct radix_tree_root *root,
+				struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
+{
+	while (node) {
+		if (!tag_get(node, tag, offset))
+			return;
+		tag_clear(node, tag, offset);
+		if (any_tag_set(node, tag))
+			return;
+
+		offset = node->offset;
+		node = node->parent;
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+}
+
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
@@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		offset = radix_tree_descend(parent, &node, offset);
 	}
 
-	if (node == NULL)
-		goto out;
+	if (node)
+		node_tag_clear(root, parent, tag, offset);
 
-	index >>= shift;
-
-	while (parent) {
-		if (!tag_get(parent, tag, offset))
-			goto out;
-		tag_clear(parent, tag, offset);
-		if (any_tag_set(parent, tag))
-			goto out;
-
-		index >>= RADIX_TREE_MAP_SHIFT;
-		offset = index & RADIX_TREE_MAP_MASK;
-		parent = parent->parent;
-	}
-
-	/* clear the root's tag bit */
-	if (root_tag_get(root, tag))
-		root_tag_clear(root, tag);
-
-out:
 	return node;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 
 	offset = get_slot_offset(node, slot);
 
-	/*
-	 * Clear all tags associated with the item to be deleted.
-	 * This way of doing it would be inefficient, but seldom is any set.
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(node, tag, offset))
-			radix_tree_tag_clear(root, index, tag);
-	}
+	/* Clear all tags associated with the item to be deleted.  */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		node_tag_clear(root, node, tag, offset);
 
 	delete_sibling_entries(node, node_to_entry(slot), offset);
 	node->slots[offset] = NULL;
@@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
+struct radix_tree_node *radix_tree_replace_clear_tags(
+			struct radix_tree_root *root,
+			unsigned long index, void *entry)
+{
+	struct radix_tree_node *node;
+	void **slot;
+
+	__radix_tree_lookup(root, index, &node, &slot);
+
+	if (node) {
+		unsigned int tag, offset = get_slot_offset(node, slot);
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			node_tag_clear(root, node, tag, offset);
+	} else {
+		/* Clear root node tags */
+		root->gfp_mask &= __GFP_BITS_MASK;
+	}
+
+	radix_tree_replace_slot(slot, entry);
+	return node;
+}
+
 /**
  *	radix_tree_tagged - test whether any items in the tree are tagged
  *	@root:		radix tree root
diff --git a/mm/filemap.c b/mm/filemap.c
index b418405903bc..9665b1d4f318 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
 	struct radix_tree_node *node;
-	unsigned long index;
-	unsigned int offset;
-	unsigned int tag;
-	void **slot;
 
 	VM_BUG_ON(!PageLocked(page));
 
-	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+	node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index,
+								shadow);
 
 	if (shadow) {
 		mapping->nrexceptional++;
@@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	}
 	mapping->nrpages--;
 
-	if (!node) {
-		/* Clear direct pointer tags in root node */
-		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
-		radix_tree_replace_slot(slot, shadow);
+	if (!node)
 		return;
-	}
-
-	/* Clear tree tags for the removed page */
-	index = page->index;
-	offset = index & RADIX_TREE_MAP_MASK;
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (test_bit(offset, node->tags[tag]))
-			radix_tree_tag_clear(&mapping->page_tree, index, tag);
-	}
 
-	/* Delete page, swap shadow entry */
-	radix_tree_replace_slot(slot, shadow);
 	workingset_node_pages_dec(node);
 	if (shadow)
 		workingset_node_shadows_inc(node);
-- 
cgit v1.2.3


From 78a9be0a0a3367b94af242632c525d22b26f1a87 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 20 May 2016 17:03:51 -0700
Subject: dax: move RADIX_DAX_ definitions to dax.c

These don't belong in radix-tree.h any more than PAGECACHE_TAG_* do.
Let's try to maintain the idea that radix-tree simply implements an
abstract data type.

Signed-off-by: NeilBrown <neilb@suse.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c                   | 9 +++++++++
 include/linux/radix-tree.h | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 0dbe4e0f16fe..a345c168acaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,15 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
+#define RADIX_DAX_MASK	0xf
+#define RADIX_DAX_SHIFT	4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 11c8e7cc3920..c2f69e25ba86 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -48,15 +48,6 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
-#define RADIX_DAX_MASK	0xf
-#define RADIX_DAX_SHIFT	4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
-
 static inline int radix_tree_is_internal_node(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE);
-- 
cgit v1.2.3


From 3bcadd6fa6c4fd07ace3626357c824eb532488a6 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:54 -0700
Subject: radix-tree: free up the bottom bit of exceptional entries for reuse

We are guaranteed that pointers to radix_tree_nodes always have the
bottom two bits clear (because they come from a slab cache, and slab
caches have a minimum alignment of sizeof(void *)), so we can redefine
'radix_tree_is_internal_node' to only return true if the bottom two bits
have value '01'.  This frees up one quarter of the potential values for
use by the user.

Idea from Neil Brown.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Suggested-by: Neil Brown <neilb@suse.de>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c2f69e25ba86..cb4b7e8cee81 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -29,28 +29,37 @@
 #include <linux/rcupdate.h>
 
 /*
- * Entries in the radix tree have the low bit set if they refer to a
- * radix_tree_node.  If the low bit is clear then the entry is user data.
- *
- * We also use the low bit to indicate that the slot will be freed in the
- * next RCU idle period, and users need to re-walk the tree to find the
- * new slot for the index that they were looking for.  See the comment in
- * radix_tree_shrink() for details.
+ * The bottom two bits of the slot determine how the remaining bits in the
+ * slot are interpreted:
+ *
+ * 00 - data pointer
+ * 01 - internal entry
+ * 10 - exceptional entry
+ * 11 - locked exceptional entry
+ *
+ * The internal entry may be a pointer to the next level in the tree, a
+ * sibling entry, or an indicator that the entry in this slot has been moved
+ * to another location in the tree and the lookup should be restarted.  While
+ * NULL fits the 'data pointer' pattern, it means that there is no entry in
+ * the tree for this index (no matter what level of the tree it is found at).
+ * This means that you cannot store NULL in the tree as a value for the index.
  */
-#define RADIX_TREE_INTERNAL_NODE	1
+#define RADIX_TREE_ENTRY_MASK		3UL
+#define RADIX_TREE_INTERNAL_NODE	1UL
 
 /*
- * A common use of the radix tree is to store pointers to struct pages;
- * but shmem/tmpfs needs also to store swap entries in the same tree:
- * those are marked as exceptional entries to distinguish them.
+ * Most users of the radix tree store pointers but shmem/tmpfs stores swap
+ * entries in the same tree.  They are marked as exceptional entries to
+ * distinguish them from pointers to struct page.
  * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
  */
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
-static inline int radix_tree_is_internal_node(void *ptr)
+static inline bool radix_tree_is_internal_node(void *ptr)
 {
-	return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE);
+	return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) ==
+				RADIX_TREE_INTERNAL_NODE;
 }
 
 /*** radix-tree API starts here ***/
@@ -236,8 +245,7 @@ static inline int radix_tree_exceptional_entry(void *arg)
  */
 static inline int radix_tree_exception(void *arg)
 {
-	return unlikely((unsigned long)arg &
-		(RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY));
+	return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
 }
 
 /**
-- 
cgit v1.2.3


From acc93d30d7d43f428272c20a047389c4cbca82ba Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 7 May 2016 11:40:28 -0700
Subject: Revert "block: enable dax for raw block devices"

This reverts commit 5a023cdba50c5f5f2bc351783b3131699deb3937.

The functionality is superseded by the new "Device DAX" facility.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 block/ioctl.c           | 32 -----------------
 fs/block_dev.c          | 96 +++++++++++++++----------------------------------
 include/linux/fs.h      |  8 -----
 include/uapi/linux/fs.h |  1 -
 4 files changed, 29 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 4ff1f92f89ca..698c7933d582 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -407,35 +407,6 @@ static inline int is_unrecognized_ioctl(int ret)
 		ret == -ENOIOCTLCMD;
 }
 
-#ifdef CONFIG_FS_DAX
-bool blkdev_dax_capable(struct block_device *bdev)
-{
-	struct gendisk *disk = bdev->bd_disk;
-
-	if (!disk->fops->direct_access)
-		return false;
-
-	/*
-	 * If the partition is not aligned on a page boundary, we can't
-	 * do dax I/O to it.
-	 */
-	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-		return false;
-
-	/*
-	 * If the device has known bad blocks, force all I/O through the
-	 * driver / page cache.
-	 *
-	 * TODO: support finer grained dax error handling
-	 */
-	if (disk->bb && disk->bb->count)
-		return false;
-
-	return true;
-}
-#endif
-
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
 		unsigned cmd, unsigned long arg)
 {
@@ -598,9 +569,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
 		return blk_trace_ioctl(bdev, cmd, argp);
-	case BLKDAXGET:
-		return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
-		break;
 	case IOC_PR_REGISTER:
 		return blkdev_pr_register(bdev, argp);
 	case IOC_PR_RESERVE:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..36ee10ca503e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/cleancache.h>
 #include <linux/dax.h>
+#include <linux/badblocks.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -1159,6 +1160,33 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
+static bool blkdev_dax_capable(struct block_device *bdev)
+{
+	struct gendisk *disk = bdev->bd_disk;
+
+	if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
+		return false;
+
+	/*
+	 * If the partition is not aligned on a page boundary, we can't
+	 * do dax I/O to it.
+	 */
+	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+		return false;
+
+	/*
+	 * If the device has known bad blocks, force all I/O through the
+	 * driver / page cache.
+	 *
+	 * TODO: support finer grained dax error handling
+	 */
+	if (disk->bb && disk->bb->count)
+		return false;
+
+	return true;
+}
+
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1724,79 +1752,13 @@ static const struct address_space_operations def_blk_aops = {
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents.  Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing.  We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
-		struct vm_fault *vmf)
-{
-	return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
-{
-	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
-	.fault		= blkdev_dax_fault,
-	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
-	.fault		= filemap_fault,
-	.map_pages	= filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct inode *bd_inode = bdev_file_inode(file);
-
-	file_accessed(file);
-	if (IS_DAX(bd_inode)) {
-		vma->vm_ops = &blkdev_dax_vm_ops;
-		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
-	} else {
-		vma->vm_ops = &blkdev_default_vm_ops;
-	}
-
-	return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..8363a10660f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2320,14 +2320,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-#ifdef CONFIG_FS_DAX
-extern bool blkdev_dax_capable(struct block_device *bdev);
-#else
-static inline bool blkdev_dax_capable(struct block_device *bdev)
-{
-	return false;
-}
-#endif
 
 extern struct super_block *blockdev_superblock;
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a079d50376e1..fbff8b28aa35 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -222,7 +222,6 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
-#define BLKDAXGET _IO(0x12,129)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
-- 
cgit v1.2.3


From 5726d0b454614a47e641a04c8106392d67a8e1ad Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 23 May 2016 16:23:06 -0700
Subject: nilfs2: remove FSF mailing address from GPL notices

This removes the extra paragraph which mentions FSF address in GPL
notices from source code of nilfs2 and avoids the checkpatch.pl error
related to it.

Link: http://lkml.kernel.org/r/1461935747-10380-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c         | 4 ----
 fs/nilfs2/alloc.h         | 4 ----
 fs/nilfs2/bmap.c          | 4 ----
 fs/nilfs2/bmap.h          | 4 ----
 fs/nilfs2/btnode.c        | 4 ----
 fs/nilfs2/btnode.h        | 4 ----
 fs/nilfs2/btree.c         | 4 ----
 fs/nilfs2/btree.h         | 4 ----
 fs/nilfs2/cpfile.c        | 4 ----
 fs/nilfs2/cpfile.h        | 4 ----
 fs/nilfs2/dat.c           | 4 ----
 fs/nilfs2/dat.h           | 4 ----
 fs/nilfs2/dir.c           | 4 ----
 fs/nilfs2/direct.c        | 4 ----
 fs/nilfs2/direct.h        | 4 ----
 fs/nilfs2/file.c          | 4 ----
 fs/nilfs2/gcinode.c       | 4 ----
 fs/nilfs2/ifile.c         | 4 ----
 fs/nilfs2/ifile.h         | 4 ----
 fs/nilfs2/inode.c         | 4 ----
 fs/nilfs2/ioctl.c         | 4 ----
 fs/nilfs2/mdt.c           | 4 ----
 fs/nilfs2/mdt.h           | 4 ----
 fs/nilfs2/namei.c         | 4 ----
 fs/nilfs2/nilfs.h         | 4 ----
 fs/nilfs2/page.c          | 4 ----
 fs/nilfs2/page.h          | 4 ----
 fs/nilfs2/recovery.c      | 4 ----
 fs/nilfs2/segbuf.c        | 4 ----
 fs/nilfs2/segbuf.h        | 4 ----
 fs/nilfs2/segment.c       | 4 ----
 fs/nilfs2/segment.h       | 4 ----
 fs/nilfs2/sufile.c        | 4 ----
 fs/nilfs2/sufile.h        | 4 ----
 fs/nilfs2/super.c         | 4 ----
 fs/nilfs2/the_nilfs.c     | 4 ----
 fs/nilfs2/the_nilfs.h     | 4 ----
 include/linux/nilfs2_fs.h | 4 ----
 38 files changed, 152 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 2ccbf5531554..eaa0c6af80c3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Original code was written by Koji Sato <koji@osrg.net>.
  * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
  *                                Amagai Yoshiji <amagai@osrg.net>.
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 6e6f49aa53df..62982eea63c0 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Original code was written by Koji Sato <koji@osrg.net>.
  * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
  *                                Amagai Yoshiji <amagai@osrg.net>.
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index a9fb3636c142..fcd79e611c87 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bfa817ce40b3..baa6d404f369 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e0c9daf9aa22..55241effa3c0 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * This file was originally written by Seiji Kihara <kihara@osrg.net>
  * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
  * stabilization and simplification.
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index d876b565ce64..0727096a8c53 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Seiji Kihara <kihara@osrg.net>
  * Revised by Ryusuke Konishi <ryusuke@osrg.net>
  */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3a3821b00486..cc748ff3b876 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 22c02e35b6ef..9497732b3b1d 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b6596cab9e99..15016eca4dc4 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index a242b9a314f9..3dcaceb66be6 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7dc23f100e57..7427c114aa11 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index cbd8e9732503..edd0586220d4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6723d45a631a..955070bd20a5 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
  */
 /*
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index ebf89fd8ac1a..24700229d6d4 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index dc643de20a25..e4eb1b7e407c 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 088ba001c6ef..49243ca6e3e9 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Amagai Yoshiji <amagai@osrg.net>,
  *            Ryusuke Konishi <ryusuke@osrg.net>
  */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 0224b7826ace..b0321886ce33 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
  *            and Ryusuke Konishi <ryusuke@osrg.net>.
  * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 6548c7851b48..06f9b5aa6175 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Amagai Yoshiji <amagai@osrg.net>.
  * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  *
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 679674d13372..c61637e444f1 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Amagai Yoshiji <amagai@osrg.net>
  * Revised by Ryusuke Konishi <ryusuke@osrg.net>
  *
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cfebcd2fc7f3..1bdb17bb01d3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index e8fe24882b5b..4db0e5ff6c67 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6982b9153d5..0ab93bdfa261 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  */
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 03246cac3338..5fc07ecf32a3 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  */
 
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 3b2af05f9fb4..db0a4bcf6015 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
  *                       Ryusuke Konishi <ryusuke@osrg.net>
  */
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 385704027575..e2089a18b24b 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>
  *            Ryusuke Konishi <ryusuke@osrg.net>
  */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 489391561cda..b10057eb4d8e 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>,
  *            Seiji Kihara <kihara@osrg.net>.
  */
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index a43b8287d012..6df10e127678 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>,
  *            Seiji Kihara <kihara@osrg.net>.
  */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5afa77fadc11..2ae1dfccf6e8 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  */
 
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index f63620ce3892..8aef62d81553 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index b04f08cc2397..fa4ed03744bb 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 483e6634466f..cafb61c32b79 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0408b9b2814b..a118eabd3ed6 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 5b3720414e34..1d602482b51e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 35e34c84d4af..c4cf47e14b70 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>.
  */
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e05d1848164a..d304faa66f49 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  */
 /*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 69bd801afb53..8e270128ae86 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 23778d385836..426e4acc772b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Ryusuke Konishi <ryusuke@osrg.net>
  *
  */
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index e9fcf90b270d..0d7f8141382a 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -13,10 +13,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  * Written by Koji Sato <koji@osrg.net>
  *            Ryusuke Konishi <ryusuke@osrg.net>
  */
-- 
cgit v1.2.3


From 4b420ab4eedc7a816ad0f2278871019de1a8ccef Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 23 May 2016 16:23:09 -0700
Subject: nilfs2: clean up old e-mail addresses

E-mail addresses of osrg.net domain are no longer available.  This
removes them from authorship notices and prevents reporters from being
confused.

Link: http://lkml.kernel.org/r/1461935747-10380-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c         | 5 ++---
 fs/nilfs2/alloc.h         | 5 ++---
 fs/nilfs2/bmap.c          | 2 +-
 fs/nilfs2/bmap.h          | 2 +-
 fs/nilfs2/btnode.c        | 5 ++---
 fs/nilfs2/btnode.h        | 4 ++--
 fs/nilfs2/btree.c         | 2 +-
 fs/nilfs2/btree.h         | 2 +-
 fs/nilfs2/cpfile.c        | 2 +-
 fs/nilfs2/cpfile.h        | 2 +-
 fs/nilfs2/dat.c           | 2 +-
 fs/nilfs2/dat.h           | 2 +-
 fs/nilfs2/dir.c           | 2 +-
 fs/nilfs2/direct.c        | 2 +-
 fs/nilfs2/direct.h        | 2 +-
 fs/nilfs2/file.c          | 3 +--
 fs/nilfs2/gcinode.c       | 5 ++---
 fs/nilfs2/ifile.c         | 4 ++--
 fs/nilfs2/ifile.h         | 4 ++--
 fs/nilfs2/inode.c         | 2 +-
 fs/nilfs2/ioctl.c         | 2 +-
 fs/nilfs2/mdt.c           | 2 +-
 fs/nilfs2/mdt.h           | 2 +-
 fs/nilfs2/namei.c         | 3 +--
 fs/nilfs2/nilfs.h         | 3 +--
 fs/nilfs2/page.c          | 3 +--
 fs/nilfs2/page.h          | 3 +--
 fs/nilfs2/recovery.c      | 2 +-
 fs/nilfs2/segbuf.c        | 2 +-
 fs/nilfs2/segbuf.h        | 2 +-
 fs/nilfs2/segment.c       | 2 +-
 fs/nilfs2/segment.h       | 2 +-
 fs/nilfs2/sufile.c        | 4 ++--
 fs/nilfs2/sufile.h        | 2 +-
 fs/nilfs2/super.c         | 2 +-
 fs/nilfs2/the_nilfs.c     | 2 +-
 fs/nilfs2/the_nilfs.h     | 2 +-
 include/linux/nilfs2_fs.h | 3 +--
 38 files changed, 46 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eaa0c6af80c3..82362a5d81ed 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -13,9 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- *                                Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
  */
 
 #include <linux/types.h>
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 62982eea63c0..2bd567d98bc6 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -13,9 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- *                                Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
  */
 
 #ifndef _NILFS_ALLOC_H
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index fcd79e611c87..a5eab798a7a3 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/fs.h>
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index baa6d404f369..c14f822682da 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_BMAP_H
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 55241effa3c0..0576033699bc 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -13,9 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * This file was originally written by Seiji Kihara <kihara@osrg.net>
- * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
- * stabilization and simplification.
+ * Originally written by Seiji Kihara.
+ * Fully revised by Ryusuke Konishi for stabilization and simplification.
  *
  */
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 0727096a8c53..2cc1b80e18f7 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -13,8 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Seiji Kihara <kihara@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Seiji Kihara.
+ * Revised by Ryusuke Konishi.
  */
 
 #ifndef _NILFS_BTNODE_H
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index cc748ff3b876..8fc73d0a923c 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/slab.h>
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 9497732b3b1d..df1a25faa83b 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_BTREE_H
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 15016eca4dc4..d192b48df9fb 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 3dcaceb66be6..5bdb8262928b 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_CPFILE_H
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7427c114aa11..e92257bc42ee 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/types.h>
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index edd0586220d4..c7035b5b1aed 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_DAT_H
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 955070bd20a5..5756dda3083f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji.
  */
 /*
  *  linux/fs/ext2/dir.c
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 24700229d6d4..22058d0b36e9 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/errno.h>
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index e4eb1b7e407c..3015a6e78724 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_DIRECT_H
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 49243ca6e3e9..547381f3ce13 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Amagai Yoshiji <amagai@osrg.net>,
- *            Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji and Ryusuke Konishi.
  */
 
 #include <linux/fs.h>
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index b0321886ce33..693aded72498 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -13,9 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- *            and Ryusuke Konishi <ryusuke@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi.
+ * Revised by Ryusuke Konishi.
  *
  */
 /*
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 06f9b5aa6175..d9048f6a7780 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -13,8 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Amagai Yoshiji <amagai@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index c61637e444f1..1b7d7afcb35e 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -13,8 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Amagai Yoshiji <amagai@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1bdb17bb01d3..90061151f0c2 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 4db0e5ff6c67..e86599801d8b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #include <linux/fs.h>
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 0ab93bdfa261..36fa9412f850 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  */
 
 #include <linux/buffer_head.h>
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 5fc07ecf32a3..54e3b7fd783c 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  */
 
 #ifndef _NILFS_MDT_H
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index db0a4bcf6015..8234af99d7d7 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
- *                       Ryusuke Konishi <ryusuke@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi.
  */
 /*
  *  linux/fs/ext2/namei.c
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index e2089a18b24b..fa179d4e5dba 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>
- *            Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
  */
 
 #ifndef _NILFS_H
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b10057eb4d8e..19687139f197 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- *            Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
  */
 
 #include <linux/pagemap.h>
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 6df10e127678..041f2dc5e634 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- *            Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
  */
 
 #ifndef _NILFS_PAGE_H
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 2ae1dfccf6e8..402a45c2550d 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  */
 
 #include <linux/buffer_head.h>
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 8aef62d81553..52f6a6c8bab1 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fa4ed03744bb..fc3a0fe3119c 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 #ifndef _NILFS_SEGBUF_H
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index cafb61c32b79..ddbfb09527cd 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a118eabd3ed6..6cb12dbee7c1 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 #ifndef _NILFS_SEGMENT_H
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1d602482b51e..df439da883bc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -13,8 +13,8 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Koji Sato.
+ * Revised by Ryusuke Konishi.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index c4cf47e14b70..46e89872294c 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
  */
 
 #ifndef _NILFS_SUFILE_H
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index d304faa66f49..2ed3e4008e72 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  */
 /*
  *  linux/fs/ext2/super.c
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8e270128ae86..ba4b8189c342 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 426e4acc772b..6a262933fed9 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -13,7 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
  *
  */
 
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 0d7f8141382a..823d63d61081 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -13,8 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
- * Written by Koji Sato <koji@osrg.net>
- *            Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
  */
 /*
  *  linux/include/linux/ext2_fs.h
-- 
cgit v1.2.3


From 0c6c44cb9f93f7c0ad803b41ae7c0b08cf6942e2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 23 May 2016 16:23:39 -0700
Subject: nilfs2: avoid bare use of 'unsigned'

This fixes checkpatch.pl warning "WARNING: Prefer 'unsigned int' to
bare use of 'unsigned'".

Link: http://lkml.kernel.org/r/1462886671-3521-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c         |  8 ++++----
 fs/nilfs2/alloc.h         |  2 +-
 fs/nilfs2/bmap.c          |  2 +-
 fs/nilfs2/bmap.h          |  4 ++--
 fs/nilfs2/btree.c         |  3 ++-
 fs/nilfs2/cpfile.c        |  8 +++++---
 fs/nilfs2/cpfile.h        |  4 ++--
 fs/nilfs2/dat.c           |  2 +-
 fs/nilfs2/dat.h           |  2 +-
 fs/nilfs2/dir.c           | 47 ++++++++++++++++++++++++-----------------------
 fs/nilfs2/direct.c        |  5 +++--
 fs/nilfs2/inode.c         | 14 +++++++-------
 fs/nilfs2/mdt.c           |  4 ++--
 fs/nilfs2/mdt.h           |  6 +++---
 fs/nilfs2/namei.c         |  2 +-
 fs/nilfs2/nilfs.h         |  2 +-
 fs/nilfs2/page.c          |  8 ++++----
 fs/nilfs2/page.h          |  3 ++-
 fs/nilfs2/recovery.c      |  4 ++--
 fs/nilfs2/segbuf.c        |  4 ++--
 fs/nilfs2/segbuf.h        |  5 +++--
 fs/nilfs2/segment.c       | 18 +++++++++---------
 fs/nilfs2/segment.h       |  4 ++--
 fs/nilfs2/sysfs.c         |  6 +++---
 fs/nilfs2/the_nilfs.c     |  4 ++--
 fs/nilfs2/the_nilfs.h     | 10 +++++-----
 include/linux/nilfs2_fs.h |  6 +++---
 27 files changed, 97 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index bdd5ac522904..698f582d69af 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -53,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode)
  * @inode: inode of metadata file using this allocator
  * @entry_size: size of the persistent object
  */
-int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 
@@ -384,7 +384,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
  */
 static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
 					    unsigned long target,
-					    unsigned bsize,
+					    unsigned int bsize,
 					    spinlock_t *lock)
 {
 	int pos, end = bsize;
@@ -735,8 +735,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	unsigned long group, group_offset;
 	__u64 group_min_nr, last_nrs[8];
 	const unsigned long epg = nilfs_palloc_entries_per_group(inode);
-	const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
-	unsigned entry_start, end, pos;
+	const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block;
+	unsigned int entry_start, end, pos;
 	spinlock_t *lock;
 	int i, j, k, ret;
 	u32 nfree;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 2bd567d98bc6..05149e606a78 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -37,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode)
 	return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */);
 }
 
-int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned int);
 int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 				 struct buffer_head **);
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 4976fe3be4a5..f2a7877e0c8c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -93,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 }
 
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
-			     unsigned maxblocks)
+			     unsigned int maxblocks)
 {
 	int ret;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index c14f822682da..a6852807b22c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -57,7 +57,7 @@ struct nilfs_bmap_stats {
 struct nilfs_bmap_operations {
 	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
 	int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
-				 unsigned);
+				 unsigned int);
 	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
@@ -150,7 +150,7 @@ struct nilfs_bmap_store {
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int);
 int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
 int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
 int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 8fc73d0a923c..57ec6af28b49 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -685,7 +685,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
 }
 
 static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
-				     __u64 key, __u64 *ptrp, unsigned maxblocks)
+				     __u64 key, __u64 *ptrp,
+				     unsigned int maxblocks)
 {
 	struct nilfs_btree_path *path;
 	struct nilfs_btree_node *node;
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 16f884bd857c..b61c3e0eb342 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -431,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-					  void *buf, unsigned cisz, size_t nci)
+					  void *buf, unsigned int cisz,
+					  size_t nci)
 {
 	struct nilfs_checkpoint *cp;
 	struct nilfs_cpinfo *ci = buf;
@@ -482,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-					  void *buf, unsigned cisz, size_t nci)
+					  void *buf, unsigned int cisz,
+					  size_t nci)
 {
 	struct buffer_head *bh;
 	struct nilfs_cpfile_header *header;
@@ -568,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
  */
 
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-				void *buf, unsigned cisz, size_t nci)
+				void *buf, unsigned int cisz, size_t nci)
 {
 	switch (mode) {
 	case NILFS_CHECKPOINT:
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 5bdb8262928b..0249744ae234 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -33,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
-				size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *,
+				unsigned int, size_t);
 
 int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
 		      struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index e92257bc42ee..7367610ea807 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -424,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	return ret;
 }
 
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
 			    size_t nvi)
 {
 	struct buffer_head *entry_bh;
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index c7035b5b1aed..abbfdabcabea 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t);
 
 int nilfs_dat_read(struct super_block *sb, size_t entry_size,
 		   struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bbcc03de1e74..e506f4f7120a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -46,7 +46,7 @@
  * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
  * more robust, but we have what we have
  */
-static inline unsigned nilfs_chunk_size(struct inode *inode)
+static inline unsigned int nilfs_chunk_size(struct inode *inode)
 {
 	return inode->i_sb->s_blocksize;
 }
@@ -61,9 +61,9 @@ static inline void nilfs_put_page(struct page *page)
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
  */
-static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	unsigned last_byte = inode->i_size;
+	unsigned int last_byte = inode->i_size;
 
 	last_byte -= page_nr << PAGE_SHIFT;
 	if (last_byte > PAGE_SIZE)
@@ -71,7 +71,8 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+			       unsigned int to)
 {
 	loff_t pos = page_offset(page) + from;
 
@@ -80,12 +81,12 @@ static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
 
 static void nilfs_commit_chunk(struct page *page,
 			       struct address_space *mapping,
-			       unsigned from, unsigned to)
+			       unsigned int from, unsigned int to)
 {
 	struct inode *dir = mapping->host;
 	loff_t pos = page_offset(page) + from;
-	unsigned len = to - from;
-	unsigned nr_dirty, copied;
+	unsigned int len = to - from;
+	unsigned int nr_dirty, copied;
 	int err;
 
 	nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
@@ -103,10 +104,10 @@ static bool nilfs_check_page(struct page *page)
 {
 	struct inode *dir = page->mapping->host;
 	struct super_block *sb = dir->i_sb;
-	unsigned chunk_size = nilfs_chunk_size(dir);
+	unsigned int chunk_size = nilfs_chunk_size(dir);
 	char *kaddr = page_address(page);
-	unsigned offs, rec_len;
-	unsigned limit = PAGE_SIZE;
+	unsigned int offs, rec_len;
+	unsigned int limit = PAGE_SIZE;
 	struct nilfs_dir_entry *p;
 	char *error;
 
@@ -256,7 +257,6 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 	unsigned int offset = pos & ~PAGE_MASK;
 	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
-/*	unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
 
 	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
 		return 0;
@@ -318,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 {
 	const unsigned char *name = qstr->name;
 	int namelen = qstr->len;
-	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+	unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
 	struct page *page = NULL;
@@ -408,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 		    struct page *page, struct inode *inode)
 {
-	unsigned from = (char *) de - (char *) page_address(page);
-	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
+	unsigned int from = (char *)de - (char *)page_address(page);
+	unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
 	struct address_space *mapping = page->mapping;
 	int err;
 
@@ -431,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 	struct inode *dir = d_inode(dentry->d_parent);
 	const unsigned char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
-	unsigned chunk_size = nilfs_chunk_size(dir);
-	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+	unsigned int chunk_size = nilfs_chunk_size(dir);
+	unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
 	unsigned short rec_len, name_len;
 	struct page *page = NULL;
 	struct nilfs_dir_entry *de;
 	unsigned long npages = dir_pages(dir);
 	unsigned long n;
 	char *kaddr;
-	unsigned from, to;
+	unsigned int from, to;
 	int err;
 
 	/*
@@ -531,13 +531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	char *kaddr = page_address(page);
-	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-	unsigned to = ((char *)dir - kaddr) +
-		nilfs_rec_len_from_disk(dir->rec_len);
-	struct nilfs_dir_entry *pde = NULL;
-	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+	unsigned int from, to;
+	struct nilfs_dir_entry *de, *pde = NULL;
 	int err;
 
+	from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+	to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len);
+	de = (struct nilfs_dir_entry *)(kaddr + from);
+
 	while ((char *)de < (char *)dir) {
 		if (de->rec_len == 0) {
 			nilfs_error(inode->i_sb, __func__,
@@ -570,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page = grab_cache_page(mapping, 0);
-	unsigned chunk_size = nilfs_chunk_size(inode);
+	unsigned int chunk_size = nilfs_chunk_size(inode);
 	struct nilfs_dir_entry *de;
 	int err;
 	void *kaddr;
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 22058d0b36e9..001068630063 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -58,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
 
 static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 				      __u64 key, __u64 *ptrp,
-				      unsigned maxblocks)
+				      unsigned int maxblocks)
 {
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
@@ -79,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 		ptr = blocknr;
 	}
 
-	maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+	maxblocks = min_t(unsigned int, maxblocks,
+			  NILFS_DIRECT_KEY_MAX - key + 1);
 	for (cnt = 1; cnt < maxblocks &&
 		     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
 		     NILFS_BMAP_INVALID_PTR;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bbb47e8bde3e..83d2c485efba 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -83,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	__u64 blknum = 0;
 	int err = 0, ret;
-	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
+	unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits;
 
 	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
@@ -163,7 +163,7 @@ static int nilfs_readpage(struct file *file, struct page *page)
  * @nr_pages - number of pages to be read
  */
 static int nilfs_readpages(struct file *file, struct address_space *mapping,
-			   struct list_head *pages, unsigned nr_pages)
+			   struct list_head *pages, unsigned int nr_pages)
 {
 	return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
 }
@@ -222,7 +222,7 @@ static int nilfs_set_page_dirty(struct page *page)
 	int ret = __set_page_dirty_nobuffers(page);
 
 	if (page_has_buffers(page)) {
-		unsigned nr_dirty = 0;
+		unsigned int nr_dirty = 0;
 		struct buffer_head *bh, *head;
 
 		/*
@@ -245,7 +245,7 @@ static int nilfs_set_page_dirty(struct page *page)
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
 	} else if (ret) {
-		unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+		unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
 
 		nilfs_set_file_dirty(inode, nr_dirty);
 	}
@@ -287,8 +287,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 			   struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
-	unsigned start = pos & (PAGE_SIZE - 1);
-	unsigned nr_dirty;
+	unsigned int start = pos & (PAGE_SIZE - 1);
+	unsigned int nr_dirty;
 	int err;
 
 	nr_dirty = nilfs_page_count_clean_buffers(page, start,
@@ -902,7 +902,7 @@ int nilfs_inode_dirty(struct inode *inode)
 	return ret;
 }
 
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 8a2f8b240f25..3417d859a03c 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -490,8 +490,8 @@ void nilfs_mdt_destroy(struct inode *inode)
 	kfree(mdi);
 }
 
-void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
-			      unsigned header_size)
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
+			      unsigned int header_size)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ffb876e6efed..3f67f3932097 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -53,8 +53,8 @@ struct nilfs_shadow_map {
 struct nilfs_mdt_info {
 	struct rw_semaphore	mi_sem;
 	struct blockgroup_lock *mi_bgl;
-	unsigned		mi_entry_size;
-	unsigned		mi_first_entry_offset;
+	unsigned int		mi_entry_size;
+	unsigned int		mi_first_entry_offset;
 	unsigned long		mi_entries_per_block;
 	struct nilfs_palloc_cache *mi_palloc_cache;
 	struct nilfs_shadow_map *mi_shadow;
@@ -90,7 +90,7 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
 void nilfs_mdt_clear(struct inode *inode);
 void nilfs_mdt_destroy(struct inode *inode);
 
-void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int);
 
 int nilfs_mdt_setup_shadow_map(struct inode *inode,
 			       struct nilfs_shadow_map *shadow);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 8f8070cffa58..1ec8ae5995a5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -139,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = dir->i_sb;
-	unsigned l = strlen(symname)+1;
+	unsigned int l = strlen(symname) + 1;
 	struct inode *inode;
 	int err;
 
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index fa179d4e5dba..ea320315d557 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -274,7 +274,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
 int nilfs_permission(struct inode *inode, int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
 extern int __nilfs_mark_inode_dirty(struct inode *, int);
 extern void nilfs_dirty_inode(struct inode *, int flags);
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 19687139f197..d97ba5f11b77 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -435,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
 	__nilfs_clear_page_dirty(page);
 }
 
-unsigned nilfs_page_count_clean_buffers(struct page *page,
-					unsigned from, unsigned to)
+unsigned int nilfs_page_count_clean_buffers(struct page *page,
+					    unsigned int from, unsigned int to)
 {
-	unsigned block_start, block_end;
+	unsigned int block_start, block_end;
 	struct buffer_head *bh, *head;
-	unsigned nc = 0;
+	unsigned int nc = 0;
 
 	for (bh = head = page_buffers(page), block_start = 0;
 	     bh != head || !block_start;
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 041f2dc5e634..f3687c958fa8 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -53,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_page(struct page *, bool);
 void nilfs_clear_dirty_pages(struct address_space *, bool);
 void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
-unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
+					    unsigned int);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 					    sector_t start_blk,
 					    sector_t *blkoff);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 685fa73cecd0..db156a199149 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -152,7 +152,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
 
 	sr = (struct nilfs_super_root *)bh_sr->b_data;
 	if (check) {
-		unsigned bytes = le16_to_cpu(sr->sr_bytes);
+		unsigned int bytes = le16_to_cpu(sr->sr_bytes);
 
 		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
 			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
@@ -504,7 +504,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 {
 	struct inode *inode;
 	struct nilfs_recovery_block *rb, *n;
-	unsigned blocksize = nilfs->ns_blocksize;
+	unsigned int blocksize = nilfs->ns_blocksize;
 	struct page *page;
 	loff_t pos;
 	int err = 0, err2 = 0;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 52f6a6c8bab1..bf36df10540b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -129,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 	return 0;
 }
 
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags,
 		       time_t ctime, __u64 cno)
 {
 	int err;
@@ -236,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
 {
 	struct nilfs_super_root *raw_sr;
 	struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
-	unsigned srsize;
+	unsigned int srsize;
 	u32 crc;
 
 	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fc3a0fe3119c..7bbccc099709 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -78,7 +78,7 @@ struct nilfs_segment_buffer {
 	__u64			sb_nextnum;
 	sector_t		sb_fseg_start, sb_fseg_end;
 	sector_t		sb_pseg_start;
-	unsigned		sb_rest_blocks;
+	unsigned int		sb_rest_blocks;
 
 	/* Buffers */
 	struct list_head	sb_segsum_buffers;
@@ -120,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 			   struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t,
+		       __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a6ef1eb15edb..97dee069be83 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -393,10 +393,10 @@ static void nilfs_transaction_unlock(struct super_block *sb)
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
 					    struct nilfs_segsum_pointer *ssp,
-					    unsigned bytes)
+					    unsigned int bytes)
 {
 	struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
-	unsigned blocksize = sci->sc_super->s_blocksize;
+	unsigned int blocksize = sci->sc_super->s_blocksize;
 	void *p;
 
 	if (unlikely(ssp->offset + bytes > blocksize)) {
@@ -418,8 +418,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
 	struct buffer_head *sumbh;
-	unsigned sumbytes;
-	unsigned flags = 0;
+	unsigned int sumbytes;
+	unsigned int flags = 0;
 	int err;
 
 	if (nilfs_doing_gc())
@@ -468,9 +468,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
  */
 static int nilfs_segctor_segsum_block_required(
 	struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
-	unsigned binfo_size)
+	unsigned int binfo_size)
 {
-	unsigned blocksize = sci->sc_super->s_blocksize;
+	unsigned int blocksize = sci->sc_super->s_blocksize;
 	/* Size of finfo and binfo is enough small against blocksize */
 
 	return ssp->offset + binfo_size +
@@ -529,7 +529,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
 static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
 					struct buffer_head *bh,
 					struct inode *inode,
-					unsigned binfo_size)
+					unsigned int binfo_size)
 {
 	struct nilfs_segment_buffer *segbuf;
 	int required, err = 0;
@@ -773,7 +773,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
 {
 	struct nilfs_inode_info *ii, *n;
 	struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
-	unsigned nv = 0;
+	unsigned int nv = 0;
 
 	while (!list_empty(head)) {
 		spin_lock(&nilfs->ns_inode_lock);
@@ -954,7 +954,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
-	unsigned isz, srsz;
+	unsigned int isz, srsz;
 
 	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
 	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6cb12dbee7c1..27822e760d3f 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
  */
 struct nilfs_cstage {
 	int			scnt;
-	unsigned		flags;
+	unsigned int		flags;
 	struct nilfs_inode_info *dirty_file_ptr;
 	struct nilfs_inode_info *gc_inode_ptr;
 };
@@ -80,7 +80,7 @@ struct nilfs_segment_buffer;
 
 struct nilfs_segsum_pointer {
 	struct buffer_head     *bh;
-	unsigned		offset; /* offset in bytes */
+	unsigned int		offset; /* offset in bytes */
 };
 
 /**
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 3e7d85335adf..8ffa42b704d8 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
 				      struct the_nilfs *nilfs,
 				      char *buf)
 {
-	unsigned sbwcount;
+	unsigned int sbwcount;
 
 	down_read(&nilfs->ns_sem);
 	sbwcount = nilfs->ns_sbwcount;
@@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
 					    struct the_nilfs *nilfs,
 					    char *buf)
 {
-	unsigned sb_update_freq;
+	unsigned int sb_update_freq;
 
 	down_read(&nilfs->ns_sem);
 	sb_update_freq = nilfs->ns_sb_update_freq;
@@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
 					    struct the_nilfs *nilfs,
 					    const char *buf, size_t count)
 {
-	unsigned val;
+	unsigned int val;
 	int err;
 
 	err = kstrtouint(skip_spaces(buf), 0, &val);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba4b8189c342..b9e19ca3c96e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -108,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	struct nilfs_super_root *raw_sr;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
 	struct nilfs_inode *rawi;
-	unsigned dat_entry_size, segment_usage_size, checkpoint_size;
-	unsigned inode_size;
+	unsigned int dat_entry_size, segment_usage_size, checkpoint_size;
+	unsigned int inode_size;
 	int err;
 
 	err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 62bd7b10fe43..06d2548d436d 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -114,10 +114,10 @@ struct the_nilfs {
 	struct buffer_head     *ns_sbh[2];
 	struct nilfs_super_block *ns_sbp[2];
 	time_t			ns_sbwtime;
-	unsigned		ns_sbwcount;
-	unsigned		ns_sbsize;
-	unsigned		ns_mount_state;
-	unsigned		ns_sb_update_freq;
+	unsigned int		ns_sbwcount;
+	unsigned int		ns_sbsize;
+	unsigned int		ns_mount_state;
+	unsigned int		ns_sb_update_freq;
 
 	/*
 	 * Following fields are dedicated to a writable FS-instance.
@@ -306,7 +306,7 @@ static inline void nilfs_get_root(struct nilfs_root *root)
 
 static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
 {
-	unsigned valid_fs;
+	unsigned int valid_fs;
 
 	down_read(&nilfs->ns_sem);
 	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 823d63d61081..3b584925d0e8 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -322,9 +322,9 @@ enum {
 					~NILFS_DIR_ROUND)
 #define NILFS_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
+static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen)
 {
-	unsigned len = le16_to_cpu(dlen);
+	unsigned int len = le16_to_cpu(dlen);
 
 #if !defined(__KERNEL__) || (PAGE_SIZE >= 65536)
 	if (len == NILFS_MAX_REC_LEN)
@@ -333,7 +333,7 @@ static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
 	return len;
 }
 
-static inline __le16 nilfs_rec_len_to_disk(unsigned len)
+static inline __le16 nilfs_rec_len_to_disk(unsigned int len)
 {
 #if !defined(__KERNEL__) || (PAGE_SIZE >= 65536)
 	if (len == (1 << 16))
-- 
cgit v1.2.3


From 076a378ba6e6b6ddd5f2336aa0876349b7d36409 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 23 May 2016 16:23:48 -0700
Subject: nilfs2: fix block comments

This fixes block comments with proper formatting to eliminate the
following checkpatch.pl warnings:

  "WARNING: Block comments use * on subsequent lines"
  "WARNING: Block comments use a trailing */ on a separate line"

Link: http://lkml.kernel.org/r/1462886671-3521-8-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c         | 12 +++++---
 fs/nilfs2/bmap.h          | 12 +++++---
 fs/nilfs2/cpfile.c        |  6 ++--
 fs/nilfs2/ifile.c         |  6 ++--
 fs/nilfs2/inode.c         | 73 ++++++++++++++++++++++++++++++-----------------
 fs/nilfs2/nilfs.h         | 24 ++++++++++------
 fs/nilfs2/recovery.c      | 14 +++++----
 fs/nilfs2/segment.c       | 56 ++++++++++++++++++++++++------------
 fs/nilfs2/segment.h       | 34 ++++++++++++++--------
 fs/nilfs2/the_nilfs.c     |  6 ++--
 include/linux/nilfs2_fs.h | 66 +++++++++++++++++++++++++++---------------
 11 files changed, 202 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 698f582d69af..1a85d94f5b25 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -68,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size)
 	mi->mi_blocks_per_group =
 		DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
 			     mi->mi_entries_per_block) + 1;
-		/* Number of blocks in a group including entry blocks and
-		   a bitmap block */
+		/*
+		 * Number of blocks in a group including entry blocks
+		 * and a bitmap block
+		 */
 	mi->mi_blocks_per_desc_block =
 		nilfs_palloc_groups_per_desc_block(inode) *
 		mi->mi_blocks_per_group + 1;
-		/* Number of blocks per descriptor including the
-		   descriptor block */
+		/*
+		 * Number of blocks per descriptor including the
+		 * descriptor block
+		 */
 	return 0;
 }
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a6852807b22c..b6a4c8f93ac8 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -122,10 +122,14 @@ struct nilfs_bmap {
 
 /* pointer type */
 #define NILFS_BMAP_PTR_P	0	/* physical block number (i.e. LBN) */
-#define NILFS_BMAP_PTR_VS	1	/* virtual block number (single
-					   version) */
-#define NILFS_BMAP_PTR_VM	2	/* virtual block number (has multiple
-					   versions) */
+#define NILFS_BMAP_PTR_VS	1	/*
+					 * virtual block number (single
+					 * version)
+					 */
+#define NILFS_BMAP_PTR_VM	2	/*
+					 * virtual block number (has multiple
+					 * versions)
+					 */
 #define NILFS_BMAP_PTR_U	(-1)	/* never perform pointer operations */
 
 #define NILFS_BMAP_USE_VBN(bmap)	((bmap)->b_ptr_type > 0)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b61c3e0eb342..8a3d3b65af3f 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 	void *kaddr;
 	int ret;
 
-	/* CP number is invalid if it's zero or larger than the
-	largest	exist one.*/
+	/*
+	 * CP number is invalid if it's zero or larger than the
+	 * largest existing one.
+	 */
 	if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
 		return -ENOENT;
 	down_read(&NILFS_MDT(cpfile)->mi_sem);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d9048f6a7780..1d2b1805327a 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -64,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
 	struct nilfs_palloc_req req;
 	int ret;
 
-	req.pr_entry_nr = 0;  /* 0 says find free inode from beginning of
-				 a group. dull code!! */
+	req.pr_entry_nr = 0;  /*
+			       * 0 says find free inode from beginning
+			       * of a group. dull code!!
+			       */
 	req.pr_entry_bh = NULL;
 
 	ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 83d2c485efba..a0ebdb17e912 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -129,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		/* Error handling should be detailed */
 		set_buffer_new(bh_result);
 		set_buffer_delay(bh_result);
-		map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
-						      to proper value */
+		map_bh(bh_result, inode->i_sb, 0);
+		/* Disk block number must be changed to proper value */
+
 	} else if (ret == -ENOENT) {
-		/* not found is not error (e.g. hole); must return without
-		   the mapped state flag. */
+		/*
+		 * not found is not error (e.g. hole); must return without
+		 * the mapped state flag.
+		 */
 		;
 	} else {
 		err = ret;
@@ -395,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 
 	err = nilfs_init_acl(inode, dir);
 	if (unlikely(err))
-		goto failed_after_creation; /* never occur. When supporting
-				    nilfs_init_acl(), proper cancellation of
-				    above jobs should be considered */
+		/*
+		 * Never occur.  When supporting nilfs_init_acl(),
+		 * proper cancellation of above jobs should be considered.
+		 */
+		goto failed_after_creation;
 
 	return inode;
 
  failed_after_creation:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
-	iput(inode);  /* raw_inode will be deleted through
-			 nilfs_evict_inode() */
+	iput(inode);  /*
+		       * raw_inode will be deleted through
+		       * nilfs_evict_inode().
+		       */
 	goto failed;
 
  failed_ifile_create_inode:
 	make_bad_inode(inode);
-	iput(inode);  /* if i_nlink == 1, generic_forget_inode() will be
-			 called */
+	iput(inode);
  failed:
 	return ERR_PTR(err);
 }
@@ -662,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode,
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
 			cpu_to_le64(huge_encode_dev(inode->i_rdev));
-	/* When extending inode, nilfs->ns_inode_size should be checked
-	   for substitutions of appended fields */
+	/*
+	 * When extending inode, nilfs->ns_inode_size should be checked
+	 * for substitutions of appended fields.
+	 */
 }
 
 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
@@ -681,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 
 	nilfs_write_inode_common(inode, raw_inode, 0);
-		/* XXX: call with has_bmap = 0 is a workaround to avoid
-		   deadlock of bmap. This delays update of i_bmap to just
-		   before writing */
+		/*
+		 * XXX: call with has_bmap = 0 is a workaround to avoid
+		 * deadlock of bmap.  This delays update of i_bmap to just
+		 * before writing.
+		 */
+
 	nilfs_ifile_unmap_inode(ifile, ino, ibh);
 }
 
@@ -748,8 +759,10 @@ void nilfs_truncate(struct inode *inode)
 	nilfs_mark_inode_dirty(inode);
 	nilfs_set_file_dirty(inode, 0);
 	nilfs_transaction_commit(sb);
-	/* May construct a logical segment and may fail in sync mode.
-	   But truncate has no return value. */
+	/*
+	 * May construct a logical segment and may fail in sync mode.
+	 * But truncate has no return value.
+	 */
 }
 
 static void nilfs_clear_inode(struct inode *inode)
@@ -806,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode)
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 	nilfs_transaction_commit(sb);
-	/* May construct a logical segment and may fail in sync mode.
-	   But delete_inode has no return value. */
+	/*
+	 * May construct a logical segment and may fail in sync mode.
+	 * But delete_inode has no return value.
+	 */
 }
 
 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
@@ -915,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
 	spin_lock(&nilfs->ns_inode_lock);
 	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
 	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
-		/* Because this routine may race with nilfs_dispose_list(),
-		   we have to check NILFS_I_QUEUED here, too. */
+		/*
+		 * Because this routine may race with nilfs_dispose_list(),
+		 * we have to check NILFS_I_QUEUED here, too.
+		 */
 		if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
-			/* This will happen when somebody is freeing
-			   this inode. */
+			/*
+			 * This will happen when somebody is freeing
+			 * this inode.
+			 */
 			nilfs_warning(inode->i_sb, __func__,
 				      "cannot get inode (ino=%lu)",
 				      inode->i_ino);
 			spin_unlock(&nilfs->ns_inode_lock);
-			return -EINVAL; /* NILFS_I_DIRTY may remain for
-					   freeing inode */
+			return -EINVAL; /*
+					 * NILFS_I_DIRTY may remain for
+					 * freeing inode.
+					 */
 		}
 		list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
 		set_bit(NILFS_I_QUEUED, &ii->i_state);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index ea320315d557..b1d48bc0532d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -64,8 +64,10 @@ struct nilfs_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
-					   disk inode */
+	struct buffer_head *i_bh;	/*
+					 * i_bh contains a new or dirty
+					 * disk inode.
+					 */
 	struct nilfs_root *i_root;
 	struct inode vfs_inode;
 };
@@ -95,8 +97,10 @@ enum {
 	NILFS_I_NEW = 0,		/* Inode is newly created */
 	NILFS_I_DIRTY,			/* The file is dirty */
 	NILFS_I_QUEUED,			/* inode is in dirty_files list */
-	NILFS_I_BUSY,			/* inode is grabbed by a segment
-					   constructor */
+	NILFS_I_BUSY,			/*
+					 * Inode is grabbed by a segment
+					 * constructor
+					 */
 	NILFS_I_COLLECTED,		/* All dirty blocks are collected */
 	NILFS_I_UPDATED,		/* The file has been written back */
 	NILFS_I_INODE_SYNC,		/* dsync is not allowed for inode */
@@ -140,8 +144,10 @@ enum {
 struct nilfs_transaction_info {
 	u32			ti_magic;
 	void		       *ti_save;
-				/* This should never used. If this happens,
-				   one of other filesystems has a bug. */
+				/*
+				 * This should never be used.  If it happens,
+				 * one of other filesystems has a bug.
+				 */
 	unsigned short		ti_flags;
 	unsigned short		ti_count;
 };
@@ -151,8 +157,10 @@ struct nilfs_transaction_info {
 
 /* ti_flags */
 #define NILFS_TI_DYNAMIC_ALLOC	0x0001  /* Allocated from slab */
-#define NILFS_TI_SYNC		0x0002	/* Force to construct segment at the
-					   end of transaction. */
+#define NILFS_TI_SYNC		0x0002	/*
+					 * Force to construct segment at the
+					 * end of transaction.
+					 */
 #define NILFS_TI_GC		0x0004	/* GC context */
 #define NILFS_TI_COMMIT		0x0008	/* Change happened or not */
 #define NILFS_TI_WRITER		0x0010	/* Constructor context */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index db156a199149..d893dc912b62 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -43,8 +43,10 @@ enum {
 
 /* work structure for recovery */
 struct nilfs_recovery_block {
-	ino_t ino;		/* Inode number of the file that this block
-				   belongs to */
+	ino_t ino;		/*
+				 * Inode number of the file that this block
+				 * belongs to
+				 */
 	sector_t blocknr;	/* block number */
 	__u64 vblocknr;		/* virtual block number */
 	unsigned long blkoff;	/* File offset of the data block (per block) */
@@ -869,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 
 		flags = le16_to_cpu(sum->ss_flags);
 		if (!(flags & NILFS_SS_SR) && !scan_newer) {
-			/* This will never happen because a superblock
-			   (last_segment) always points to a pseg
-			   having a super root. */
+			/*
+			 * This will never happen because a superblock
+			 * (last_segment) always points to a pseg with
+			 * a super root.
+			 */
 			ret = NILFS_SEG_FAIL_CONSISTENCY;
 			goto failed;
 		}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9ee03c262ea..e78b68a81aec 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -45,18 +45,26 @@
  */
 #define SC_N_INODEVEC	16   /* Size of locally allocated inode vector */
 
-#define SC_MAX_SEGDELTA 64   /* Upper limit of the number of segments
-				appended in collection retry loop */
+#define SC_MAX_SEGDELTA 64   /*
+			      * Upper limit of the number of segments
+			      * appended in collection retry loop
+			      */
 
 /* Construction mode */
 enum {
 	SC_LSEG_SR = 1,	/* Make a logical segment having a super root */
-	SC_LSEG_DSYNC,	/* Flush data blocks of a given file and make
-			   a logical segment without a super root */
-	SC_FLUSH_FILE,	/* Flush data files, leads to segment writes without
-			   creating a checkpoint */
-	SC_FLUSH_DAT,	/* Flush DAT file. This also creates segments without
-			   a checkpoint */
+	SC_LSEG_DSYNC,	/*
+			 * Flush data blocks of a given file and make
+			 * a logical segment without a super root.
+			 */
+	SC_FLUSH_FILE,	/*
+			 * Flush data files, leads to segment writes without
+			 * creating a checkpoint.
+			 */
+	SC_FLUSH_DAT,	/*
+			 * Flush DAT file.  This also creates segments
+			 * without a checkpoint.
+			 */
 };
 
 /* Stage numbers of dirty block collection */
@@ -438,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
 {
 	sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
 	if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
-		return -E2BIG; /* The current segment is filled up
-				  (internal code) */
+		return -E2BIG; /*
+				* The current segment is filled up
+				* (internal code)
+				*/
 	sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
 	return nilfs_segctor_reset_segment_buffer(sci);
 }
@@ -869,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
 					  &raw_cp, &bh_cp);
 	if (likely(!err)) {
-		/* The following code is duplicated with cpfile.  But, it is
-		   needed to collect the checkpoint even if it was not newly
-		   created */
+		/*
+		 * The following code is duplicated with cpfile.  But, it is
+		 * needed to collect the checkpoint even if it was not newly
+		 * created.
+		 */
 		mark_buffer_dirty(bh_cp);
 		nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
 		nilfs_cpfile_put_checkpoint(
@@ -1400,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs,
 	if (atomic_read(&segbuf->sb_err)) {
 		/* Case 1: The first segment failed */
 		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
-			/* Case 1a:  Partial segment appended into an existing
-			   segment */
+			/*
+			 * Case 1a:  Partial segment appended into an existing
+			 * segment
+			 */
 			nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
 						segbuf->sb_fseg_end);
 		else /* Case 1b:  New full segment */
@@ -1625,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
 static void nilfs_begin_page_io(struct page *page)
 {
 	if (!page || PageWriteback(page))
-		/* For split b-tree node pages, this function may be called
-		   twice.  We ignore the 2nd or later calls by this check. */
+		/*
+		 * For split b-tree node pages, this function may be called
+		 * twice.  We ignore the 2nd or later calls by this check.
+		 */
 		return;
 
 	lock_page(page);
@@ -2679,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
 {
 	int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
 
-	/* The segctord thread was stopped and its timer was removed.
-	   But some tasks remain. */
+	/*
+	 * The segctord thread was stopped and its timer was removed.
+	 * But some tasks remain.
+	 */
 	do {
 		struct nilfs_transaction_info ti;
 
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 27822e760d3f..6565c10b7b76 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -189,11 +189,15 @@ enum {
 	NILFS_SC_DIRTY,		/* One or more dirty meta-data blocks exist */
 	NILFS_SC_UNCLOSED,	/* Logical segment is not closed */
 	NILFS_SC_SUPER_ROOT,	/* The latest segment has a super root */
-	NILFS_SC_PRIOR_FLUSH,	/* Requesting immediate flush without making a
-				   checkpoint */
-	NILFS_SC_HAVE_DELTA,	/* Next checkpoint will have update of files
-				   other than DAT, cpfile, sufile, or files
-				   moved by GC */
+	NILFS_SC_PRIOR_FLUSH,	/*
+				 * Requesting immediate flush without making a
+				 * checkpoint
+				 */
+	NILFS_SC_HAVE_DELTA,	/*
+				 * Next checkpoint will have update of files
+				 * other than DAT, cpfile, sufile, or files
+				 * moved by GC.
+				 */
 };
 
 /* sc_state */
@@ -203,17 +207,23 @@ enum {
 /*
  * Constant parameters
  */
-#define NILFS_SC_CLEANUP_RETRY	    3  /* Retry count of construction when
-					  destroying segctord */
+#define NILFS_SC_CLEANUP_RETRY	    3  /*
+					* Retry count of construction when
+					* destroying segctord
+					*/
 
 /*
  * Default values of timeout, in seconds.
  */
-#define NILFS_SC_DEFAULT_TIMEOUT    5   /* Timeout value of dirty blocks.
-					   It triggers construction of a
-					   logical segment with a super root */
-#define NILFS_SC_DEFAULT_SR_FREQ    30  /* Maximum frequency of super root
-					   creation */
+#define NILFS_SC_DEFAULT_TIMEOUT    5   /*
+					 * Timeout value of dirty blocks.
+					 * It triggers construction of a
+					 * logical segment with a super root.
+					 */
+#define NILFS_SC_DEFAULT_SR_FREQ    30  /*
+					 * Maximum frequency of super root
+					 * creation
+					 */
 
 /*
  * The default threshold amount of data, in block counts.
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index b9e19ca3c96e..809bd2de7ad0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -617,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
 		err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
 		if (err)
 			goto out;
-			/* not failed_sbh; sbh is released automatically
-			   when reloading fails. */
+			/*
+			 * Not to failed_sbh; sbh is released automatically
+			 * when reloading fails.
+			 */
 	}
 	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
 	nilfs->ns_blocksize = blocksize;
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 3b584925d0e8..5988dd57ba66 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -127,10 +127,14 @@ struct nilfs_super_root {
 #define NILFS_MOUNT_ERRORS_RO		0x0020  /* Remount fs ro on errors */
 #define NILFS_MOUNT_ERRORS_PANIC	0x0040  /* Panic on errors */
 #define NILFS_MOUNT_BARRIER		0x1000  /* Use block barriers */
-#define NILFS_MOUNT_STRICT_ORDER	0x2000  /* Apply strict in-order
-						   semantics also for data */
-#define NILFS_MOUNT_NORECOVERY		0x4000  /* Disable write access during
-						   mount-time recovery */
+#define NILFS_MOUNT_STRICT_ORDER	0x2000  /*
+						 * Apply strict in-order
+						 * semantics also for data
+						 */
+#define NILFS_MOUNT_NORECOVERY		0x4000  /*
+						 * Disable write access during
+						 * mount-time recovery
+						 */
 #define NILFS_MOUNT_DISCARD		0x8000  /* Issue DISCARD requests */
 
 
@@ -142,16 +146,20 @@ struct nilfs_super_block {
 	__le16	s_minor_rev_level;	/* minor revision level */
 	__le16	s_magic;		/* Magic signature */
 
-	__le16  s_bytes;		/* Bytes count of CRC calculation
-					   for this structure. s_reserved
-					   is excluded. */
+	__le16  s_bytes;		/*
+					 * Bytes count of CRC calculation
+					 * for this structure. s_reserved
+					 * is excluded.
+					 */
 	__le16  s_flags;		/* flags */
 	__le32  s_crc_seed;		/* Seed value of CRC calculation */
 /*10*/	__le32	s_sum;			/* Check sum of super block */
 
-	__le32	s_log_block_size;	/* Block size represented as follows
-					   blocksize =
-					       1 << (s_log_block_size + 10) */
+	__le32	s_log_block_size;	/*
+					 * Block size represented as follows
+					 * blocksize =
+					 *     1 << (s_log_block_size + 10)
+					 */
 	__le64  s_nsegments;		/* Number of segments in filesystem */
 /*20*/	__le64  s_dev_size;		/* block device size in bytes */
 	__le64	s_first_data_block;	/* 1st seg disk block number */
@@ -163,8 +171,10 @@ struct nilfs_super_block {
 	__le64  s_last_seq;             /* seq. number of seg written last */
 /*50*/	__le64	s_free_blocks_count;	/* Free blocks count */
 
-	__le64	s_ctime;		/* Creation time (execution time of
-					   newfs) */
+	__le64	s_ctime;		/*
+					 * Creation time (execution time of
+					 * newfs)
+					 */
 /*60*/	__le64	s_mtime;		/* Mount time */
 	__le64	s_wtime;		/* Write time */
 /*70*/	__le16	s_mnt_count;		/* Mount count */
@@ -188,8 +198,10 @@ struct nilfs_super_block {
 /*A8*/	char	s_volume_name[80];	/* volume name */
 
 /*F8*/	__le32  s_c_interval;           /* Commit interval of segment */
-	__le32  s_c_block_max;          /* Threshold of data amount for
-					   the segment construction */
+	__le32  s_c_block_max;          /*
+					 * Threshold of data amount for
+					 * the segment construction
+					 */
 /*100*/	__le64  s_feature_compat;	/* Compatible feature set */
 	__le64  s_feature_compat_ro;	/* Read-only compatible feature set */
 	__le64  s_feature_incompat;	/* Incompatible feature set */
@@ -242,12 +254,18 @@ struct nilfs_super_block {
 
 #define NILFS_SB_OFFSET_BYTES	1024	/* byte offset of nilfs superblock */
 
-#define NILFS_SEG_MIN_BLOCKS	16	/* Minimum number of blocks in
-					   a full segment */
-#define NILFS_PSEG_MIN_BLOCKS	2	/* Minimum number of blocks in
-					   a partial segment */
-#define NILFS_MIN_NRSVSEGS	8	/* Minimum number of reserved
-					   segments */
+#define NILFS_SEG_MIN_BLOCKS	16	/*
+					 * Minimum number of blocks in
+					 * a full segment
+					 */
+#define NILFS_PSEG_MIN_BLOCKS	2	/*
+					 * Minimum number of blocks in
+					 * a partial segment
+					 */
+#define NILFS_MIN_NRSVSEGS	8	/*
+					 * Minimum number of reserved
+					 * segments
+					 */
 
 /*
  * We call DAT, cpfile, and sufile root metadata files.  Inodes of
@@ -513,9 +531,11 @@ struct nilfs_checkpoint {
 	__le64 cp_inodes_count;
 	__le64 cp_blocks_count;
 
-	/* Do not change the byte offset of ifile inode.
-	   To keep the compatibility of the disk format,
-	   additional fields should be added behind cp_ifile_inode. */
+	/*
+	 * Do not change the byte offset of ifile inode.
+	 * To keep the compatibility of the disk format,
+	 * additional fields should be added behind cp_ifile_inode.
+	 */
 	struct nilfs_inode cp_ifile_inode;
 };
 
-- 
cgit v1.2.3


From c96fc2d85f4a827e3bb2abe7de2394a1fb8a0fe7 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 23 May 2016 16:23:57 -0700
Subject: signal: make oom_flags a bool

Currently the size of "struct signal_struct"->oom_flags member is
sizeof(unsigned) bytes, but only one flag OOM_FLAG_ORIGIN which is
updated by current thread is defined.  We can convert OOM_FLAG_ORIGIN
into a bool, and reuse the saved bytes for updating from the OOM killer
and/or the OOM reaper thread.

By the way, do we care about a race window between run_store() and
swapoff() because it would be theoretically possible that two threads
sharing the "struct signal_struct" concurrently call respective
functions? If we care, we can make oom_flags an atomic_t.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h   | 9 +++------
 include/linux/sched.h | 6 +++++-
 include/linux/types.h | 1 -
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index d3f533f2f481..83469522690a 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -50,24 +50,21 @@ enum oom_scan_t {
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
-/* Thread is the potential origin of an oom condition; kill first on oom */
-#define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
-
 extern struct mutex oom_lock;
 
 static inline void set_current_oom_origin(void)
 {
-	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
+	current->signal->oom_flag_origin = true;
 }
 
 static inline void clear_current_oom_origin(void)
 {
-	current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
+	current->signal->oom_flag_origin = false;
 }
 
 static inline bool oom_task_origin(const struct task_struct *p)
 {
-	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
+	return p->signal->oom_flag_origin;
 }
 
 extern void mark_oom_victim(struct task_struct *tsk);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c036de6c1ee..21c26e78aec5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -794,7 +794,11 @@ struct signal_struct {
 	struct tty_audit_buf *tty_audit_buf;
 #endif
 
-	oom_flags_t oom_flags;
+	/*
+	 * Thread is the potential origin of an oom condition; kill first on
+	 * oom
+	 */
+	bool oom_flag_origin;
 	short oom_score_adj;		/* OOM kill score adjustment */
 	short oom_score_adj_min;	/* OOM kill score adjustment min value.
 					 * Only settable by CAP_SYS_RESOURCE. */
diff --git a/include/linux/types.h b/include/linux/types.h
index 70dd3dfde631..baf718324f4a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,7 +156,6 @@ typedef u32 dma_addr_t;
 
 typedef unsigned __bitwise__ gfp_t;
 typedef unsigned __bitwise__ fmode_t;
-typedef unsigned __bitwise__ oom_flags_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;
-- 
cgit v1.2.3


From 5c8ccefdf46c5f87d87b694c7fbc04941c2c99a5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 May 2016 16:24:02 -0700
Subject: signal: move the "sig < SIGRTMIN" check into siginmask(sig)

All the users of siginmask() must ensure that sig < SIGRTMIN.  sig_fatal()
doesn't and this is wrong:

	UBSAN: Undefined behaviour in kernel/signal.c:911:6
	shift exponent 32 is too large for 32-bit type 'long unsigned int'

the patch doesn't add the neccesary check to sig_fatal(), it moves the
check into siginmask() and updates other callers.

Link: http://lkml.kernel.org/r/20160517195052.GA15187@redhat.com
Reported-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 639be264f5f9..b63f63eaa39c 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -400,7 +400,9 @@ int unhandled_signal(struct task_struct *tsk, int sig);
 #else
 #define rt_sigmask(sig)	sigmask(sig)
 #endif
-#define siginmask(sig, mask) (rt_sigmask(sig) & (mask))
+
+#define siginmask(sig, mask) \
+	((sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))
 
 #define SIG_KERNEL_ONLY_MASK (\
 	rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))
@@ -421,14 +423,10 @@ int unhandled_signal(struct task_struct *tsk, int sig);
         rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
 	rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )
 
-#define sig_kernel_only(sig) \
-	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
-#define sig_kernel_coredump(sig) \
-	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
-#define sig_kernel_ignore(sig) \
-	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK))
-#define sig_kernel_stop(sig) \
-	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
+#define sig_kernel_only(sig)		siginmask(sig, SIG_KERNEL_ONLY_MASK)
+#define sig_kernel_coredump(sig)	siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
+#define sig_kernel_ignore(sig)		siginmask(sig, SIG_KERNEL_IGNORE_MASK)
+#define sig_kernel_stop(sig)		siginmask(sig, SIG_KERNEL_STOP_MASK)
 
 #define sig_user_defined(t, signr) \
 	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
-- 
cgit v1.2.3


From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:10 -0700
Subject: kexec: introduce a protection mechanism for the crashkernel reserved
 memory

For the cases that some kernel (module) path stamps the crash reserved
memory(already mapped by the kernel) where has been loaded the second
kernel data, the kdump kernel will probably fail to boot when panic
happens (or even not happens) leaving the culprit at large, this is
unacceptable.

The patch introduces a mechanism for detecting such cases:

1) After each crash kexec loading, it simply marks the reserved memory
   regions readonly since we no longer access it after that.  When someone
   stamps the region, the first kernel will panic and trigger the kdump.
   The weak arch_kexec_protect_crashkres() is introduced to do the actual
   protection.

2) To allow multiple loading, once 1) was done we also need to remark
   the reserved memory to readwrite each time a system call related to
   kdump is made.  The weak arch_kexec_unprotect_crashkres() is introduced
   to do the actual protection.

The architecture can make its specific implementation by overriding
arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 2 ++
 kernel/kexec.c        | 9 ++++++++-
 kernel/kexec_core.c   | 6 ++++++
 kernel/kexec_file.c   | 8 +++++++-
 4 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 2cc643c6e870..643ff4a3fbf6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
 					Elf_Shdr *sechdrs, unsigned int relsec);
 int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 					unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);
 
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..b44cb3f5a15c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH)
+	if (flags & KEXEC_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
+
 	if (nr_segments > 0) {
 		unsigned long i;
 
@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	image = xchg(dest_image, image);
 
 out:
+	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5d408252992..48b73cc8e425 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void)
 
 void __weak crash_unmap_reserved_pages(void)
 {}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c72d2ff5896e..503bc2d348e5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_FILE_ON_CRASH)
+	if (flags & KEXEC_FILE_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
 
 	if (flags & KEXEC_FILE_UNLOAD)
 		goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 exchange:
 	image = xchg(dest_image, image);
 out:
+	if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 	return ret;
-- 
cgit v1.2.3


From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:22 -0700
Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and
 arch_kexec_protect(unprotect)_crashkres()

Commit 3f625002581b ("kexec: introduce a protection mechanism for the
crashkernel reserved memory") is a similar mechanism for protecting the
crash kernel reserved memory to previous crash_map/unmap_reserved_pages()
implementation, the new one is more generic in name and cleaner in code
(besides, some arch may not be allowed to unmap the pgtable).

Therefore, this patch consolidates them, and uses the new
arch_kexec_protect(unprotect)_crashkres() to replace former
crash_map/unmap_reserved_pages() which by now has been only used by
S390.

The consolidation work needs the crash memory to be mapped initially,
this is done in machine_kdump_pm_init() which is after
reserve_crashkernel().  Once kdump kernel is loaded, the new
arch_kexec_protect_crashkres() implemented for S390 will actually
unmap the pgtable like before.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/machine_kexec.c | 28 ++++++++++++++++++----------
 include/linux/kexec.h            |  2 --
 kernel/kexec.c                   | 12 ------------
 kernel/kexec_core.c              | 11 ++---------
 4 files changed, 20 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 2f1b7217c25c..0e64f08d3d69 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -43,13 +43,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
 	switch (action) {
 	case PM_SUSPEND_PREPARE:
 	case PM_HIBERNATION_PREPARE:
-		if (crashk_res.start)
-			crash_map_reserved_pages();
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
 		break;
 	case PM_POST_SUSPEND:
 	case PM_POST_HIBERNATION:
-		if (crashk_res.start)
-			crash_unmap_reserved_pages();
+		if (kexec_crash_image)
+			arch_kexec_protect_crashkres();
 		break;
 	default:
 		return NOTIFY_DONE;
@@ -60,6 +60,8 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
 static int __init machine_kdump_pm_init(void)
 {
 	pm_notifier(machine_kdump_pm_cb, 0);
+	/* Create initial mapping for crashkernel memory */
+	arch_kexec_unprotect_crashkres();
 	return 0;
 }
 arch_initcall(machine_kdump_pm_init);
@@ -146,6 +148,8 @@ static int kdump_csum_valid(struct kimage *image)
 #endif
 }
 
+#ifdef CONFIG_CRASH_DUMP
+
 /*
  * Map or unmap crashkernel memory
  */
@@ -167,21 +171,25 @@ static void crash_map_pages(int enable)
 }
 
 /*
- * Map crashkernel memory
+ * Unmap crashkernel memory
  */
-void crash_map_reserved_pages(void)
+void arch_kexec_protect_crashkres(void)
 {
-	crash_map_pages(1);
+	if (crashk_res.end)
+		crash_map_pages(0);
 }
 
 /*
- * Unmap crashkernel memory
+ * Map crashkernel memory
  */
-void crash_unmap_reserved_pages(void)
+void arch_kexec_unprotect_crashkres(void)
 {
-	crash_map_pages(0);
+	if (crashk_res.end)
+		crash_map_pages(1);
 }
 
+#endif
+
 /*
  * Give back memory to hypervisor before new kdump is loaded
  */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 643ff4a3fbf6..e8acb2b43dd9 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -230,8 +230,6 @@ extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 void crash_save_cpu(struct pt_regs *regs, int cpu);
 void crash_save_vmcoreinfo(void);
-void crash_map_reserved_pages(void);
-void crash_unmap_reserved_pages(void);
 void arch_crash_save_vmcoreinfo(void);
 __printf(1, 2)
 void vmcoreinfo_append_str(const char *fmt, ...);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b73dc211fcfd..4384672d3245 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (ret)
 		return ret;
 
-	if (flags & KEXEC_ON_CRASH)
-		crash_map_reserved_pages();
-
 	if (flags & KEXEC_PRESERVE_CONTEXT)
 		image->preserve_context = 1;
 
@@ -161,12 +158,6 @@ out:
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
 
-	/*
-	 * Once the reserved memory is mapped, we should unmap this memory
-	 * before returning
-	 */
-	if (flags & KEXEC_ON_CRASH)
-		crash_unmap_reserved_pages();
 	kimage_free(image);
 	return ret;
 }
@@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 
 	result = do_kexec_load(entry, nr_segments, segments, flags);
 
-	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
-		arch_kexec_protect_crashkres();
-
 	mutex_unlock(&kexec_mutex);
 
 	return result;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 48b73cc8e425..56b3ed0927b0 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size)
 	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
 	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
 
-	crash_map_reserved_pages();
 	crash_free_reserved_phys_range(end, crashk_res.end);
 
 	if ((start == end) && (crashk_res.parent != NULL))
@@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size)
 	crashk_res.end = end - 1;
 
 	insert_resource(&iomem_resource, ram_res);
-	crash_unmap_reserved_pages();
 
 unlock:
 	mutex_unlock(&kexec_mutex);
@@ -1553,17 +1551,12 @@ int kernel_kexec(void)
 }
 
 /*
- * Add and remove page tables for crashkernel memory
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
  *
  * Provide an empty default implementation here -- architecture
  * code may override this
  */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
 void __weak arch_kexec_protect_crashkres(void)
 {}
 
-- 
cgit v1.2.3


From 9fbeb5ab59a2b2a09cca2eb68283e7a090d4b98d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:30 -0700
Subject: mm: make vm_mmap killable

All the callers of vm_mmap seem to check for the failure already and
bail out in one way or another on the error which means that we can
change it to use killable version of vm_mmap_pgoff and return -EINTR if
the current task gets killed while waiting for mmap_sem.  This also
means that vm_mmap_pgoff can be killable by default and drop the
additional parameter.

This will help in the OOM conditions when the oom victim might be stuck
waiting for the mmap_sem for write which in turn can block oom_reaper
which relies on the mmap_sem for read to make a forward progress and
reclaim the address space of the victim.

Please note that load_elf_binary is ignoring vm_mmap error for
current->personality & MMAP_PAGE_ZERO case but that shouldn't be a
problem because the address is not used anywhere and we never return to
the userspace if we got killed.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/internal.h      |  3 +--
 mm/mmap.c          |  2 +-
 mm/nommu.c         |  2 +-
 mm/util.c          | 13 ++++---------
 5 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b530c99e8e81..d5eb8dddd7c0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2013,7 +2013,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
-extern unsigned long vm_mmap(struct file *, unsigned long,
+extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
         unsigned long, unsigned long);
 
diff --git a/mm/internal.h b/mm/internal.h
index bff7fd702331..a37e5b6f9d25 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,8 +444,7 @@ extern u32 hwpoison_filter_enable;
 
 extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long,
-        unsigned long, unsigned long,
-        bool);
+        unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
diff --git a/mm/mmap.c b/mm/mmap.c
index 11e1f2ca72af..420088682d4a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1333,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 out_fput:
 	if (file)
 		fput(file);
diff --git a/mm/nommu.c b/mm/nommu.c
index b74512746aae..c8bd59a03c71 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
 	if (file)
 		fput(file);
diff --git a/mm/util.c b/mm/util.c
index 03b237746850..917e0e3d0f8e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
-	unsigned long flag, unsigned long pgoff, bool killable)
+	unsigned long flag, unsigned long pgoff)
 {
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
@@ -297,12 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
-		if (killable) {
-			if (down_write_killable(&mm->mmap_sem))
-				return -EINTR;
-		} else {
-			down_write(&mm->mmap_sem);
-		}
+		if (down_write_killable(&mm->mmap_sem))
+			return -EINTR;
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate);
 		up_write(&mm->mmap_sem);
@@ -312,7 +308,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	return ret;
 }
 
-/* XXX are all callers checking an error */
 unsigned long vm_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long offset)
@@ -322,7 +317,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 	if (unlikely(offset_in_page(offset)))
 		return -EINVAL;
 
-	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false);
+	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
 }
 EXPORT_SYMBOL(vm_mmap);
 
-- 
cgit v1.2.3


From 2d6c928241add2848e4eebfce407e95164229976 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:42 -0700
Subject: mm: make vm_brk killable

Now that all the callers handle vm_brk failure we can change it wait for
mmap_sem killable to help oom_reaper to not get blocked just because
vm_brk gets blocked behind mmap_sem readers.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/mmap.c          | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d5eb8dddd7c0..2835d598d258 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,7 +2011,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
 /* These take the mm semaphore themselves */
-extern unsigned long vm_brk(unsigned long, unsigned long);
+extern unsigned long __must_check vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index ca292a7c2b68..d3d9a94ca031 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2712,12 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	unsigned long ret;
 	bool populate;
 
-	/*
-	 * XXX not all users are chcecking the return value, convert
-	 * to down_write_killable after they are able to cope with
-	 * error
-	 */
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	ret = do_brk(addr, len);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
-- 
cgit v1.2.3


From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 May 2016 11:19:07 +0200
Subject: sched/core: Fix remote wakeups

Commit:

  b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")

... introduced a bug: Mike Galbraith found that it introduced a
performance regression, while Paul E. McKenney reported lost
wakeups and bisected it to this commit.

The reason is that I mis-read ttwu_queue() such that I assumed any
wakeup that got a remote queue must have had the task migrated.

Since this is not so; we need to transfer this information between
queueing the wakeup and actually doing the wakeup. Use a new
task_struct::sched_flag for this, we already write to
sched_contributes_to_load in the wakeup path so this is a hot and
modified cacheline.

Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ben Segall <bsegall@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")
Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6cc0df970f1a..e053517a88b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1533,6 +1533,7 @@ struct task_struct {
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
+	unsigned sched_remote_wakeup:1;
 	unsigned :0; /* force alignment to the next boundary */
 
 	/* unserialized, strictly 'current' */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 404c0784b1fc..7f2cae4620c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
 	cookie = lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
+		int wake_flags = 0;
+
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		/*
-		 * See ttwu_queue(); we only call ttwu_queue_remote() when
-		 * its a x-cpu wakeup.
-		 */
-		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+		if (p->sched_remote_wakeup)
+			wake_flags = WF_MIGRATED;
+
+		ttwu_do_activate(rq, p, wake_flags, cookie);
 	}
 
 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
 	irq_exit();
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 
+	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
 		if (!set_nr_if_polling(rq->idle))
 			smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
-		ttwu_queue_remote(p, cpu);
+		ttwu_queue_remote(p, cpu, wake_flags);
 		return;
 	}
 #endif
-- 
cgit v1.2.3


From 536a6f88c49dd739961ffd53774775afed852c83 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Wed, 18 May 2016 13:26:23 +0200
Subject: KVM: Create debugfs dir and stat files for each VM

This patch adds a kvm debugfs subdirectory for each VM, which is named
after its pid and file descriptor. The directories contain the same
kind of files that are already in the kvm debugfs directory, but the
data exported through them is now VM specific.

This makes the debugfs kvm data a convenient alternative to the
tracepoints which already have per VM data. The debugfs data is easy
to read and low overhead.

CC: Dan Carpenter <dan.carpenter@oracle.com> [includes fixes by Dan Carpenter]
Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |   7 ++
 virt/kvm/kvm_main.c      | 187 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 184 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b1fa8f11c95b..1c9c973a7dd9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -412,6 +412,8 @@ struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+	struct dentry *debugfs_dentry;
+	struct kvm_stat_data **debugfs_stat_data;
 };
 
 #define kvm_err(fmt, ...) \
@@ -991,6 +993,11 @@ enum kvm_stat_kind {
 	KVM_STAT_VCPU,
 };
 
+struct kvm_stat_data {
+	int offset;
+	struct kvm *kvm;
+};
+
 struct kvm_stats_debugfs_item {
 	const char *name;
 	int offset;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dd4ac9d9e8f5..37af23052470 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -63,6 +63,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+/* Worst case buffer size needed for holding an integer. */
+#define ITOA_MAX_LEN 12
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops;
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
+static int kvm_debugfs_num_entries;
+static const struct file_operations *stat_fops_per_vm[];
+
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
 #ifdef CONFIG_KVM_COMPAT
@@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 	kvfree(slots);
 }
 
+static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+{
+	int i;
+
+	if (!kvm->debugfs_dentry)
+		return;
+
+	debugfs_remove_recursive(kvm->debugfs_dentry);
+
+	for (i = 0; i < kvm_debugfs_num_entries; i++)
+		kfree(kvm->debugfs_stat_data[i]);
+	kfree(kvm->debugfs_stat_data);
+}
+
+static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+{
+	char dir_name[ITOA_MAX_LEN * 2];
+	struct kvm_stat_data *stat_data;
+	struct kvm_stats_debugfs_item *p;
+
+	if (!debugfs_initialized())
+		return 0;
+
+	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
+	kvm->debugfs_dentry = debugfs_create_dir(dir_name,
+						 kvm_debugfs_dir);
+	if (!kvm->debugfs_dentry)
+		return -ENOMEM;
+
+	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
+					 sizeof(*kvm->debugfs_stat_data),
+					 GFP_KERNEL);
+	if (!kvm->debugfs_stat_data)
+		return -ENOMEM;
+
+	for (p = debugfs_entries; p->name; p++) {
+		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+		if (!stat_data)
+			return -ENOMEM;
+
+		stat_data->kvm = kvm;
+		stat_data->offset = p->offset;
+		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
+		if (!debugfs_create_file(p->name, 0444,
+					 kvm->debugfs_dentry,
+					 stat_data,
+					 stat_fops_per_vm[p->kind]))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
 	int r, i;
@@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
@@ -2999,8 +3058,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 	}
 #endif
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
-	if (r < 0)
+	if (r < 0) {
 		kvm_put_kvm(kvm);
+		return r;
+	}
+
+	if (kvm_create_vm_debugfs(kvm, r) < 0) {
+		kvm_put_kvm(kvm);
+		return -ENOMEM;
+	}
 
 	return r;
 }
@@ -3425,15 +3491,114 @@ static struct notifier_block kvm_cpu_notifier = {
 	.notifier_call = kvm_cpu_hotplug,
 };
 
+static int kvm_debugfs_open(struct inode *inode, struct file *file,
+			   int (*get)(void *, u64 *), int (*set)(void *, u64),
+			   const char *fmt)
+{
+	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+					  inode->i_private;
+
+	/* The debugfs files are a reference to the kvm struct which
+	 * is still valid when kvm_destroy_vm is called.
+	 * To avoid the race between open and the removal of the debugfs
+	 * directory we test against the users count.
+	 */
+	if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0))
+		return -ENOENT;
+
+	if (simple_attr_open(inode, file, get, set, fmt)) {
+		kvm_put_kvm(stat_data->kvm);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int kvm_debugfs_release(struct inode *inode, struct file *file)
+{
+	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+					  inode->i_private;
+
+	simple_attr_release(inode, file);
+	kvm_put_kvm(stat_data->kvm);
+
+	return 0;
+}
+
+static int vm_stat_get_per_vm(void *data, u64 *val)
+{
+	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+
+	*val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+
+	return 0;
+}
+
+static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+	__simple_attr_check_format("%llu\n", 0ull);
+	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
+				NULL, "%llu\n");
+}
+
+static const struct file_operations vm_stat_get_per_vm_fops = {
+	.owner   = THIS_MODULE,
+	.open    = vm_stat_get_per_vm_open,
+	.release = kvm_debugfs_release,
+	.read    = simple_attr_read,
+	.write   = simple_attr_write,
+	.llseek  = generic_file_llseek,
+};
+
+static int vcpu_stat_get_per_vm(void *data, u64 *val)
+{
+	int i;
+	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+	struct kvm_vcpu *vcpu;
+
+	*val = 0;
+
+	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
+		*val += *(u32 *)((void *)vcpu + stat_data->offset);
+
+	return 0;
+}
+
+static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+	__simple_attr_check_format("%llu\n", 0ull);
+	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
+				 NULL, "%llu\n");
+}
+
+static const struct file_operations vcpu_stat_get_per_vm_fops = {
+	.owner   = THIS_MODULE,
+	.open    = vcpu_stat_get_per_vm_open,
+	.release = kvm_debugfs_release,
+	.read    = simple_attr_read,
+	.write   = simple_attr_write,
+	.llseek  = generic_file_llseek,
+};
+
+static const struct file_operations *stat_fops_per_vm[] = {
+	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
+	[KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+};
+
 static int vm_stat_get(void *_offset, u64 *val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
+	struct kvm_stat_data stat_tmp = {.offset = offset};
+	u64 tmp_val;
 
 	*val = 0;
 	spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		*val += *(u32 *)((void *)kvm + offset);
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		stat_tmp.kvm = kvm;
+		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+		*val += tmp_val;
+	}
 	spin_unlock(&kvm_lock);
 	return 0;
 }
@@ -3444,15 +3609,16 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
-	struct kvm_vcpu *vcpu;
-	int i;
+	struct kvm_stat_data stat_tmp = {.offset = offset};
+	u64 tmp_val;
 
 	*val = 0;
 	spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			*val += *(u32 *)((void *)vcpu + offset);
-
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		stat_tmp.kvm = kvm;
+		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+		*val += tmp_val;
+	}
 	spin_unlock(&kvm_lock);
 	return 0;
 }
@@ -3473,7 +3639,8 @@ static int kvm_init_debug(void)
 	if (kvm_debugfs_dir == NULL)
 		goto out;
 
-	for (p = debugfs_entries; p->name; ++p) {
+	kvm_debugfs_num_entries = 0;
+	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
 		if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
 					 (void *)(long)p->offset,
 					 stat_fops[p->kind]))
-- 
cgit v1.2.3


From 13d1ad16d05eebb4db977eb955716b9da2c19fbd Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 27 Apr 2016 14:15:51 +0200
Subject: libceph: move message allocation out of ceph_osdc_alloc_request()

The size of ->r_request and ->r_reply messages depends on the size of
the object name (ceph_object_id), while the size of ceph_osd_request is
fixed.  Move message allocation into a separate function that would
have to be called after ceph_object_id and ceph_object_locator (which
is also going to become variable in size with RADOS namespaces) have
been filled in:

    req = ceph_osdc_alloc_request(...);
    <fill in req->r_base_oid>
    <fill in req->r_base_oloc>
    ceph_osdc_alloc_messages(req);

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             | 18 ++++++++-
 fs/ceph/addr.c                  |  8 ++++
 fs/ceph/file.c                  |  7 ++++
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/osd_client.c           | 88 +++++++++++++++++++++++------------------
 5 files changed, 82 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c3089f32a392..bda4deade82e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1954,7 +1954,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
 					  GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1967,7 +1967,14 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
 
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
+
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 /*
@@ -2003,7 +2010,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
 						false, GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 	osd_req->r_callback = rbd_osd_req_callback;
@@ -2012,7 +2019,14 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
 
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
+
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e61fc8bb371..6fee7e0b8931 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1762,6 +1762,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		 "%llx.00000000", ci->i_vino.ino);
 	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
 
+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
+
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
 					 1, false, GFP_NOFS);
 	if (!wr_req) {
@@ -1775,6 +1779,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	wr_req->r_base_oloc.pool = pool;
 	wr_req->r_base_oid = rd_req->r_base_oid;
 
+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
+
 	/* one page should be large enough for STAT data */
 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
 	if (IS_ERR(pages)) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a79f9269831e..5d46d106bbb7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -717,6 +717,13 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_base_oloc = orig_req->r_base_oloc;
 	req->r_base_oid = orig_req->r_base_oid;
 
+	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+	if (ret) {
+		ceph_osdc_put_request(req);
+		req = orig_req;
+		goto out;
+	}
+
 	req->r_ops[0] = orig_req->r_ops[0];
 	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cbf460927c42..66a1fcd5bff7 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -322,6 +322,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       unsigned int num_ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
 				    struct ceph_snap_context *snapc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ccb9539dc780..d66dacc9d0d4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -369,8 +369,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
-	struct ceph_msg *msg;
-	size_t msg_size;
 
 	if (use_mempool) {
 		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -407,53 +405,59 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	req->r_base_oloc.pool = -1;
 	req->r_target_oloc.pool = -1;
 
-	msg_size = OSD_OPREPLY_FRONT_LEN;
-	if (num_ops > CEPH_OSD_SLAB_OPS) {
-		/* ceph_osd_op and rval */
-		msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
-			    (sizeof(struct ceph_osd_op) + 4);
-	}
+	dout("%s req %p\n", __func__, req);
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-	/* create reply message */
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
-				   gfp_flags, true);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-	req->r_reply = msg;
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_msg *msg;
+	int msg_size;
 
+	/* create request message */
 	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
 	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
 	msg_size += 1 + 8 + 4 + 4; /* pgid */
-	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
-	msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+	msg_size += 4 + req->r_base_oid.name_len; /* oid */
+	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
 	msg_size += 8; /* snapid */
 	msg_size += 8; /* snap_seq */
-	msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
 	msg_size += 4; /* retry_attempt */
 
-	/* create request message; allow space for oid */
-	if (use_mempool)
+	if (req->r_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+	if (!msg)
+		return -ENOMEM;
 
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
-
 	req->r_request = msg;
 
-	return req;
+	/* create reply message */
+	msg_size = OSD_OPREPLY_FRONT_LEN;
+	if (req->r_num_ops > CEPH_OSD_SLAB_OPS) {
+		/* ceph_osd_op and rval */
+		msg_size += (req->r_num_ops - CEPH_OSD_SLAB_OPS) *
+			    (sizeof(struct ceph_osd_op) + 4);
+	}
+
+	if (req->r_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+	if (!msg)
+		return -ENOMEM;
+
+	req->r_reply = msg;
+
+	return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
 
 static bool osd_req_opcode_valid(u16 opcode)
 {
@@ -828,17 +832,17 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
 					GFP_NOFS);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
+	if (!req) {
+		r = -ENOMEM;
+		goto fail;
+	}
 
 	req->r_flags = flags;
 
 	/* calculate max write size */
 	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
-	if (r < 0) {
-		ceph_osdc_put_request(req);
-		return ERR_PTR(r);
-	}
+	if (r)
+		goto fail;
 
 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
 		osd_req_op_init(req, which, opcode, 0);
@@ -864,7 +868,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		 "%llx.%08llx", vino.ino, objnum);
 	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
 
+	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+	if (r)
+		goto fail;
+
 	return req;
+
+fail:
+	ceph_osdc_put_request(req);
+	return ERR_PTR(r);
 }
 EXPORT_SYMBOL(ceph_osdc_new_request);
 
-- 
cgit v1.2.3


From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 29 Apr 2016 19:54:20 +0200
Subject: libceph: variable-sized ceph_object_id

Currently ceph_object_id can hold object names of up to 100
(CEPH_MAX_OID_NAME_LEN) characters.  This is enough for all use cases,
expect one - long rbd image names:

- a format 1 header is named "<imgname>.rbd"
- an object that points to a format 2 header is named "rbd_id.<imgname>"

We operate on these potentially long-named objects during rbd map, and,
for format 1 images, during header refresh.  (A format 2 header name is
a small system-generated string.)

Lift this 100 character limit by making ceph_object_id be able to point
to an externally-allocated string.  Apart from being able to work with
almost arbitrarily-long named objects, this allows us to reduce the
size of ceph_object_id from >100 bytes to 64 bytes.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c         |  8 +++-
 fs/ceph/addr.c              |  6 +--
 fs/ceph/file.c              |  2 +-
 fs/ceph/ioctl.c             |  2 +-
 include/linux/ceph/osdmap.h | 62 ++++++++++++++++++------------
 net/ceph/debugfs.c          |  2 +-
 net/ceph/osd_client.c       | 16 +++++---
 net/ceph/osdmap.c           | 93 ++++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 150 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bda4deade82e..3bf93a2a20f0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1965,7 +1965,9 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
 
 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
 		goto fail;
@@ -2017,7 +2019,9 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
 
 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
 		goto fail;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6fee7e0b8931..6f28dd9bacb2 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1758,9 +1758,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-		 "%llx.00000000", ci->i_vino.ino);
-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
 
 	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
 	if (err)
@@ -1777,7 +1775,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
 	wr_req->r_base_oloc.pool = pool;
-	wr_req->r_base_oid = rd_req->r_base_oid;
+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
 
 	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
 	if (err)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5d46d106bbb7..9d470397e249 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -715,7 +715,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 			CEPH_OSD_FLAG_ONDISK |
 			CEPH_OSD_FLAG_WRITE;
 	req->r_base_oloc = orig_req->r_base_oloc;
-	req->r_base_oid = orig_req->r_base_oid;
+	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
 	if (ret) {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..db296709784a 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -213,7 +213,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		 ceph_ino(inode), dl.object_no);
 
 	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
-	ceph_oid_set_name(&oid, dl.object_name);
+	ceph_oid_printf(&oid, "%s", dl.object_name);
 
 	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
 	if (r < 0) {
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..777a29412706 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -64,11 +64,47 @@ struct ceph_object_locator {
  */
 #define CEPH_MAX_OID_NAME_LEN 100
 
+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around.  It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
 struct ceph_object_id {
-	char name[CEPH_MAX_OID_NAME_LEN];
+	char *name;
+	char inline_name[CEPH_OID_INLINE_LEN];
 	int name_len;
 };
 
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+	oid->name = oid->inline_name;
+	oid->name_len = 0;
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+	return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+		   const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+		     const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
 struct ceph_pg_mapping {
 	struct rb_node node;
 	struct ceph_pg pgid;
@@ -113,30 +149,6 @@ struct ceph_osdmap {
 	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };
 
-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
-				     const char *name)
-{
-	int len;
-
-	len = strlen(name);
-	if (len > sizeof(oid->name)) {
-		WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
-		     name, len, sizeof(oid->name));
-		len = sizeof(oid->name);
-	}
-
-	memcpy(oid->name, name, len);
-	oid->name_len = len;
-}
-
-static inline void ceph_oid_copy(struct ceph_object_id *dest,
-				 struct ceph_object_id *src)
-{
-	BUG_ON(src->name_len > sizeof(dest->name));
-	memcpy(dest->name, src->name, src->name_len);
-	dest->name_len = src->name_len;
-}
-
 static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
 {
 	return osd >= 0 && osd < map->max_osd &&
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..6f8413293d15 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 			   req->r_osd ? req->r_osd->o_osd : -1,
 			   req->r_pgid.pool, req->r_pgid.seed);
 
-		seq_printf(s, "%.*s", req->r_base_oid.name_len,
+		seq_printf(s, "%*pE", req->r_base_oid.name_len,
 			   req->r_base_oid.name);
 
 		if (req->r_reassert_version.epoch)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 75e27bd3d372..95910aed8e2e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -334,7 +334,10 @@ static void ceph_osdc_release_request(struct kref *kref)
 	for (which = 0; which < req->r_num_ops; which++)
 		osd_req_op_data_release(req, which);
 
+	ceph_oid_destroy(&req->r_base_oid);
+	ceph_oid_destroy(&req->r_target_oid);
 	ceph_put_snap_context(req->r_snapc);
+
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -401,7 +404,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
+	ceph_oid_init(&req->r_base_oid);
 	req->r_base_oloc.pool = -1;
+	ceph_oid_init(&req->r_target_oid);
 	req->r_target_oloc.pool = -1;
 
 	dout("%s req %p\n", __func__, req);
@@ -415,6 +420,8 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
 	struct ceph_msg *msg;
 	int msg_size;
 
+	WARN_ON(ceph_oid_empty(&req->r_base_oid));
+
 	/* create request message */
 	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
 	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
@@ -859,10 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	}
 
 	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
-
-	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
-		 "%llx.%08llx", vino.ino, objnum);
-	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
 
 	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
 	if (r)
@@ -1410,7 +1414,7 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap,
 		req->r_target_oloc = req->r_base_oloc; /* struct */
 		need_check_tiering = true;
 	}
-	if (req->r_target_oid.name_len == 0) {
+	if (ceph_oid_empty(&req->r_target_oid)) {
 		ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
 		need_check_tiering = true;
 	}
@@ -2501,7 +2505,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
 	/* oid */
 	ceph_encode_32(&p, req->r_base_oid.name_len);
 	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
-	dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
+	dout("oid %*pE len %d\n", req->r_base_oid.name_len,
 	     req->r_base_oid.name, req->r_base_oid.name_len);
 	p += req->r_base_oid.name_len;
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..4668b871ca47 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1381,8 +1381,99 @@ bad:
 	return ERR_PTR(err);
 }
 
+void ceph_oid_copy(struct ceph_object_id *dest,
+		   const struct ceph_object_id *src)
+{
+	WARN_ON(!ceph_oid_empty(dest));
+
+	if (src->name != src->inline_name) {
+		/* very rare, see ceph_object_id definition */
+		dest->name = kmalloc(src->name_len + 1,
+				     GFP_NOIO | __GFP_NOFAIL);
+	}
 
+	memcpy(dest->name, src->name, src->name_len + 1);
+	dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
 
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+	int len;
+
+	WARN_ON(!ceph_oid_empty(oid));
+
+	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+	if (len >= sizeof(oid->inline_name))
+		return len;
+
+	oid->name_len = len;
+	return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	BUG_ON(oid_printf_vargs(oid, fmt, ap));
+	va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+		      const char *fmt, va_list ap)
+{
+	va_list aq;
+	int len;
+
+	va_copy(aq, ap);
+	len = oid_printf_vargs(oid, fmt, aq);
+	va_end(aq);
+
+	if (len) {
+		char *external_name;
+
+		external_name = kmalloc(len + 1, gfp);
+		if (!external_name)
+			return -ENOMEM;
+
+		oid->name = external_name;
+		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+		oid->name_len = len;
+	}
+
+	return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+		     const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	va_start(ap, fmt);
+	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+	if (oid->name != oid->inline_name)
+		kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
 
 /*
  * calculate file layout from given offset, length.
@@ -1474,7 +1565,7 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
 	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
 				     oid->name_len);
 
-	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
+	dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name,
 	     pg_out->pool, pg_out->seed);
 	return 0;
 }
-- 
cgit v1.2.3


From 0c0a8de13f9612a663b050afa955e6668858d1eb Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:21 +0200
Subject: libceph: nuke unused fields and functions

Either unused or useless:

    osdmap->mkfs_epoch
    osd->o_marked_for_keepalive
    monc->num_generic_requests
    osdc->map_waiters
    osdc->last_requested_map
    osdc->timeout_tid

    osd_req_op_cls_response_data()

    osdmap_apply_incremental() @msgr arg

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mon_client.h |  1 -
 include/linux/ceph/osd_client.h |  8 --------
 include/linux/ceph/osdmap.h     |  6 ++----
 net/ceph/mon_client.c           |  3 ---
 net/ceph/osd_client.c           | 13 +------------
 net/ceph/osdmap.c               |  3 +--
 6 files changed, 4 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..330d045e4092 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -77,7 +77,6 @@ struct ceph_mon_client {
 
 	/* pending generic requests */
 	struct rb_root generic_request_tree;
-	int num_generic_requests;
 	u64 last_tid;
 
 	/* subs, indexed with CEPH_SUB_* */
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 66a1fcd5bff7..63854a8df183 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -37,11 +37,9 @@ struct ceph_osd {
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
-	int o_marked_for_keepalive;
 	struct list_head o_keepalive_item;
 };
 
-
 #define CEPH_OSD_SLAB_OPS	2
 #define CEPH_OSD_MAX_OPS	16
 
@@ -206,13 +204,10 @@ struct ceph_osd_client {
 
 	struct ceph_osdmap     *osdmap;       /* current map */
 	struct rw_semaphore    map_sem;
-	struct completion      map_waiters;
-	u64                    last_requested_map;
 
 	struct mutex           request_mutex;
 	struct rb_root         osds;          /* osds */
 	struct list_head       osd_lru;       /* idle osds */
-	u64                    timeout_tid;   /* tid of timeout triggering rq */
 	u64                    last_tid;      /* tid of last request */
 	struct rb_root         requests;      /* pending requests */
 	struct list_head       req_lru;	      /* in-flight lru */
@@ -271,9 +266,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
 extern struct ceph_osd_data *osd_req_op_extent_osd_data(
 					struct ceph_osd_request *osd_req,
 					unsigned int which);
-extern struct ceph_osd_data *osd_req_op_cls_response_data(
-					struct ceph_osd_request *osd_req,
-					unsigned int which);
 
 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
 					unsigned int which,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 777a29412706..ce7a41a182d4 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -123,7 +123,6 @@ struct ceph_pg_mapping {
 struct ceph_osdmap {
 	struct ceph_fsid fsid;
 	u32 epoch;
-	u32 mkfs_epoch;
 	struct ceph_timespec created, modified;
 
 	u32 flags;         /* CEPH_OSDMAP_* */
@@ -205,9 +204,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 }
 
 extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					    struct ceph_osdmap *map,
-					    struct ceph_messenger *msgr);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..3dfafdad92aa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -579,7 +579,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
 	req->tid = tid != 0 ? tid : ++monc->last_tid;
 	req->request->hdr.tid = cpu_to_le64(req->tid);
 	__insert_generic_request(monc, req);
-	monc->num_generic_requests++;
 	ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	mutex_unlock(&monc->mutex);
 
@@ -587,7 +586,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
 
 	mutex_lock(&monc->mutex);
 	rb_erase(&req->node, &monc->generic_request_tree);
-	monc->num_generic_requests--;
 
 	if (!err)
 		err = req->result;
@@ -914,7 +912,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
 	monc->generic_request_tree = RB_ROOT;
-	monc->num_generic_requests = 0;
 	monc->last_tid = 0;
 
 	return 0;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 95910aed8e2e..32ba09be6ee6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -143,14 +143,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
 
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
-			unsigned int which)
-{
-	return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data);	/* ??? */
-
 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
 			unsigned int which, struct page **pages,
 			u64 length, u32 alignment,
@@ -2166,8 +2158,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			dout("applying incremental map %u len %d\n",
 			     epoch, maplen);
 			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap,
-							  &osdc->client->msgr);
+							  osdc->osdmap);
 			if (IS_ERR(newmap)) {
 				err = PTR_ERR(newmap);
 				goto bad;
@@ -2674,8 +2665,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->client = client;
 	osdc->osdmap = NULL;
 	init_rwsem(&osdc->map_sem);
-	init_completion(&osdc->map_waiters);
-	osdc->last_requested_map = 0;
 	mutex_init(&osdc->request_mutex);
 	osdc->last_tid = 0;
 	osdc->osds = RB_ROOT;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4668b871ca47..9a0cc072a909 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1204,8 +1204,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  * decode and apply an incremental map update.
  */
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					     struct ceph_osdmap *map,
-					     struct ceph_messenger *msgr)
+					     struct ceph_osdmap *map)
 {
 	struct crush_map *newcrush = NULL;
 	struct ceph_fsid fsid;
-- 
cgit v1.2.3


From fcd00b68bbe2bf5606cb45c2cd4a250a390bcc1f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:22 +0200
Subject: libceph: DEFINE_RB_FUNCS macro

Given

    struct foo {
        u64 id;
        struct rb_node bar_node;
    };

generate insert_bar(), erase_bar() and lookup_bar() functions with

    DEFINE_RB_FUNCS(bar, struct foo, id, bar_node)

The key is assumed to be an integer (u64, int, etc), compared with
< and >.  nodefld has to be initialized with RB_CLEAR_NODE().

Start using it for MDS, MON and OSD requests and OSD sessions.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 54 ++++++------------------
 include/linux/ceph/libceph.h | 57 ++++++++++++++++++++++++++
 net/ceph/mon_client.c        | 52 ++++--------------------
 net/ceph/osd_client.c        | 97 +++++---------------------------------------
 4 files changed, 88 insertions(+), 172 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..cff85af425d4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -567,51 +567,23 @@ void ceph_mdsc_release_request(struct kref *kref)
 	kfree(req);
 }
 
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
 /*
  * lookup session, bump ref if found.
  *
  * called under mdsc->mutex.
  */
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
-					     u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
 {
 	struct ceph_mds_request *req;
-	struct rb_node *n = mdsc->request_tree.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_mds_request, r_node);
-		if (tid < req->r_tid)
-			n = n->rb_left;
-		else if (tid > req->r_tid)
-			n = n->rb_right;
-		else {
-			ceph_mdsc_get_request(req);
-			return req;
-		}
-	}
-	return NULL;
-}
 
-static void __insert_request(struct ceph_mds_client *mdsc,
-			     struct ceph_mds_request *new)
-{
-	struct rb_node **p = &mdsc->request_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_mds_request *req = NULL;
+	req = lookup_request(&mdsc->request_tree, tid);
+	if (req)
+		ceph_mdsc_get_request(req);
 
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_mds_request, r_node);
-		if (new->r_tid < req->r_tid)
-			p = &(*p)->rb_left;
-		else if (new->r_tid > req->r_tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->r_node, parent, p);
-	rb_insert_color(&new->r_node, &mdsc->request_tree);
+	return req;
 }
 
 /*
@@ -630,7 +602,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 				  req->r_num_caps);
 	dout("__register_request %p tid %lld\n", req, req->r_tid);
 	ceph_mdsc_get_request(req);
-	__insert_request(mdsc, req);
+	insert_request(&mdsc->request_tree, req);
 
 	req->r_uid = current_fsuid();
 	req->r_gid = current_fsgid();
@@ -663,8 +635,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		}
 	}
 
-	rb_erase(&req->r_node, &mdsc->request_tree);
-	RB_CLEAR_NODE(&req->r_node);
+	erase_request(&mdsc->request_tree, req);
 
 	if (req->r_unsafe_dir && req->r_got_unsafe) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -1722,6 +1693,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	INIT_LIST_HEAD(&req->r_unsafe_target_item);
 	req->r_fmode = -1;
 	kref_init(&req->r_kref);
+	RB_CLEAR_NODE(&req->r_node);
 	INIT_LIST_HEAD(&req->r_wait);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
@@ -2414,7 +2386,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	/* get request, session */
 	tid = le64_to_cpu(msg->hdr.tid);
 	mutex_lock(&mdsc->mutex);
-	req = __lookup_request(mdsc, tid);
+	req = lookup_get_request(mdsc, tid);
 	if (!req) {
 		dout("handle_reply on unknown tid %llu\n", tid);
 		mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2576,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	fwd_seq = ceph_decode_32(&p);
 
 	mutex_lock(&mdsc->mutex);
-	req = __lookup_request(mdsc, tid);
+	req = lookup_get_request(mdsc, tid);
 	if (!req) {
 		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
 		goto out;  /* dup reply? */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index db92a8d4926e..690985daad1c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
 		(off >> PAGE_SHIFT);
 }
 
+/*
+ * These are not meant to be generic - an integer key is assumed.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
+static void insert_##name(struct rb_root *root, type *t)		\
+{									\
+	struct rb_node **n = &root->rb_node;				\
+	struct rb_node *parent = NULL;					\
+									\
+	BUG_ON(!RB_EMPTY_NODE(&t->nodefld));				\
+									\
+	while (*n) {							\
+		type *cur = rb_entry(*n, type, nodefld);		\
+									\
+		parent = *n;						\
+		if (t->keyfld < cur->keyfld)				\
+			n = &(*n)->rb_left;				\
+		else if (t->keyfld > cur->keyfld)			\
+			n = &(*n)->rb_right;				\
+		else							\
+			BUG();						\
+	}								\
+									\
+	rb_link_node(&t->nodefld, parent, n);				\
+	rb_insert_color(&t->nodefld, root);				\
+}									\
+static void erase_##name(struct rb_root *root, type *t)			\
+{									\
+	BUG_ON(RB_EMPTY_NODE(&t->nodefld));				\
+	rb_erase(&t->nodefld, root);					\
+	RB_CLEAR_NODE(&t->nodefld);					\
+}
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
+static type *lookup_##name(struct rb_root *root,			\
+			   typeof(((type *)0)->keyfld) key)		\
+{									\
+	struct rb_node *n = root->rb_node;				\
+									\
+	while (n) {							\
+		type *cur = rb_entry(n, type, nodefld);			\
+									\
+		if (key < cur->keyfld)					\
+			n = n->rb_left;					\
+		else if (key > cur->keyfld)				\
+			n = n->rb_right;				\
+		else							\
+			return cur;					\
+	}								\
+									\
+	return NULL;							\
+}
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)			\
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)			\
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
+
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
 extern struct kmem_cache *ceph_cap_flush_cachep;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 3dfafdad92aa..a426a4b03e75 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -478,45 +478,7 @@ out:
 /*
  * generic requests (currently statfs, mon_get_version)
  */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-	struct ceph_mon_client *monc, u64 tid)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *n = monc->generic_request_tree.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_mon_generic_request, node);
-		if (tid < req->tid)
-			n = n->rb_left;
-		else if (tid > req->tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-			    struct ceph_mon_generic_request *new)
-{
-	struct rb_node **p = &monc->generic_request_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_mon_generic_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_generic_request, node);
-		if (new->tid < req->tid)
-			p = &(*p)->rb_left;
-		else if (new->tid > req->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
 
 static void release_generic_request(struct kref *kref)
 {
@@ -551,7 +513,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 	struct ceph_msg *m;
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
+	req = lookup_generic_request(&monc->generic_request_tree, tid);
 	if (!req) {
 		dout("get_generic_reply %lld dne\n", tid);
 		*skip = 1;
@@ -578,14 +540,14 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
 	/* register request */
 	req->tid = tid != 0 ? tid : ++monc->last_tid;
 	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_generic_request(monc, req);
+	insert_generic_request(&monc->generic_request_tree, req);
 	ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	mutex_unlock(&monc->mutex);
 
 	err = wait_for_completion_interruptible(&req->completion);
 
 	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->generic_request_tree);
+	erase_generic_request(&monc->generic_request_tree, req);
 
 	if (!err)
 		err = req->result;
@@ -619,7 +581,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
+	req = lookup_generic_request(&monc->generic_request_tree, tid);
 	if (req) {
 		*(struct ceph_statfs *)req->buf = reply->st;
 		req->result = 0;
@@ -651,6 +613,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 		return -ENOMEM;
 
 	kref_init(&req->kref);
+	RB_CLEAR_NODE(&req->node);
 	req->buf = buf;
 	init_completion(&req->completion);
 
@@ -696,7 +659,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 		goto bad;
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, handle);
+	req = lookup_generic_request(&monc->generic_request_tree, handle);
 	if (req) {
 		*(u64 *)req->buf = ceph_decode_64(&p);
 		req->result = 0;
@@ -732,6 +695,7 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
 		return -ENOMEM;
 
 	kref_init(&req->kref);
+	RB_CLEAR_NODE(&req->node);
 	req->buf = newest;
 	init_completion(&req->completion);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index c423e11d6857..8256051ed88f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -875,45 +875,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request);
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
-static void __insert_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *new)
-{
-	struct rb_node **p = &osdc->requests.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_osd_request, r_node);
-		if (new->r_tid < req->r_tid)
-			p = &(*p)->rb_left;
-		else if (new->r_tid > req->r_tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->r_node, parent, p);
-	rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-						 u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid)
-			n = n->rb_left;
-		else if (tid > req->r_tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
 
 static struct ceph_osd_request *
 __lookup_request_ge(struct ceph_osd_client *osdc,
@@ -1101,6 +1063,8 @@ static void put_osd(struct ceph_osd *osd)
 	}
 }
 
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
+
 /*
  * remove an osd from our map
  */
@@ -1111,8 +1075,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	WARN_ON(!list_empty(&osd->o_linger_requests));
 
 	list_del_init(&osd->o_osd_lru);
-	rb_erase(&osd->o_node, &osdc->osds);
-	RB_CLEAR_NODE(&osd->o_node);
+	erase_osd(&osdc->osds, osd);
 }
 
 static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
@@ -1188,45 +1151,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	return 0;
 }
 
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-	struct rb_node **p = &osdc->osds.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd *osd = NULL;
-
-	dout("__insert_osd %p osd%d\n", new, new->o_osd);
-	while (*p) {
-		parent = *p;
-		osd = rb_entry(parent, struct ceph_osd, o_node);
-		if (new->o_osd < osd->o_osd)
-			p = &(*p)->rb_left;
-		else if (new->o_osd > osd->o_osd)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->o_node, parent, p);
-	rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-	struct ceph_osd *osd;
-	struct rb_node *n = osdc->osds.rb_node;
-
-	while (n) {
-		osd = rb_entry(n, struct ceph_osd, o_node);
-		if (o < osd->o_osd)
-			n = n->rb_left;
-		else if (o > osd->o_osd)
-			n = n->rb_right;
-		else
-			return osd;
-	}
-	return NULL;
-}
-
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
 	schedule_delayed_work(&osdc->timeout_work,
@@ -1248,7 +1172,7 @@ static void __register_request(struct ceph_osd_client *osdc,
 	req->r_tid = ++osdc->last_tid;
 	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
 	dout("__register_request %p tid %lld\n", req, req->r_tid);
-	__insert_request(osdc, req);
+	insert_request(&osdc->requests, req);
 	ceph_osdc_get_request(req);
 	osdc->num_requests++;
 	if (osdc->num_requests == 1) {
@@ -1270,8 +1194,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 	}
 
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	rb_erase(&req->r_node, &osdc->requests);
-	RB_CLEAR_NODE(&req->r_node);
+	erase_request(&osdc->requests, req);
 	osdc->num_requests--;
 
 	if (req->r_osd) {
@@ -1482,7 +1405,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 		req->r_osd = NULL;
 	}
 
-	req->r_osd = __lookup_osd(osdc, o);
+	req->r_osd = lookup_osd(&osdc->osds, o);
 	if (!req->r_osd && o >= 0) {
 		err = -ENOMEM;
 		req->r_osd = create_osd(osdc, o);
@@ -1492,7 +1415,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 		}
 
 		dout("map_request osd %p is osd%d\n", req->r_osd, o);
-		__insert_osd(osdc, req->r_osd);
+		insert_osd(&osdc->osds, req->r_osd);
 
 		ceph_con_open(&req->r_osd->o_con,
 			      CEPH_ENTITY_TYPE_OSD, o,
@@ -1822,7 +1745,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	/* lookup */
 	down_read(&osdc->map_sem);
 	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
+	req = lookup_request(&osdc->requests, tid);
 	if (req == NULL) {
 		dout("handle_reply tid %llu dne\n", tid);
 		goto bad_mutex;
@@ -2880,7 +2803,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 
 	tid = le64_to_cpu(hdr->tid);
 	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
+	req = lookup_request(&osdc->requests, tid);
 	if (!req) {
 		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
 		     osd->o_osd, tid);
-- 
cgit v1.2.3


From 985c1673885b77b2e0167c6478a833817d1e2fe5 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:22 +0200
Subject: libceph: fix ceph_eversion encoding

eversion_t is version+epoch in userspace and is encoded in that order.
ceph_eversion is defined as epoch+version in rados.h, yet we memcpy it
in __send_request().  Reoder ceph_eversion fields.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/rados.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..913c87c26d33 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
  * compound epoch+version, used by storage layer to serialize mutations
  */
 struct ceph_eversion {
-	__le32 epoch;
 	__le64 version;
+	__le32 epoch;
 } __attribute__ ((packed));
 
 /*
-- 
cgit v1.2.3


From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:22 +0200
Subject: libceph: rename ceph_oloc_oid_to_pg()

Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg().  Emphasise
that returned is raw PG and return -ENOENT instead of -EIO if the pool
doesn't exist.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/ioctl.c             |  2 +-
 include/linux/ceph/osdmap.h |  9 ++++-----
 net/ceph/osd_client.c       |  4 ++--
 net/ceph/osdmap.c           | 31 ++++++++++++++++---------------
 4 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index db296709784a..cca7fff22725 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -215,7 +215,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
 	ceph_oid_printf(&oid, "%s", dl.object_name);
 
-	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+	r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
 	if (r < 0) {
 		up_read(&osdc->map_sem);
 		return r;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index ce7a41a182d4..b70440c05b49 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
-/* calculate mapping of object to a placement group */
-extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-			       struct ceph_object_locator *oloc,
-			       struct ceph_object_id *oid,
-			       struct ceph_pg *pg_out);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+			      struct ceph_object_id *oid,
+			      struct ceph_object_locator *oloc,
+			      struct ceph_pg *raw_pgid);
 
 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
 			       struct ceph_pg pgid,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8256051ed88f..cb9f1953f5fb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1324,8 +1324,8 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap,
 		/* !pi is caught in ceph_oloc_oid_to_pg() */
 	}
 
-	return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
-				   &req->r_target_oid, pg_out);
+	return ceph_object_locator_to_pg(osdmap, &req->r_target_oid,
+					 &req->r_target_oloc, pg_out);
 }
 
 static void __enqueue_request(struct ceph_osd_request *req)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9a0cc072a909..6267839cb246 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1545,30 +1545,31 @@ invalid:
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
 /*
- * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-			struct ceph_object_locator *oloc,
-			struct ceph_object_id *oid,
-			struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+			      struct ceph_object_id *oid,
+			      struct ceph_object_locator *oloc,
+			      struct ceph_pg *raw_pgid)
 {
 	struct ceph_pg_pool_info *pi;
 
-	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
 	if (!pi)
-		return -EIO;
+		return -ENOENT;
 
-	pg_out->pool = oloc->pool;
-	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
-				     oid->name_len);
+	raw_pgid->pool = oloc->pool;
+	raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+				       oid->name_len);
 
-	dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name,
-	     pg_out->pool, pg_out->seed);
+	dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
+	     oid->name, raw_pgid->pool, raw_pgid->seed);
 	return 0;
 }
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
 
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
-- 
cgit v1.2.3


From 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:22 +0200
Subject: libceph: ceph_osds, ceph_pg_to_up_acting_osds()

Knowning just acting set isn't enough, we need to be able to record up
set as well to detect interval changes.  This means returning (up[],
up_len, up_primary, acting[], acting_len, acting_primary) and passing
it around.  Introduce and switch to ceph_osds to help with that.

Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return
both up and acting sets from it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  21 ++-
 net/ceph/osd_client.c       |  36 +++---
 net/ceph/osdmap.c           | 304 ++++++++++++++++++++++++++------------------
 3 files changed, 215 insertions(+), 146 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index b70440c05b49..942189d311e0 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 					     struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
+struct ceph_osds {
+	int osds[CEPH_PG_MAX_SIZE];
+	int size;
+	int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+	set->size = 0;
+	set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 off, u64 len,
@@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
 			      struct ceph_object_locator *oloc,
 			      struct ceph_pg *raw_pgid);
 
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
-			       struct ceph_pg pgid,
-			       int *osds, int *primary);
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       const struct ceph_pg *raw_pgid,
+			       struct ceph_osds *up,
+			       struct ceph_osds *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cb9f1953f5fb..0ff400a56cd6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
 	struct ceph_pg pgid;
-	int acting[CEPH_PG_MAX_SIZE];
-	int num, o;
+	struct ceph_osds up, acting;
 	int err;
 	bool was_paused;
 
@@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 	}
 	req->r_pgid = pgid;
 
-	num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
-	if (num < 0)
-		num = 0;
+	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
 
 	was_paused = req->r_paused;
 	req->r_paused = __req_should_be_paused(osdc, req);
@@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc,
 		force_resend = 1;
 
 	if ((!force_resend &&
-	     req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_osd && req->r_osd->o_osd == acting.primary &&
 	     req->r_sent >= req->r_osd->o_incarnation &&
-	     req->r_num_pg_osds == num &&
-	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-	    (req->r_osd == NULL && o == -1) ||
+	     req->r_num_pg_osds == acting.size &&
+	     memcmp(req->r_pg_osds, acting.osds,
+		    acting.size * sizeof(acting.osds[0])) == 0) ||
+	    (req->r_osd == NULL && acting.primary == -1) ||
 	    req->r_paused)
 		return 0;  /* no change */
 
 	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-	     req->r_tid, pgid.pool, pgid.seed, o,
+	     req->r_tid, pgid.pool, pgid.seed, acting.primary,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
 	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-	req->r_num_pg_osds = num;
+	memcpy(req->r_pg_osds, acting.osds,
+	       acting.size * sizeof(acting.osds[0]));
+	req->r_num_pg_osds = acting.size;
 
 	if (req->r_osd) {
 		__cancel_request(req);
@@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc,
 		req->r_osd = NULL;
 	}
 
-	req->r_osd = lookup_osd(&osdc->osds, o);
-	if (!req->r_osd && o >= 0) {
+	req->r_osd = lookup_osd(&osdc->osds, acting.primary);
+	if (!req->r_osd && acting.primary >= 0) {
 		err = -ENOMEM;
-		req->r_osd = create_osd(osdc, o);
+		req->r_osd = create_osd(osdc, acting.primary);
 		if (!req->r_osd) {
 			list_move(&req->r_req_lru_item, &osdc->req_notarget);
 			goto out;
 		}
 
-		dout("map_request osd %p is osd%d\n", req->r_osd, o);
+		dout("map_request osd %p is osd%d\n", req->r_osd,
+		     acting.primary);
 		insert_osd(&osdc->osds, req->r_osd);
 
 		ceph_con_open(&req->r_osd->o_con,
-			      CEPH_ENTITY_TYPE_OSD, o,
-			      &osdc->osdmap->osd_addr[o]);
+			      CEPH_ENTITY_TYPE_OSD, acting.primary,
+			      &osdc->osdmap->osd_addr[acting.primary]);
 	}
 
 	__enqueue_request(req);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 6267839cb246..f5fc8fc63879 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid)
 }
 EXPORT_SYMBOL(ceph_oid_destroy);
 
+static bool osds_valid(const struct ceph_osds *set)
+{
+	/* non-empty set */
+	if (set->size > 0 && set->primary >= 0)
+		return true;
+
+	/* empty can_shift_osds set */
+	if (!set->size && set->primary == -1)
+		return true;
+
+	/* empty !can_shift_osds set - all NONE */
+	if (set->size > 0 && set->primary == -1) {
+		int i;
+
+		for (i = 0; i < set->size; i++) {
+			if (set->osds[i] != CRUSH_ITEM_NONE)
+				break;
+		}
+		if (i == set->size)
+			return true;
+	}
+
+	return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+	memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+	dest->size = src->size;
+	dest->primary = src->primary;
+}
+
 /*
  * calculate file layout from given offset, length.
  * fill in correct oid, logical length, and object extent
@@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
 }
 EXPORT_SYMBOL(ceph_object_locator_to_pg);
 
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+			 const struct ceph_pg *raw_pgid,
+			 struct ceph_pg *pgid)
+{
+	pgid->pool = raw_pgid->pool;
+	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+				     pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed).  Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+			 const struct ceph_pg *raw_pgid)
+{
+	if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+		/* hash pool id and seed so that pool PGs do not overlap */
+		return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+				      ceph_stable_mod(raw_pgid->seed,
+						      pi->pgp_num,
+						      pi->pgp_num_mask),
+				      raw_pgid->pool);
+	} else {
+		/*
+		 * legacy behavior: add ps and pool together.  this is
+		 * not a great approach because the PGs from each pool
+		 * will overlap on top of each other: 0.5 == 1.4 ==
+		 * 2.3 == ...
+		 */
+		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+				       pi->pgp_num_mask) +
+		       (unsigned)raw_pgid->pool;
+	}
+}
+
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
 		    const __u32 *weight, int weight_max)
@@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 }
 
 /*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG.  The result may
+ * contain nonexistent OSDs.  ->primary is undefined for a raw set.
  *
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
  */
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
-			  struct ceph_pg_pool_info *pool,
-			  struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+			   struct ceph_pg_pool_info *pi,
+			   const struct ceph_pg *raw_pgid,
+			   struct ceph_osds *raw,
+			   u32 *ppps)
 {
+	u32 pps = raw_pg_to_pps(pi, raw_pgid);
 	int ruleno;
 	int len;
 
-	/* crush */
-	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
-				 pool->type, pool->size);
+	ceph_osds_init(raw);
+	if (ppps)
+		*ppps = pps;
+
+	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+				 pi->size);
 	if (ruleno < 0) {
 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
-		       pgid.pool, pool->crush_ruleset, pool->type,
-		       pool->size);
-		return -ENOENT;
+		       pi->id, pi->crush_ruleset, pi->type, pi->size);
+		return;
 	}
 
-	len = do_crush(osdmap, ruleno, pps, osds,
-		       min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+	len = do_crush(osdmap, ruleno, pps, raw->osds,
+		       min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
 		       osdmap->osd_weight, osdmap->max_osd);
 	if (len < 0) {
 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
-		       len, ruleno, pgid.pool, pool->crush_ruleset,
-		       pool->type, pool->size);
-		return len;
+		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+		       pi->size);
+		return;
 	}
 
-	return len;
+	raw->size = len;
 }
 
 /*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary.  By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
  *
- * Return up set length.  *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set.  If it's
+ * empty, ->primary will remain undefined.
  */
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
-			  struct ceph_pg_pool_info *pool,
-			  int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+			   struct ceph_pg_pool_info *pi,
+			   struct ceph_osds *set)
 {
-	int up_primary = -1;
 	int i;
 
-	if (ceph_can_shift_osds(pool)) {
+	/* ->primary is undefined for a raw set */
+	BUG_ON(set->primary != -1);
+
+	if (ceph_can_shift_osds(pi)) {
 		int removed = 0;
 
-		for (i = 0; i < len; i++) {
-			if (ceph_osd_is_down(osdmap, osds[i])) {
+		/* shift left */
+		for (i = 0; i < set->size; i++) {
+			if (ceph_osd_is_down(osdmap, set->osds[i])) {
 				removed++;
 				continue;
 			}
 			if (removed)
-				osds[i - removed] = osds[i];
+				set->osds[i - removed] = set->osds[i];
 		}
-
-		len -= removed;
-		if (len > 0)
-			up_primary = osds[0];
+		set->size -= removed;
+		if (set->size > 0)
+			set->primary = set->osds[0];
 	} else {
-		for (i = len - 1; i >= 0; i--) {
-			if (ceph_osd_is_down(osdmap, osds[i]))
-				osds[i] = CRUSH_ITEM_NONE;
+		/* set down/dne devices to NONE */
+		for (i = set->size - 1; i >= 0; i--) {
+			if (ceph_osd_is_down(osdmap, set->osds[i]))
+				set->osds[i] = CRUSH_ITEM_NONE;
 			else
-				up_primary = osds[i];
+				set->primary = set->osds[i];
 		}
 	}
-
-	*primary = up_primary;
-	return len;
 }
 
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
-				   struct ceph_pg_pool_info *pool,
-				   int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+				   struct ceph_pg_pool_info *pi,
+				   u32 pps,
+				   struct ceph_osds *up)
 {
 	int i;
 	int pos = -1;
@@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (!osdmap->osd_primary_affinity)
 		return;
 
-	for (i = 0; i < len; i++) {
-		int osd = osds[i];
+	for (i = 0; i < up->size; i++) {
+		int osd = up->osds[i];
 
 		if (osd != CRUSH_ITEM_NONE &&
 		    osdmap->osd_primary_affinity[osd] !=
@@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 			break;
 		}
 	}
-	if (i == len)
+	if (i == up->size)
 		return;
 
 	/*
@@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	 * osd into the hash/rng so that a proportional fraction of an
 	 * osd's pgs get rejected as primary.
 	 */
-	for (i = 0; i < len; i++) {
-		int osd = osds[i];
+	for (i = 0; i < up->size; i++) {
+		int osd = up->osds[i];
 		u32 aff;
 
 		if (osd == CRUSH_ITEM_NONE)
@@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (pos < 0)
 		return;
 
-	*primary = osds[pos];
+	up->primary = up->osds[pos];
 
-	if (ceph_can_shift_osds(pool) && pos > 0) {
+	if (ceph_can_shift_osds(pi) && pos > 0) {
 		/* move the new primary to the front */
 		for (i = pos; i > 0; i--)
-			osds[i] = osds[i - 1];
-		osds[0] = *primary;
+			up->osds[i] = up->osds[i - 1];
+		up->osds[0] = up->primary;
 	}
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length.  *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings.  This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-		       int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+			  struct ceph_pg_pool_info *pi,
+			  const struct ceph_pg *raw_pgid,
+			  struct ceph_osds *temp)
 {
+	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
-	int temp_len;
-	int temp_primary;
 	int i;
 
-	/* raw_pg -> pg */
-	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-				    pool->pg_num_mask);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+	ceph_osds_init(temp);
 
 	/* pg_temp? */
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
-		temp_len = 0;
-		temp_primary = -1;
-
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-				if (ceph_can_shift_osds(pool))
+				if (ceph_can_shift_osds(pi))
 					continue;
-				else
-					osds[temp_len++] = CRUSH_ITEM_NONE;
+
+				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
 			} else {
-				osds[temp_len++] = pg->pg_temp.osds[i];
+				temp->osds[temp->size++] = pg->pg_temp.osds[i];
 			}
 		}
 
 		/* apply pg_temp's primary */
-		for (i = 0; i < temp_len; i++) {
-			if (osds[i] != CRUSH_ITEM_NONE) {
-				temp_primary = osds[i];
+		for (i = 0; i < temp->size; i++) {
+			if (temp->osds[i] != CRUSH_ITEM_NONE) {
+				temp->primary = temp->osds[i];
 				break;
 			}
 		}
-	} else {
-		temp_len = len;
-		temp_primary = *primary;
 	}
 
 	/* primary_temp? */
 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
-		temp_primary = pg->primary_temp.osd;
-
-	*primary = temp_primary;
-	return temp_len;
+		temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error.  *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       const struct ceph_pg *raw_pgid,
+			       struct ceph_osds *up,
+			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pool;
+	struct ceph_pg_pool_info *pi;
 	u32 pps;
-	int len;
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-	if (!pool) {
-		*primary = -1;
-		return -ENOENT;
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi) {
+		ceph_osds_init(up);
+		ceph_osds_init(acting);
+		goto out;
 	}
 
-	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-		/* hash pool id and seed so that pool PGs do not overlap */
-		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-				     ceph_stable_mod(pgid.seed, pool->pgp_num,
-						     pool->pgp_num_mask),
-				     pgid.pool);
-	} else {
-		/*
-		 * legacy behavior: add ps and pool together.  this is
-		 * not a great approach because the PGs from each pool
-		 * will overlap on top of each other: 0.5 == 1.4 ==
-		 * 2.3 == ...
-		 */
-		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-				      pool->pgp_num_mask) +
-			(unsigned)pgid.pool;
-	}
-
-	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-	if (len < 0) {
-		*primary = -1;
-		return len;
+	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	raw_to_up_osds(osdmap, pi, up);
+	apply_primary_affinity(osdmap, pi, pps, up);
+	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	if (!acting->size) {
+		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+		acting->size = up->size;
+		if (acting->primary == -1)
+			acting->primary = up->primary;
 	}
-
-	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-	return len;
+out:
+	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 /*
@@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
  */
 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
-	int osds[CEPH_PG_MAX_SIZE];
-	int primary;
-
-	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+	struct ceph_osds up, acting;
 
-	return primary;
+	ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting);
+	return acting.primary;
 }
 EXPORT_SYMBOL(ceph_calc_pg_primary);
-- 
cgit v1.2.3


From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:23 +0200
Subject: libceph: rename ceph_calc_pg_primary()

Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to
emphasise that it returns acting primary.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/ioctl.c             | 2 +-
 include/linux/ceph/osdmap.h | 4 ++--
 net/ceph/osdmap.c           | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index cca7fff22725..1831ad6cf066 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -221,7 +221,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return r;
 	}
 
-	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+	dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
 			ceph_osd_addr(osdc->osdmap, dl.osd);
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 942189d311e0..3fd978a1639b 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg pgid);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid);
 
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
 						    u64 id);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f5fc8fc63879..656384a8fd1e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1896,13 +1896,14 @@ out:
 }
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid)
 {
 	struct ceph_osds up, acting;
 
-	ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting);
+	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
 	return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
-- 
cgit v1.2.3


From f984cb76cc5fb9fc76d6abb6c4694a5412e3f49b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:23 +0200
Subject: libceph: make pgid_cmp() global

calc_target() code is going to need to know how to compare PGs.  Take
lhs and rhs pgid by const * while at it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  2 ++
 net/ceph/osdmap.c           | 23 ++++++++++++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 3fd978a1639b..7783237ab06c 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,6 +24,8 @@ struct ceph_pg {
 	uint32_t seed;
 };
 
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+
 #define CEPH_POOL_FLAG_HASHPSPOOL  1
 
 struct ceph_pg_pool_info {
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 656384a8fd1e..3c7dc5e581ab 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
 	return ERR_PTR(err);
 }
 
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
 {
-	if (l.pool < r.pool)
+	if (lhs->pool < rhs->pool)
 		return -1;
-	if (l.pool > r.pool)
+	if (lhs->pool > rhs->pool)
 		return 1;
-	if (l.seed < r.seed)
+	if (lhs->seed < rhs->seed)
 		return -1;
-	if (l.seed > r.seed)
+	if (lhs->seed > rhs->seed)
 		return 1;
+
 	return 0;
 }
 
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 			       struct rb_root *root)
 {
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 	while (*p) {
 		parent = *p;
 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		c = pgid_cmp(new->pgid, pg->pgid);
+		c = ceph_pg_compare(&new->pgid, &pg->pgid);
 		if (c < 0)
 			p = &(*p)->rb_left;
 		else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
 	while (n) {
 		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = pgid_cmp(pgid, pg->pgid);
+		c = ceph_pg_compare(&pgid, &pg->pgid);
 		if (c < 0) {
 			n = n->rb_left;
 		} else if (c > 0) {
-- 
cgit v1.2.3


From 04812acf572ef41fd51c11e0bf3385f34c0e1b5b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:23 +0200
Subject: libceph: pi->min_size, pi->last_force_request_resend

Add and decode pi->min_size and pi->last_force_request_resend.  These
are going to be used by calc_target().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  9 ++++++---
 net/ceph/debugfs.c          | 10 ++++++----
 net/ceph/osdmap.c           | 48 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 59 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 7783237ab06c..989294d0b8d2 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -26,20 +26,23 @@ struct ceph_pg {
 
 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
 
-#define CEPH_POOL_FLAG_HASHPSPOOL  1
+#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
+						       together */
 
 struct ceph_pg_pool_info {
 	struct rb_node node;
 	s64 id;
-	u8 type;
+	u8 type; /* CEPH_POOL_TYPE_* */
 	u8 size;
+	u8 min_size;
 	u8 crush_ruleset;
 	u8 object_hash;
+	u32 last_force_request_resend;
 	u32 pg_num, pgp_num;
 	int pg_num_mask, pgp_num_mask;
 	s64 read_tier;
 	s64 write_tier; /* wins for read+write ops */
-	u64 flags;
+	u64 flags; /* CEPH_POOL_FLAG_* */
 	char *name;
 };
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 6f8413293d15..7f1cc22c3e8b 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -66,12 +66,14 @@ static int osdmap_show(struct seq_file *s, void *p)
 		   (map->flags & CEPH_OSDMAP_FULL) ?  " FULL" : "");
 
 	for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
-		struct ceph_pg_pool_info *pool =
+		struct ceph_pg_pool_info *pi =
 			rb_entry(n, struct ceph_pg_pool_info, node);
 
-		seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
-			   pool->id, pool->pg_num, pool->pg_num_mask,
-			   pool->read_tier, pool->write_tier);
+		seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+			   pi->id, pi->name, pi->type, pi->size, pi->min_size,
+			   pi->pg_num, pi->pg_num_mask, pi->flags,
+			   pi->last_force_request_resend, pi->read_tier,
+			   pi->write_tier);
 	}
 	for (i = 0; i < map->max_osd; i++) {
 		struct ceph_entity_addr *addr = &map->osd_addr[i];
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 3c7dc5e581ab..66c3ebead92f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -597,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 	*p += 4;  /* skip crash_replay_interval */
 
 	if (ev >= 7)
-		*p += 1;  /* skip min_size */
+		pi->min_size = ceph_decode_8(p);
+	else
+		pi->min_size = pi->size - pi->size / 2;
 
 	if (ev >= 8)
 		*p += 8 + 8;  /* skip quota_max_* */
@@ -617,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 		pi->write_tier = -1;
 	}
 
+	if (ev >= 10) {
+		/* skip properties */
+		num = ceph_decode_32(p);
+		while (num--) {
+			len = ceph_decode_32(p);
+			*p += len; /* key */
+			len = ceph_decode_32(p);
+			*p += len; /* val */
+		}
+	}
+
+	if (ev >= 11) {
+		/* skip hit_set_params */
+		*p += 1 + 1; /* versions */
+		len = ceph_decode_32(p);
+		*p += len;
+
+		*p += 4; /* skip hit_set_period */
+		*p += 4; /* skip hit_set_count */
+	}
+
+	if (ev >= 12)
+		*p += 4; /* skip stripe_width */
+
+	if (ev >= 13) {
+		*p += 8; /* skip target_max_bytes */
+		*p += 8; /* skip target_max_objects */
+		*p += 4; /* skip cache_target_dirty_ratio_micro */
+		*p += 4; /* skip cache_target_full_ratio_micro */
+		*p += 4; /* skip cache_min_flush_age */
+		*p += 4; /* skip cache_min_evict_age */
+	}
+
+	if (ev >=  14) {
+		/* skip erasure_code_profile */
+		len = ceph_decode_32(p);
+		*p += len;
+	}
+
+	if (ev >= 15)
+		pi->last_force_request_resend = ceph_decode_32(p);
+	else
+		pi->last_force_request_resend = 0;
+
 	/* ignore the rest */
 
 	*p = pool_end;
-- 
cgit v1.2.3


From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:23 +0200
Subject: libceph: introduce ceph_osd_request_target, calc_target()

Introduce ceph_osd_request_target, containing all mapping-related
fields of ceph_osd_request and calc_target() for calculating mappings
and populating it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c                  |   2 +-
 fs/ceph/file.c                  |   2 +-
 include/linux/ceph/osd_client.h |  23 ++++++
 include/linux/ceph/osdmap.h     |  34 +++++++++
 include/linux/ceph/rados.h      |   5 ++
 net/ceph/osd_client.c           | 157 +++++++++++++++++++++++++++++++++++++++-
 net/ceph/osdmap.c               | 121 +++++++++++++++++++++++++++++++
 7 files changed, 340 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6f28dd9bacb2..c5d75486823b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
 			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-	wr_req->r_base_oloc.pool = pool;
+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
 	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
 
 	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 9d470397e249..36b4a41dfa67 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_flags =	CEPH_OSD_FLAG_ORDERSNAP |
 			CEPH_OSD_FLAG_ONDISK |
 			CEPH_OSD_FLAG_WRITE;
-	req->r_base_oloc = orig_req->r_base_oloc;
+	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
 	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 63854a8df183..48806ee4488d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
 				     struct ceph_msg *);
 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
+#define CEPH_HOMELESS_OSD	-1
+
 /* a given osd we're communicating with */
 struct ceph_osd {
 	atomic_t o_ref;
@@ -118,6 +120,27 @@ struct ceph_osd_req_op {
 	};
 };
 
+struct ceph_osd_request_target {
+	struct ceph_object_id base_oid;
+	struct ceph_object_locator base_oloc;
+	struct ceph_object_id target_oid;
+	struct ceph_object_locator target_oloc;
+
+	struct ceph_pg pgid;
+	u32 pg_num;
+	u32 pg_num_mask;
+	struct ceph_osds acting;
+	struct ceph_osds up;
+	int size;
+	int min_size;
+	bool sort_bitwise;
+
+	unsigned int flags;                /* CEPH_OSD_FLAG_* */
+	bool paused;
+
+	int osd;
+};
+
 /* an in-flight request */
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 989294d0b8d2..420bb7968b25 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
 
 #define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
 						       together */
+#define CEPH_POOL_FLAG_FULL		(1ULL << 1) /* pool is full */
 
 struct ceph_pg_pool_info {
 	struct rb_node node;
@@ -62,6 +63,22 @@ struct ceph_object_locator {
 	s64 pool;
 };
 
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+	oloc->pool = -1;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+	return oloc->pool == -1;
+}
+
+static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
+				  const struct ceph_object_locator *src)
+{
+	dest->pool = src->pool;
+}
+
 /*
  * Maximum supported by kernel client object name length
  *
@@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set)
 
 void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
 
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+			  const struct ceph_osds *new_acting,
+			  const struct ceph_osds *old_up,
+			  const struct ceph_osds *new_up,
+			  int old_size,
+			  int new_size,
+			  int old_min_size,
+			  int new_min_size,
+			  u32 old_pg_num,
+			  u32 new_pg_num,
+			  bool old_sort_bitwise,
+			  bool new_sort_bitwise,
+			  const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+		       const struct ceph_osds *new_acting,
+		       bool any_change);
+
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 off, u64 len,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 913c87c26d33..f28ed864e682 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
 #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
 #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB  (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
 
 /*
  * The error code to return when an OSD can't handle a write
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 0ff400a56cd6..cff3a7e29233 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	}
 }
 
+/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+	ceph_oid_init(&t->base_oid);
+	ceph_oloc_init(&t->base_oloc);
+	ceph_oid_init(&t->target_oid);
+	ceph_oloc_init(&t->target_oloc);
+
+	ceph_osds_init(&t->acting);
+	ceph_osds_init(&t->up);
+	t->size = -1;
+	t->min_size = -1;
+
+	t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+	ceph_oid_destroy(&t->base_oid);
+	ceph_oid_destroy(&t->target_oid);
+}
+
 /*
  * requests
  */
@@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 
+static bool __pool_full(struct ceph_pg_pool_info *pi)
+{
+	return pi->flags & CEPH_POOL_FLAG_FULL;
+}
+
 /*
  * Returns whether a request should be blocked from being sent
  * based on the current osdmap and osd_client settings.
@@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc,
 		(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
 }
 
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+				    const struct ceph_osd_request_target *t,
+				    struct ceph_pg_pool_info *pi)
+{
+	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+		       ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+		       __pool_full(pi);
+
+	WARN_ON(pi->id != t->base_oloc.pool);
+	return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+	       (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
 /*
  * Calculate mapping of a request to a PG.  Takes tiering into account.
  */
@@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap,
 					 &req->r_target_oloc, pg_out);
 }
 
+enum calc_target_result {
+	CALC_TARGET_NO_ACTION = 0,
+	CALC_TARGET_NEED_RESEND,
+	CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+					   struct ceph_osd_request_target *t,
+					   u32 *last_force_resend,
+					   bool any_change)
+{
+	struct ceph_pg_pool_info *pi;
+	struct ceph_pg pgid, last_pgid;
+	struct ceph_osds up, acting;
+	bool force_resend = false;
+	bool need_check_tiering = false;
+	bool need_resend = false;
+	bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
+					     CEPH_OSDMAP_SORTBITWISE);
+	enum calc_target_result ct_res;
+	int ret;
+
+	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+	if (!pi) {
+		t->osd = CEPH_HOMELESS_OSD;
+		ct_res = CALC_TARGET_POOL_DNE;
+		goto out;
+	}
+
+	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+		if (last_force_resend &&
+		    *last_force_resend < pi->last_force_request_resend) {
+			*last_force_resend = pi->last_force_request_resend;
+			force_resend = true;
+		} else if (!last_force_resend) {
+			force_resend = true;
+		}
+	}
+	if (ceph_oid_empty(&t->target_oid) || force_resend) {
+		ceph_oid_copy(&t->target_oid, &t->base_oid);
+		need_check_tiering = true;
+	}
+	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+		need_check_tiering = true;
+	}
+
+	if (need_check_tiering &&
+	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+			t->target_oloc.pool = pi->read_tier;
+		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+			t->target_oloc.pool = pi->write_tier;
+	}
+
+	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+					&t->target_oloc, &pgid);
+	if (ret) {
+		WARN_ON(ret != -ENOENT);
+		t->osd = CEPH_HOMELESS_OSD;
+		ct_res = CALC_TARGET_POOL_DNE;
+		goto out;
+	}
+	last_pgid.pool = pgid.pool;
+	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+	if (any_change &&
+	    ceph_is_new_interval(&t->acting,
+				 &acting,
+				 &t->up,
+				 &up,
+				 t->size,
+				 pi->size,
+				 t->min_size,
+				 pi->min_size,
+				 t->pg_num,
+				 pi->pg_num,
+				 t->sort_bitwise,
+				 sort_bitwise,
+				 &last_pgid))
+		force_resend = true;
+
+	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+		t->paused = false;
+		need_resend = true;
+	}
+
+	if (ceph_pg_compare(&t->pgid, &pgid) ||
+	    ceph_osds_changed(&t->acting, &acting, any_change) ||
+	    force_resend) {
+		t->pgid = pgid; /* struct */
+		ceph_osds_copy(&t->acting, &acting);
+		ceph_osds_copy(&t->up, &up);
+		t->size = pi->size;
+		t->min_size = pi->min_size;
+		t->pg_num = pi->pg_num;
+		t->pg_num_mask = pi->pg_num_mask;
+		t->sort_bitwise = sort_bitwise;
+
+		t->osd = acting.primary;
+		need_resend = true;
+	}
+
+	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+	return ct_res;
+}
+
 static void __enqueue_request(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
@@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		redir.oloc.pool = -1;
 	}
 
-	if (redir.oloc.pool != -1) {
+	if (!ceph_oloc_empty(&redir.oloc)) {
 		dout("redirect pool %lld\n", redir.oloc.pool);
 
 		__unregister_request(osdc, req);
 
-		req->r_target_oloc = redir.oloc; /* struct */
+		ceph_oloc_copy(&req->r_target_oloc, &redir.oloc);
 
 		/*
 		 * Start redirect requests with nofail=true.  If
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 66c3ebead92f..7d4a5b43085e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid)
 }
 EXPORT_SYMBOL(ceph_oid_destroy);
 
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+			 const struct ceph_osds *rhs)
+{
+	if (lhs->size == rhs->size &&
+	    !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+		return true;
+
+	return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+		       const struct ceph_osds *rhs)
+{
+	if (__osds_equal(lhs, rhs) &&
+	    lhs->primary == rhs->primary)
+		return true;
+
+	return false;
+}
+
 static bool osds_valid(const struct ceph_osds *set)
 {
 	/* non-empty set */
@@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
 	dest->primary = src->primary;
 }
 
+static bool is_split(const struct ceph_pg *pgid,
+		     u32 old_pg_num,
+		     u32 new_pg_num)
+{
+	int old_bits = calc_bits_of(old_pg_num);
+	int old_mask = (1 << old_bits) - 1;
+	int n;
+
+	WARN_ON(pgid->seed >= old_pg_num);
+	if (new_pg_num <= old_pg_num)
+		return false;
+
+	for (n = 1; ; n++) {
+		int next_bit = n << (old_bits - 1);
+		u32 s = next_bit | pgid->seed;
+
+		if (s < old_pg_num || s == pgid->seed)
+			continue;
+		if (s >= new_pg_num)
+			break;
+
+		s = ceph_stable_mod(s, old_pg_num, old_mask);
+		if (s == pgid->seed)
+			return true;
+	}
+
+	return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+			  const struct ceph_osds *new_acting,
+			  const struct ceph_osds *old_up,
+			  const struct ceph_osds *new_up,
+			  int old_size,
+			  int new_size,
+			  int old_min_size,
+			  int new_min_size,
+			  u32 old_pg_num,
+			  u32 new_pg_num,
+			  bool old_sort_bitwise,
+			  bool new_sort_bitwise,
+			  const struct ceph_pg *pgid)
+{
+	return !osds_equal(old_acting, new_acting) ||
+	       !osds_equal(old_up, new_up) ||
+	       old_size != new_size ||
+	       old_min_size != new_min_size ||
+	       is_split(pgid, old_pg_num, new_pg_num) ||
+	       old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+	int i;
+
+	for (i = 0; i < acting->size; i++) {
+		if (acting->osds[i] == osd)
+			return i;
+	}
+
+	return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+			    const struct ceph_osds *new_acting)
+{
+	if (!old_acting->size && !new_acting->size)
+		return false; /* both still empty */
+
+	if (!old_acting->size ^ !new_acting->size)
+		return true; /* was empty, now not, or vice versa */
+
+	if (old_acting->primary != new_acting->primary)
+		return true; /* primary changed */
+
+	if (calc_pg_rank(old_acting->primary, old_acting) !=
+	    calc_pg_rank(new_acting->primary, new_acting))
+		return true;
+
+	return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+		       const struct ceph_osds *new_acting,
+		       bool any_change)
+{
+	if (primary_changed(old_acting, new_acting))
+		return true;
+
+	if (any_change && !__osds_equal(old_acting, new_acting))
+		return true;
+
+	return false;
+}
+
 /*
  * calculate file layout from given offset, length.
  * fill in correct oid, logical length, and object extent
-- 
cgit v1.2.3


From a66dd38309f5d9c66ec9bc7911ff8da8cc37bb9f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:23 +0200
Subject: libceph: switch to calc_target(), part 1

Replace __calc_request_pg() and most of __map_request() with
calc_target() and start using req->r_t.

ceph_osdc_build_request() however still encodes base_oid, because it's
called before calc_target() is and target_oid is empty at that point in
time; a printf in osdc_show() also shows base_oid.  This is fixed in
"libceph: switch to calc_target(), part 2".

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  15 ++---
 net/ceph/debugfs.c              |   2 +-
 net/ceph/osd_client.c           | 119 ++++++++--------------------------------
 3 files changed, 29 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 48806ee4488d..03bf9d9e1517 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -150,12 +150,13 @@ struct ceph_osd_request {
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd_item;
 	struct ceph_osd *r_osd;
-	struct ceph_pg   r_pgid;
-	int              r_pg_osds[CEPH_PG_MAX_SIZE];
-	int              r_num_pg_osds;
+
+	struct ceph_osd_request_target r_t;
+#define r_base_oid	r_t.base_oid
+#define r_base_oloc	r_t.base_oloc
+#define r_flags		r_t.flags
 
 	struct ceph_msg  *r_request, *r_reply;
-	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
 
 	/* request osd ops array  */
@@ -167,7 +168,6 @@ struct ceph_osd_request {
 	__le64           *r_request_pool;
 	void             *r_request_pgid;
 	__le32           *r_request_attempts;
-	bool              r_paused;
 	struct ceph_eversion *r_request_reassert_version;
 
 	int               r_result;
@@ -186,11 +186,6 @@ struct ceph_osd_request {
 	struct inode *r_inode;         	      /* for use by callbacks */
 	void *r_priv;			      /* ditto */
 
-	struct ceph_object_locator r_base_oloc;
-	struct ceph_object_id r_base_oid;
-	struct ceph_object_locator r_target_oloc;
-	struct ceph_object_id r_target_oid;
-
 	u64               r_snapid;
 	unsigned long     r_stamp;            /* send OR check time */
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 7f1cc22c3e8b..0c11ab5f8c30 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 
 		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
 			   req->r_osd ? req->r_osd->o_osd : -1,
-			   req->r_pgid.pool, req->r_pgid.seed);
+			   req->r_t.pgid.pool, req->r_t.pgid.seed);
 
 		seq_printf(s, "%*pE", req->r_base_oid.name_len,
 			   req->r_base_oid.name);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cff3a7e29233..013101598c41 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -350,8 +350,7 @@ static void ceph_osdc_release_request(struct kref *kref)
 	for (which = 0; which < req->r_num_ops; which++)
 		osd_req_op_data_release(req, which);
 
-	ceph_oid_destroy(&req->r_base_oid);
-	ceph_oid_destroy(&req->r_target_oid);
+	target_destroy(&req->r_t);
 	ceph_put_snap_context(req->r_snapc);
 
 	if (req->r_mempool)
@@ -420,10 +419,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
-	ceph_oid_init(&req->r_base_oid);
-	req->r_base_oloc.pool = -1;
-	ceph_oid_init(&req->r_target_oid);
-	req->r_target_oloc.pool = -1;
+	target_init(&req->r_t);
 
 	dout("%s req %p\n", __func__, req);
 	return req;
@@ -1308,16 +1304,6 @@ static bool __pool_full(struct ceph_pg_pool_info *pi)
  *
  * Caller should hold map_sem for read.
  */
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req)
-{
-	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
-	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
-	return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
-		(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
-}
-
 static bool target_should_be_paused(struct ceph_osd_client *osdc,
 				    const struct ceph_osd_request_target *t,
 				    struct ceph_pg_pool_info *pi)
@@ -1332,45 +1318,6 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
 	       (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
 }
 
-/*
- * Calculate mapping of a request to a PG.  Takes tiering into account.
- */
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
-			     struct ceph_osd_request *req,
-			     struct ceph_pg *pg_out)
-{
-	bool need_check_tiering;
-
-	need_check_tiering = false;
-	if (req->r_target_oloc.pool == -1) {
-		req->r_target_oloc = req->r_base_oloc; /* struct */
-		need_check_tiering = true;
-	}
-	if (ceph_oid_empty(&req->r_target_oid)) {
-		ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
-		need_check_tiering = true;
-	}
-
-	if (need_check_tiering &&
-	    (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-		struct ceph_pg_pool_info *pi;
-
-		pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
-		if (pi) {
-			if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
-			    pi->read_tier >= 0)
-				req->r_target_oloc.pool = pi->read_tier;
-			if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
-			    pi->write_tier >= 0)
-				req->r_target_oloc.pool = pi->write_tier;
-		}
-		/* !pi is caught in ceph_oloc_oid_to_pg() */
-	}
-
-	return ceph_object_locator_to_pg(osdmap, &req->r_target_oid,
-					 &req->r_target_oloc, pg_out);
-}
-
 enum calc_target_result {
 	CALC_TARGET_NO_ACTION = 0,
 	CALC_TARGET_NEED_RESEND,
@@ -1510,46 +1457,26 @@ static void __enqueue_request(struct ceph_osd_request *req)
 static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
-	struct ceph_pg pgid;
-	struct ceph_osds up, acting;
+	enum calc_target_result ct_res;
 	int err;
-	bool was_paused;
 
 	dout("map_request %p tid %lld\n", req, req->r_tid);
 
-	err = __calc_request_pg(osdc->osdmap, req, &pgid);
-	if (err) {
+	ct_res = calc_target(osdc, &req->r_t, NULL, force_resend);
+	switch (ct_res) {
+	case CALC_TARGET_POOL_DNE:
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
-		return err;
-	}
-	req->r_pgid = pgid;
-
-	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
-
-	was_paused = req->r_paused;
-	req->r_paused = __req_should_be_paused(osdc, req);
-	if (was_paused && !req->r_paused)
-		force_resend = 1;
-
-	if ((!force_resend &&
-	     req->r_osd && req->r_osd->o_osd == acting.primary &&
-	     req->r_sent >= req->r_osd->o_incarnation &&
-	     req->r_num_pg_osds == acting.size &&
-	     memcmp(req->r_pg_osds, acting.osds,
-		    acting.size * sizeof(acting.osds[0])) == 0) ||
-	    (req->r_osd == NULL && acting.primary == -1) ||
-	    req->r_paused)
+		return -EIO;
+	case CALC_TARGET_NO_ACTION:
 		return 0;  /* no change */
+	default:
+		BUG_ON(ct_res != CALC_TARGET_NEED_RESEND);
+	}
 
 	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-	     req->r_tid, pgid.pool, pgid.seed, acting.primary,
+	     req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
-	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting.osds,
-	       acting.size * sizeof(acting.osds[0]));
-	req->r_num_pg_osds = acting.size;
-
 	if (req->r_osd) {
 		__cancel_request(req);
 		list_del_init(&req->r_osd_item);
@@ -1557,22 +1484,22 @@ static int __map_request(struct ceph_osd_client *osdc,
 		req->r_osd = NULL;
 	}
 
-	req->r_osd = lookup_osd(&osdc->osds, acting.primary);
-	if (!req->r_osd && acting.primary >= 0) {
+	req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd);
+	if (!req->r_osd && req->r_t.osd >= 0) {
 		err = -ENOMEM;
-		req->r_osd = create_osd(osdc, acting.primary);
+		req->r_osd = create_osd(osdc, req->r_t.osd);
 		if (!req->r_osd) {
 			list_move(&req->r_req_lru_item, &osdc->req_notarget);
 			goto out;
 		}
 
 		dout("map_request osd %p is osd%d\n", req->r_osd,
-		     acting.primary);
+		     req->r_osd->o_osd);
 		insert_osd(&osdc->osds, req->r_osd);
 
 		ceph_con_open(&req->r_osd->o_con,
-			      CEPH_ENTITY_TYPE_OSD, acting.primary,
-			      &osdc->osdmap->osd_addr[acting.primary]);
+			      CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd,
+			      &osdc->osdmap->osd_addr[req->r_osd->o_osd]);
 	}
 
 	__enqueue_request(req);
@@ -1592,15 +1519,15 @@ static void __send_request(struct ceph_osd_client *osdc,
 
 	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
 	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+	     req->r_t.pgid.pool, req->r_t.pgid.seed);
 
 	/* fill in message content that changes each time we send it */
 	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
 	put_unaligned_le32(req->r_flags, req->r_request_flags);
-	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
+	put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool);
 	p = req->r_request_pgid;
-	ceph_encode_64(&p, req->r_pgid.pool);
-	ceph_encode_32(&p, req->r_pgid.seed);
+	ceph_encode_64(&p, req->r_t.pgid.pool);
+	ceph_encode_32(&p, req->r_t.pgid.seed);
 	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
 	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
 	       sizeof(req->r_reassert_version));
@@ -1963,7 +1890,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 		__unregister_request(osdc, req);
 
-		ceph_oloc_copy(&req->r_target_oloc, &redir.oloc);
+		ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc);
 
 		/*
 		 * Start redirect requests with nofail=true.  If
-- 
cgit v1.2.3


From bb873b539154ab51893430b4ad6ba4051775276a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 26 May 2016 00:29:52 +0200
Subject: libceph: switch to calc_target(), part 2

The crux of this is getting rid of ceph_osdc_build_request(), so that
MOSDOp can be encoded not before but after calc_target() calculates the
actual target.  Encoding now happens within ceph_osdc_start_request().

Also nuked is the accompanying bunch of pointers into the encoded
buffer that was used to update fields on each send - instead, the
entire front is re-encoded.  If we want to support target->name_len !=
base->name_len in the future, there is no other way, because oid is
surrounded by other fields in the encoded buffer.

Encoding OSD ops and adding data items to the request message were
mixed together in osd_req_encode_op().  While we want to re-encode OSD
ops, we don't want to add duplicate data items to the message when
resending, so all call to ceph_osdc_msg_data_add() are factored out
into a new setup_request_data().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  18 +-
 fs/ceph/addr.c                  |  16 +-
 fs/ceph/file.c                  |  16 +-
 include/linux/ceph/osd_client.h |  29 ++--
 include/linux/ceph/rados.h      |   7 +
 net/ceph/debugfs.c              |  61 ++++---
 net/ceph/osd_client.c           | 355 ++++++++++++++++++++--------------------
 7 files changed, 247 insertions(+), 255 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f3ea927f93de..0e598916e048 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1896,27 +1896,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	u64 snap_id;
-
-	rbd_assert(osd_req != NULL);
 
-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			NULL, snap_id, NULL);
+	if (img_request)
+		osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	struct ceph_snap_context *snapc;
-	struct timespec mtime = CURRENT_TIME;
-
-	rbd_assert(osd_req != NULL);
 
-	snapc = img_request ? img_request->snapc : NULL;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			snapc, CEPH_NOSNAP, &mtime);
+	osd_req->r_mtime = CURRENT_TIME;
+	osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c5d75486823b..59b3c3fbd3bd 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -1063,10 +1061,7 @@ new_request:
 			pages = NULL;
 		}
 
-		vino = ceph_vino(inode);
-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
-					&inode->i_mtime);
-
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1614,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1652,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 			goto out_put;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1790,12 +1785,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 36b4a41dfa67..52e4b72dd5de 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -727,8 +727,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_ops[0] = orig_req->r_ops[0];
 	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
-	ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
-				snapc, CEPH_NOSNAP, &aio_req->mtime);
+	req->r_mtime = aio_req->mtime;
+	req->r_data_offset = req->r_ops[0].extent.offset;
 
 	ceph_osdc_put_request(orig_req);
 
@@ -882,14 +882,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 					(pos+len) | (PAGE_SIZE - 1));
 
 			osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+			req->r_mtime = mtime;
 		}
 
-
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
 						 false, false);
 
-		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
 		if (aio_req) {
 			aio_req->total_len += len;
 			aio_req->num_reqs++;
@@ -1074,9 +1072,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
 						false, true);
 
-		/* BUG_ON(vino.snap != CEPH_NOSNAP); */
-		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
+		req->r_mtime = mtime;
 		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 		if (!ret)
 			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1532,9 +1528,7 @@ static int ceph_zero_partial_object(struct inode *inode,
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
-				&inode->i_mtime);
-
+	req->r_mtime = inode->i_mtime;
 	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03bf9d9e1517..67a37d98e0ca 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -104,7 +104,7 @@ struct ceph_osd_req_op {
 			struct ceph_osd_data response_data;
 			__u8 class_len;
 			__u8 method_len;
-			__u8 argc;
+			u32 indata_len;
 		} cls;
 		struct {
 			u64 cookie;
@@ -162,14 +162,6 @@ struct ceph_osd_request {
 	/* request osd ops array  */
 	unsigned int		r_num_ops;
 
-	/* these are updated on each send */
-	__le32           *r_request_osdmap_epoch;
-	__le32           *r_request_flags;
-	__le64           *r_request_pool;
-	void             *r_request_pgid;
-	__le32           *r_request_attempts;
-	struct ceph_eversion *r_request_reassert_version;
-
 	int               r_result;
 	int               r_got_reply;
 	int		  r_linger;
@@ -180,16 +172,22 @@ struct ceph_osd_request {
 	struct completion r_completion, r_safe_completion;
 	ceph_osdc_callback_t r_callback;
 	ceph_osdc_unsafe_callback_t r_unsafe_callback;
-	struct ceph_eversion r_reassert_version;
 	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
 	void *r_priv;			      /* ditto */
 
-	u64               r_snapid;
-	unsigned long     r_stamp;            /* send OR check time */
+	/* set by submitter */
+	u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
+	struct ceph_snap_context *r_snapc;    /* for writes */
+	struct timespec r_mtime;              /* ditto */
+	u64 r_data_offset;                    /* ditto */
 
-	struct ceph_snap_context *r_snapc;    /* snap context for writes */
+	/* internal */
+	unsigned long r_stamp;                /* jiffies, send or check time */
+	int r_attempts;
+	struct ceph_eversion r_replay_version; /* aka reassert_version */
+	u32 r_last_force_resend;
 
 	struct ceph_osd_req_op r_ops[];
 };
@@ -334,11 +332,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       gfp_t gfp_flags);
 int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
 
-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-				    struct ceph_snap_context *snapc,
-				    u64 snap_id,
-				    struct timespec *mtime);
-
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
 				      struct ceph_vino vino,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index f28ed864e682..28740a58f32c 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -394,6 +394,13 @@ enum {
 	CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
 	CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
 	CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
+	CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000,  /* map snap direct to clone id */
+	CEPH_OSD_FLAG_ENFORCE_SNAPC   = 0x100000,  /* use snapc provided even if
+						      pool uses pool snaps */
+	CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
+	CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
+	CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
+	CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
 };
 
 enum {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 0c11ab5f8c30..6d3ff713edeb 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -145,6 +145,43 @@ static int monc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
+{
+	int i;
+
+	seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+	for (i = 0; i < t->up.size; i++)
+		seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+	seq_printf(s, "]/%d\t[", t->up.primary);
+	for (i = 0; i < t->acting.size; i++)
+		seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+	seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+		   t->target_oid.name_len, t->target_oid.name, t->flags);
+	if (t->paused)
+		seq_puts(s, "\tP");
+}
+
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+	int i;
+
+	seq_printf(s, "%llu\t", req->r_tid);
+	dump_target(s, &req->r_t);
+
+	seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+		   le32_to_cpu(req->r_replay_version.epoch),
+		   le64_to_cpu(req->r_replay_version.version));
+
+	for (i = 0; i < req->r_num_ops; i++) {
+		struct ceph_osd_req_op *op = &req->r_ops[i];
+
+		seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+			   ceph_osd_op_name(op->op));
+	}
+
+	seq_putc(s, '\n');
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -154,32 +191,10 @@ static int osdc_show(struct seq_file *s, void *pp)
 	mutex_lock(&osdc->request_mutex);
 	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
 		struct ceph_osd_request *req;
-		unsigned int i;
-		int opcode;
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
 
-		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1,
-			   req->r_t.pgid.pool, req->r_t.pgid.seed);
-
-		seq_printf(s, "%*pE", req->r_base_oid.name_len,
-			   req->r_base_oid.name);
-
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
-
-		for (i = 0; i < req->r_num_ops; i++) {
-			opcode = req->r_ops[i].op;
-			seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
-				   ceph_osd_op_name(opcode));
-		}
-
-		seq_printf(s, "\n");
+		dump_request(s, req);
 	}
 	mutex_unlock(&osdc->request_mutex);
 	return 0;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 013101598c41..8a008f083283 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -34,8 +34,6 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
 					struct ceph_osd_request *req);
 static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req);
 
 /*
  * Implement client access to distributed object storage cluster.
@@ -209,6 +207,8 @@ void osd_req_op_cls_request_data_pagelist(
 
 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 	ceph_osd_data_pagelist_init(osd_data, pagelist);
+	osd_req->r_ops[which].cls.indata_len += pagelist->length;
+	osd_req->r_ops[which].indata_len += pagelist->length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
 
@@ -221,6 +221,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 				pages_from_pool, own_pages);
+	osd_req->r_ops[which].cls.indata_len += length;
+	osd_req->r_ops[which].indata_len += length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
 
@@ -610,8 +612,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 
 	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
 
-	op->cls.argc = 0;	/* currently unused */
-
 	op->indata_len = payload_len;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -709,16 +709,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 	}
 }
 
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
-			      struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+			     const struct ceph_osd_req_op *src)
 {
-	struct ceph_osd_req_op *src;
-	struct ceph_osd_data *osd_data;
-	u64 request_data_len = 0;
-	u64 data_length;
-
-	BUG_ON(which >= req->r_num_ops);
-	src = &req->r_ops[which];
 	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
 		pr_err("unrecognized osd opcode %d\n", src->op);
 
@@ -727,49 +720,23 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
 	switch (src->op) {
 	case CEPH_OSD_OP_STAT:
-		osd_data = &src->raw_data_in;
-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
 		break;
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_ZERO:
 	case CEPH_OSD_OP_TRUNCATE:
-		if (src->op == CEPH_OSD_OP_WRITE ||
-		    src->op == CEPH_OSD_OP_WRITEFULL)
-			request_data_len = src->extent.length;
 		dst->extent.offset = cpu_to_le64(src->extent.offset);
 		dst->extent.length = cpu_to_le64(src->extent.length);
 		dst->extent.truncate_size =
 			cpu_to_le64(src->extent.truncate_size);
 		dst->extent.truncate_seq =
 			cpu_to_le32(src->extent.truncate_seq);
-		osd_data = &src->extent.osd_data;
-		if (src->op == CEPH_OSD_OP_WRITE ||
-		    src->op == CEPH_OSD_OP_WRITEFULL)
-			ceph_osdc_msg_data_add(req->r_request, osd_data);
-		else
-			ceph_osdc_msg_data_add(req->r_reply, osd_data);
 		break;
 	case CEPH_OSD_OP_CALL:
 		dst->cls.class_len = src->cls.class_len;
 		dst->cls.method_len = src->cls.method_len;
-		osd_data = &src->cls.request_info;
-		ceph_osdc_msg_data_add(req->r_request, osd_data);
-		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
-		request_data_len = osd_data->pagelist->length;
-
-		osd_data = &src->cls.request_data;
-		data_length = ceph_osd_data_length(osd_data);
-		if (data_length) {
-			BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
-			dst->cls.indata_len = cpu_to_le32(data_length);
-			ceph_osdc_msg_data_add(req->r_request, osd_data);
-			src->indata_len += data_length;
-			request_data_len += data_length;
-		}
-		osd_data = &src->cls.response_data;
-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
 		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
@@ -791,9 +758,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
 		dst->xattr.cmp_op = src->xattr.cmp_op;
 		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		osd_data = &src->xattr.osd_data;
-		ceph_osdc_msg_data_add(req->r_request, osd_data);
-		request_data_len = osd_data->pagelist->length;
 		break;
 	case CEPH_OSD_OP_CREATE:
 	case CEPH_OSD_OP_DELETE:
@@ -810,7 +774,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 	dst->flags = cpu_to_le32(src->flags);
 	dst->payload_len = cpu_to_le32(src->indata_len);
 
-	return request_data_len;
+	return src->indata_len;
 }
 
 /*
@@ -852,8 +816,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		goto fail;
 	}
 
-	req->r_flags = flags;
-
 	/* calculate max write size */
 	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
 	if (r)
@@ -877,9 +839,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 				       truncate_size, truncate_seq);
 	}
 
+	req->r_flags = flags;
 	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
 	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
 
+	req->r_snapid = vino.snap;
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		req->r_data_offset = off;
+
 	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
 	if (r)
 		goto fail;
@@ -1509,37 +1476,173 @@ out:
 	return err;
 }
 
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static void __send_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
+static void setup_request_data(struct ceph_osd_request *req,
+			       struct ceph_msg *msg)
 {
-	void *p;
+	u32 data_len = 0;
+	int i;
+
+	if (!list_empty(&msg->data))
+		return;
 
-	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-	     req->r_t.pgid.pool, req->r_t.pgid.seed);
+	WARN_ON(msg->data_length);
+	for (i = 0; i < req->r_num_ops; i++) {
+		struct ceph_osd_req_op *op = &req->r_ops[i];
+
+		switch (op->op) {
+		/* request */
+		case CEPH_OSD_OP_WRITE:
+		case CEPH_OSD_OP_WRITEFULL:
+			WARN_ON(op->indata_len != op->extent.length);
+			ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+			break;
+		case CEPH_OSD_OP_SETXATTR:
+		case CEPH_OSD_OP_CMPXATTR:
+			WARN_ON(op->indata_len != op->xattr.name_len +
+						  op->xattr.value_len);
+			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+			break;
+
+		/* reply */
+		case CEPH_OSD_OP_STAT:
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->raw_data_in);
+			break;
+		case CEPH_OSD_OP_READ:
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->extent.osd_data);
+			break;
+
+		/* both */
+		case CEPH_OSD_OP_CALL:
+			WARN_ON(op->indata_len != op->cls.class_len +
+						  op->cls.method_len +
+						  op->cls.indata_len);
+			ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+			/* optional, can be NONE */
+			ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+			/* optional, can be NONE */
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->cls.response_data);
+			break;
+		}
+
+		data_len += op->indata_len;
+	}
 
-	/* fill in message content that changes each time we send it */
-	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
-	put_unaligned_le32(req->r_flags, req->r_request_flags);
-	put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool);
-	p = req->r_request_pgid;
+	WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front_alloc_len;
+	u32 data_len = 0;
+	int i;
+
+	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+		/* snapshots aren't writeable */
+		WARN_ON(req->r_snapid != CEPH_NOSNAP);
+	} else {
+		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+			req->r_data_offset || req->r_snapc);
+	}
+
+	setup_request_data(req, msg);
+
+	ceph_encode_32(&p, 1); /* client_inc, always 1 */
+	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+	ceph_encode_32(&p, req->r_flags);
+	ceph_encode_timespec(p, &req->r_mtime);
+	p += sizeof(struct ceph_timespec);
+	/* aka reassert_version */
+	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+	p += sizeof(req->r_replay_version);
+
+	/* oloc */
+	ceph_encode_8(&p, 4);
+	ceph_encode_8(&p, 4);
+	ceph_encode_32(&p, 8 + 4 + 4);
+	ceph_encode_64(&p, req->r_t.target_oloc.pool);
+	ceph_encode_32(&p, -1); /* preferred */
+	ceph_encode_32(&p, 0); /* key len */
+
+	/* pgid */
+	ceph_encode_8(&p, 1);
 	ceph_encode_64(&p, req->r_t.pgid.pool);
 	ceph_encode_32(&p, req->r_t.pgid.seed);
-	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
-	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
-	       sizeof(req->r_reassert_version));
+	ceph_encode_32(&p, -1); /* preferred */
 
-	req->r_stamp = jiffies;
-	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+	/* oid */
+	ceph_encode_32(&p, req->r_t.target_oid.name_len);
+	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+	p += req->r_t.target_oid.name_len;
 
-	ceph_msg_get(req->r_request); /* send consumes a ref */
+	/* ops, can imply data */
+	ceph_encode_16(&p, req->r_num_ops);
+	for (i = 0; i < req->r_num_ops; i++) {
+		data_len += osd_req_encode_op(p, &req->r_ops[i]);
+		p += sizeof(struct ceph_osd_op);
+	}
 
-	req->r_sent = req->r_osd->o_incarnation;
+	ceph_encode_64(&p, req->r_snapid); /* snapid */
+	if (req->r_snapc) {
+		ceph_encode_64(&p, req->r_snapc->seq);
+		ceph_encode_32(&p, req->r_snapc->num_snaps);
+		for (i = 0; i < req->r_snapc->num_snaps; i++)
+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
+	} else {
+		ceph_encode_64(&p, 0); /* snap_seq */
+		ceph_encode_32(&p, 0); /* snaps len */
+	}
+
+	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	msg->hdr.data_len = cpu_to_le32(data_len);
+	/*
+	 * The header "data_off" is a hint to the receiver allowing it
+	 * to align received data into its buffers such that there's no
+	 * need to re-copy it before writing it to disk (direct I/O).
+	 */
+	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
 
-	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
+	     req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
+	     req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
+}
+
+/*
+ * @req has to be assigned a tid and registered.
+ */
+static void send_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd *osd = req->r_osd;
+
+	WARN_ON(osd->o_osd != req->r_t.osd);
+
+	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+	if (req->r_attempts)
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+	else
+		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+	encode_request(req, req->r_request);
+
+	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+	     req->r_t.osd, req->r_flags, req->r_attempts);
+
+	req->r_t.paused = false;
+	req->r_stamp = jiffies;
+	req->r_attempts++;
+
+	req->r_sent = osd->o_incarnation;
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
 }
 
 /*
@@ -1550,8 +1653,10 @@ static void __send_queued(struct ceph_osd_client *osdc)
 	struct ceph_osd_request *req, *tmp;
 
 	dout("__send_queued\n");
-	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
-		__send_request(osdc, req);
+	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
+		list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+		send_request(req);
+	}
 }
 
 /*
@@ -1915,8 +2020,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			req->r_result = bytes;
 
 		/* in case this is a write and we need to replay, */
-		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
-		req->r_reassert_version.version = cpu_to_le64(reassert_version);
+		req->r_replay_version.epoch = cpu_to_le32(reassert_epoch);
+		req->r_replay_version.version = cpu_to_le64(reassert_version);
 
 		req->r_got_reply = 1;
 	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
@@ -2432,105 +2537,6 @@ bad:
 	pr_err("osdc handle_watch_notify corrupt msg\n");
 }
 
-/*
- * build new request AND message
- *
- */
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-				struct ceph_snap_context *snapc, u64 snap_id,
-				struct timespec *mtime)
-{
-	struct ceph_msg *msg = req->r_request;
-	void *p;
-	size_t msg_size;
-	int flags = req->r_flags;
-	u64 data_len;
-	unsigned int i;
-
-	req->r_snapid = snap_id;
-	WARN_ON(snapc != req->r_snapc);
-
-	/* encode request */
-	msg->hdr.version = cpu_to_le16(4);
-
-	p = msg->front.iov_base;
-	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
-	req->r_request_osdmap_epoch = p;
-	p += 4;
-	req->r_request_flags = p;
-	p += 4;
-	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(p, mtime);
-	p += sizeof(struct ceph_timespec);
-	req->r_request_reassert_version = p;
-	p += sizeof(struct ceph_eversion); /* will get filled in */
-
-	/* oloc */
-	ceph_encode_8(&p, 4);
-	ceph_encode_8(&p, 4);
-	ceph_encode_32(&p, 8 + 4 + 4);
-	req->r_request_pool = p;
-	p += 8;
-	ceph_encode_32(&p, -1);  /* preferred */
-	ceph_encode_32(&p, 0);   /* key len */
-
-	ceph_encode_8(&p, 1);
-	req->r_request_pgid = p;
-	p += 8 + 4;
-	ceph_encode_32(&p, -1);  /* preferred */
-
-	/* oid */
-	ceph_encode_32(&p, req->r_base_oid.name_len);
-	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
-	dout("oid %*pE len %d\n", req->r_base_oid.name_len,
-	     req->r_base_oid.name, req->r_base_oid.name_len);
-	p += req->r_base_oid.name_len;
-
-	/* ops--can imply data */
-	ceph_encode_16(&p, (u16)req->r_num_ops);
-	data_len = 0;
-	for (i = 0; i < req->r_num_ops; i++) {
-		data_len += osd_req_encode_op(req, p, i);
-		p += sizeof(struct ceph_osd_op);
-	}
-
-	/* snaps */
-	ceph_encode_64(&p, req->r_snapid);
-	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
-	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
-	if (req->r_snapc) {
-		for (i = 0; i < req->r_snapc->num_snaps; i++) {
-			ceph_encode_64(&p, req->r_snapc->snaps[i]);
-		}
-	}
-
-	req->r_request_attempts = p;
-	p += 4;
-
-	/* data */
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		u16 data_off;
-
-		/*
-		 * The header "data_off" is a hint to the receiver
-		 * allowing it to align received data into its
-		 * buffers such that there's no need to re-copy
-		 * it before writing it to disk (direct I/O).
-		 */
-		data_off = (u16) (off & 0xffff);
-		req->r_request->hdr.data_off = cpu_to_le16(data_off);
-	}
-	req->r_request->hdr.data_len = cpu_to_le32(data_len);
-
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
-
-	dout("build_request msg_size was %d\n", (int)msg_size);
-}
-EXPORT_SYMBOL(ceph_osdc_build_request);
-
 /*
  * Register request, send initial attempt.
  */
@@ -2749,15 +2755,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 		return PTR_ERR(req);
 
 	/* it may be a short read due to an object boundary */
-
 	osd_req_op_extent_osd_data_pages(req, 0,
 				pages, *plen, page_align, false, false);
 
 	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
 	     off, *plen, *plen, page_align);
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	rc = ceph_osdc_start_request(osdc, req, false);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
@@ -2783,7 +2786,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	int rc = 0;
 	int page_align = off & ~PAGE_MASK;
 
-	BUG_ON(vino.snap != CEPH_NOSNAP);	/* snapshots aren't writeable */
 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
 				    CEPH_OSD_OP_WRITE,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2797,8 +2799,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				false, false);
 	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
 
-	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+	req->r_mtime = *mtime;
 	rc = ceph_osdc_start_request(osdc, req, true);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
-- 
cgit v1.2.3


From 85e084feb47349d62989efe1713a8723af95f4ea Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:24 +0200
Subject: libceph: drop msg argument from ceph_osdc_callback_t

finish_read(), its only user, uses it to get to hdr.data_len, which is
what ->r_result is set to on success.  This gains us the ability to
safely call callbacks from contexts other than reply, e.g. map check.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             | 5 ++---
 fs/ceph/addr.c                  | 9 ++++-----
 fs/ceph/file.c                  | 7 +++----
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 4 ++--
 5 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0e598916e048..82b03aa509e6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1828,13 +1828,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 		obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 59b3c3fbd3bd..a11756a39471 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -598,8 +598,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 52e4b72dd5de..e75fd0b028e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -616,8 +616,7 @@ static void ceph_aio_complete(struct inode *inode,
 	kfree(aio_req);
 }
 
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
-				  struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
 {
 	int rc = req->r_result;
 	struct inode *inode = req->r_inode;
@@ -740,7 +739,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 out:
 	if (ret < 0) {
 		req->r_result = ret;
-		ceph_aio_complete_req(req, NULL);
+		ceph_aio_complete_req(req);
 	}
 
 	ceph_put_snap_context(snapc);
@@ -961,7 +960,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 							      req, false);
 			if (ret < 0) {
 				req->r_result = ret;
-				ceph_aio_complete_req(req, NULL);
+				ceph_aio_complete_req(req);
 			}
 		}
 		return -EIOCBQUEUED;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 67a37d98e0ca..3bebd60e7f9f 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,8 +20,7 @@ struct ceph_osd_client;
 /*
  * completion callback for async writepages
  */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-				     struct ceph_msg *);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD	-1
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8a008f083283..2a30c0bb3045 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2048,7 +2048,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
 			req->r_unsafe_callback(req, true);
 		if (req->r_callback)
-			req->r_callback(req, msg);
+			req->r_callback(req);
 		else
 			complete_all(&req->r_completion);
 	}
@@ -2072,7 +2072,7 @@ bad_put:
 	req->r_result = -EIO;
 	__unregister_request(osdc, req);
 	if (req->r_callback)
-		req->r_callback(req, msg);
+		req->r_callback(req);
 	else
 		complete_all(&req->r_completion);
 	complete_request(req);
-- 
cgit v1.2.3


From fe5da05e979830b43b115d8a18ead521d507c783 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:24 +0200
Subject: libceph: redo callbacks and factor out MOSDOpReply decoding

If you specify ACK | ONDISK and set ->r_unsafe_callback, both
->r_callback and ->r_unsafe_callback(true) are called on ack.  This is
very confusing.  Redo this so that only one of them is called:

    ->r_unsafe_callback(true), on ack
    ->r_unsafe_callback(false), on commit

or

    ->r_callback, on ack|commit

Decode everything in decode_MOSDOpReply() to reduce clutter.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c                  |   3 +-
 fs/ceph/file.c                  |   2 +
 include/linux/ceph/osd_client.h |   5 +-
 net/ceph/osd_client.c           | 362 +++++++++++++++++++++++-----------------
 4 files changed, 215 insertions(+), 157 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a11756a39471..f47418477629 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1765,8 +1765,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
 	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
 	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e75fd0b028e9..30fd49eb25b4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -770,6 +770,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 		list_add_tail(&req->r_unsafe_item,
 			      &ci->i_unsafe_writes);
 		spin_unlock(&ci->i_unsafe_lock);
+
+		complete_all(&req->r_completion);
 	} else {
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_item);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 3bebd60e7f9f..2415dc0cb008 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -162,13 +162,14 @@ struct ceph_osd_request {
 	unsigned int		r_num_ops;
 
 	int               r_result;
-	int               r_got_reply;
+	bool              r_got_reply;
 	int		  r_linger;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
 	bool              r_mempool;
-	struct completion r_completion, r_safe_completion;
+	struct completion r_completion;
+	struct completion r_safe_completion;  /* fsync waiter */
 	ceph_osdc_callback_t r_callback;
 	ceph_osdc_unsafe_callback_t r_unsafe_callback;
 	struct list_head  r_unsafe_item;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2a30c0bb3045..baf2844b00d6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1693,6 +1693,14 @@ static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
 	return 0;
 }
 
+static void __complete_request(struct ceph_osd_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(req);
+	else
+		complete_all(&req->r_completion);
+}
+
 /*
  * Timeout callback, called every N seconds when 1 or more osd
  * requests has been active for more than N seconds.  When this
@@ -1875,107 +1883,76 @@ e_inval:
 	goto out;
 }
 
-static void complete_request(struct ceph_osd_request *req)
-{
-	complete_all(&req->r_safe_completion);  /* fsync waiter */
-}
+struct MOSDOpReply {
+	struct ceph_pg pgid;
+	u64 flags;
+	int result;
+	u32 epoch;
+	int num_ops;
+	u32 outdata_len[CEPH_OSD_MAX_OPS];
+	s32 rval[CEPH_OSD_MAX_OPS];
+	int retry_attempt;
+	struct ceph_eversion replay_version;
+	u64 user_version;
+	struct ceph_request_redirect redirect;
+};
 
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
 {
-	void *p, *end;
-	struct ceph_osd_request *req;
-	struct ceph_request_redirect redir;
-	u64 tid;
-	int object_len;
-	unsigned int numops;
-	int payload_len, flags;
-	s32 result;
-	s32 retry_attempt;
-	struct ceph_pg pg;
-	int err;
-	u32 reassert_epoch;
-	u64 reassert_version;
-	u32 osdmap_epoch;
-	int already_completed;
-	u32 bytes;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	u16 version = le16_to_cpu(msg->hdr.version);
+	struct ceph_eversion bad_replay_version;
 	u8 decode_redir;
-	unsigned int i;
-
-	tid = le64_to_cpu(msg->hdr.tid);
-	dout("handle_reply %p tid %llu\n", msg, tid);
-
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	u32 len;
+	int ret;
+	int i;
 
-	ceph_decode_need(&p, end, 4, bad);
-	object_len = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, object_len, bad);
-	p += object_len;
+	ceph_decode_32_safe(&p, end, len, e_inval);
+	ceph_decode_need(&p, end, len, e_inval);
+	p += len; /* skip oid */
 
-	err = ceph_decode_pgid(&p, end, &pg);
-	if (err)
-		goto bad;
+	ret = ceph_decode_pgid(&p, end, &m->pgid);
+	if (ret)
+		return ret;
 
-	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
-	flags = ceph_decode_64(&p);
-	result = ceph_decode_32(&p);
-	reassert_epoch = ceph_decode_32(&p);
-	reassert_version = ceph_decode_64(&p);
-	osdmap_epoch = ceph_decode_32(&p);
+	ceph_decode_64_safe(&p, end, m->flags, e_inval);
+	ceph_decode_32_safe(&p, end, m->result, e_inval);
+	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+	p += sizeof(bad_replay_version);
+	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
 
-	/* lookup */
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	req = lookup_request(&osdc->requests, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		goto bad_mutex;
-	}
-	ceph_osdc_get_request(req);
+	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+		goto e_inval;
 
-	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
-	     req, result);
-
-	ceph_decode_need(&p, end, 4, bad_put);
-	numops = ceph_decode_32(&p);
-	if (numops > CEPH_OSD_MAX_OPS)
-		goto bad_put;
-	if (numops != req->r_num_ops)
-		goto bad_put;
-	payload_len = 0;
-	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
-	for (i = 0; i < numops; i++) {
+	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+			 e_inval);
+	for (i = 0; i < m->num_ops; i++) {
 		struct ceph_osd_op *op = p;
-		int len;
 
-		len = le32_to_cpu(op->payload_len);
-		req->r_ops[i].outdata_len = len;
-		dout(" op %d has %d bytes\n", i, len);
-		payload_len += len;
+		m->outdata_len[i] = le32_to_cpu(op->payload_len);
 		p += sizeof(*op);
 	}
-	bytes = le32_to_cpu(msg->hdr.data_len);
-	if (payload_len != bytes) {
-		pr_warn("sum of op payload lens %d != data_len %d\n",
-			payload_len, bytes);
-		goto bad_put;
-	}
 
-	ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
-	retry_attempt = ceph_decode_32(&p);
-	for (i = 0; i < numops; i++)
-		req->r_ops[i].rval = ceph_decode_32(&p);
+	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+	for (i = 0; i < m->num_ops; i++)
+		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
 
-	if (le16_to_cpu(msg->hdr.version) >= 6) {
-		p += 8 + 4; /* skip replay_version */
-		p += 8; /* skip user_version */
+	if (version >= 5) {
+		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+		memcpy(&m->replay_version, p, sizeof(m->replay_version));
+		p += sizeof(m->replay_version);
+		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+	} else {
+		m->replay_version = bad_replay_version; /* struct */
+		m->user_version = le64_to_cpu(m->replay_version.version);
+	}
 
-		if (le16_to_cpu(msg->hdr.version) >= 7)
-			ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+	if (version >= 6) {
+		if (version >= 7)
+			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
 		else
 			decode_redir = 1;
 	} else {
@@ -1983,19 +1960,96 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	}
 
 	if (decode_redir) {
-		err = ceph_redirect_decode(&p, end, &redir);
-		if (err)
-			goto bad_put;
+		ret = ceph_redirect_decode(&p, end, &m->redirect);
+		if (ret)
+			return ret;
 	} else {
-		redir.oloc.pool = -1;
+		ceph_oloc_init(&m->redirect.oloc);
 	}
 
-	if (!ceph_oloc_empty(&redir.oloc)) {
-		dout("redirect pool %lld\n", redir.oloc.pool);
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+/*
+ * We are done with @req if
+ *   - @m is a safe reply, or
+ *   - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+			 const struct MOSDOpReply *m)
+{
+	return (m->result < 0 ||
+		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
+		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
 
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set?	yes			no
+ *
+ * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
+ * any or needed/got safe)	r_safe_completion	r_safe_completion
+ *
+ * first reply is unsafe	r_unsafe_cb(true)	(nothing)
+ *
+ * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
+ *				r_safe_completion	r_safe_completion
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	struct ceph_osd_request *req;
+	struct MOSDOpReply m;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+	u32 data_len = 0;
+	bool already_acked;
+	int ret;
+	int i;
+
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	req = lookup_request(&osdc->requests, tid);
+	if (!req) {
+		dout("%s no tid %llu\n", __func__, tid);
+		goto out_unlock;
+	}
+	ceph_osdc_get_request(req);
+
+	ret = decode_MOSDOpReply(msg, &m);
+	if (ret) {
+		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+		       req->r_tid, ret);
+		ceph_msg_dump(msg);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+	     le64_to_cpu(m.replay_version.version), m.user_version);
+
+	if (m.retry_attempt >= 0) {
+		if (m.retry_attempt != req->r_attempts - 1) {
+			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+			     req, req->r_tid, m.retry_attempt,
+			     req->r_attempts - 1);
+			goto out_put;
+		}
+	} else {
+		WARN_ON(1); /* MOSDOpReply v4 is assumed */
+	}
+
+	if (!ceph_oloc_empty(&m.redirect.oloc)) {
+		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+		     m.redirect.oloc.pool);
 		__unregister_request(osdc, req);
 
-		ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc);
+		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
 
 		/*
 		 * Start redirect requests with nofail=true.  If
@@ -2005,85 +2059,85 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		 * successfully.  In the future we might want to follow
 		 * original request's nofail setting here.
 		 */
-		err = __ceph_osdc_start_request(osdc, req, true);
-		BUG_ON(err);
+		ret = __ceph_osdc_start_request(osdc, req, true);
+		BUG_ON(ret);
 
-		goto out_unlock;
+		goto out_put;
 	}
 
-	already_completed = req->r_got_reply;
-	if (!req->r_got_reply) {
-		req->r_result = result;
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		if (req->r_result == 0)
-			req->r_result = bytes;
-
-		/* in case this is a write and we need to replay, */
-		req->r_replay_version.epoch = cpu_to_le32(reassert_epoch);
-		req->r_replay_version.version = cpu_to_le64(reassert_version);
-
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		goto out_unlock;
+	if (m.num_ops != req->r_num_ops) {
+		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+		       req->r_num_ops, req->r_tid);
+		goto fail_request;
 	}
-
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
-		__register_linger_request(osdc, req);
-
-	/* either this is a read, or we got the safe response */
-	if (result < 0 ||
-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+	for (i = 0; i < req->r_num_ops; i++) {
+		dout(" req %p tid %llu op %d rval %d len %u\n", req,
+		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
+		req->r_ops[i].rval = m.rval[i];
+		req->r_ops[i].outdata_len = m.outdata_len[i];
+		data_len += m.outdata_len[i];
+	}
+	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+	     req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+	already_acked = req->r_got_reply;
+	if (!already_acked) {
+		req->r_result = m.result ?: data_len;
+		req->r_replay_version = m.replay_version; /* struct */
+		req->r_got_reply = true;
+	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+		dout("req %p tid %llu dup ack\n", req, req->r_tid);
+		goto out_put;
+	}
+
+	if (done_request(req, &m)) {
 		__unregister_request(osdc, req);
+		if (req->r_linger) {
+			WARN_ON(req->r_unsafe_callback);
+			__register_linger_request(osdc, req);
+		}
+	}
 
 	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
 
-	if (!already_completed) {
-		if (req->r_unsafe_callback &&
-		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
-			req->r_unsafe_callback(req, true);
-		if (req->r_callback)
-			req->r_callback(req);
-		else
-			complete_all(&req->r_completion);
-	}
-
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_unsafe_callback && already_completed)
+	if (done_request(req, &m)) {
+		if (already_acked && req->r_unsafe_callback) {
+			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
 			req->r_unsafe_callback(req, false);
-		complete_request(req);
+		} else {
+			dout("req %p tid %llu cb\n", req, req->r_tid);
+			__complete_request(req);
+		}
+	} else {
+		if (req->r_unsafe_callback) {
+			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
+			req->r_unsafe_callback(req, true);
+		} else {
+			WARN_ON(1);
+		}
 	}
+	if (m.flags & CEPH_OSD_FLAG_ONDISK)
+		complete_all(&req->r_safe_completion);
 
-out:
-	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
 	ceph_osdc_put_request(req);
 	return;
-out_unlock:
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-	goto out;
 
-bad_put:
+fail_request:
 	req->r_result = -EIO;
 	__unregister_request(osdc, req);
-	if (req->r_callback)
-		req->r_callback(req);
-	else
-		complete_all(&req->r_completion);
-	complete_request(req);
+	__complete_request(req);
+	complete_all(&req->r_safe_completion);
+out_put:
 	ceph_osdc_put_request(req);
-bad_mutex:
+out_unlock:
 	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
-bad:
-	pr_err("corrupt osd_op_reply got %d %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
-	ceph_msg_dump(msg);
 }
 
 static void reset_changed_osds(struct ceph_osd_client *osdc)
@@ -2591,7 +2645,9 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 	if (rc < 0) {
 		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
 		ceph_osdc_cancel_request(req);
-		complete_request(req);
+
+		/* kludge - need to to wake ceph_osdc_sync() */
+		complete_all(&req->r_safe_completion);
 		return rc;
 	}
 
-- 
cgit v1.2.3


From e5253a7bde13788d9dc75f42eb47ea119af5609f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:25 +0200
Subject: libceph: allocate dummy osdmap in ceph_osdc_init()

This leads to a simpler osdmap handling code, particularly when dealing
with pi->was_full, which is introduced in a later commit.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  1 +
 net/ceph/osd_client.c       | 22 +++++++++++-----------
 net/ceph/osdmap.c           | 23 ++++++++++++++++++-----
 3 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 420bb7968b25..8468c734d712 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -225,6 +225,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 	return 0;
 }
 
+struct ceph_osdmap *ceph_osdmap_alloc(void);
 extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 					     struct ceph_osdmap *map);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 41dabce9c9c3..9c35fd84a410 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2255,7 +2255,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	struct ceph_fsid fsid;
 	bool was_full;
 
-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	dout("handle_map have %u\n", osdc->osdmap->epoch);
 	p = msg->front.iov_base;
 	end = p + msg->front.iov_len;
 
@@ -2278,7 +2278,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		maplen = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, maplen, bad);
 		next = p + maplen;
-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+		if (osdc->osdmap->epoch+1 == epoch) {
 			dout("applying incremental map %u len %d\n",
 			     epoch, maplen);
 			newmap = osdmap_apply_incremental(&p, next,
@@ -2317,7 +2317,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		if (nr_maps > 1) {
 			dout("skipping non-latest full map %u len %d\n",
 			     epoch, maplen);
-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+		} else if (osdc->osdmap->epoch >= epoch) {
 			dout("skipping full map %u len %d, "
 			     "older than our %u\n", epoch, maplen,
 			     osdc->osdmap->epoch);
@@ -2347,8 +2347,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		nr_maps--;
 	}
 
-	if (!osdc->osdmap)
-		goto bad;
 done:
 	downgrade_write(&osdc->map_sem);
 	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
@@ -2690,7 +2688,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
 	dout("init\n");
 	osdc->client = client;
-	osdc->osdmap = NULL;
 	init_rwsem(&osdc->map_sem);
 	mutex_init(&osdc->request_mutex);
 	osdc->last_tid = 0;
@@ -2709,10 +2706,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->event_count = 0;
 
 	err = -ENOMEM;
+	osdc->osdmap = ceph_osdmap_alloc();
+	if (!osdc->osdmap)
+		goto out;
+
 	osdc->req_mempool = mempool_create_slab_pool(10,
 						     ceph_osd_request_cache);
 	if (!osdc->req_mempool)
-		goto out;
+		goto out_map;
 
 	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
 				PAGE_SIZE, 10, true, "osd_op");
@@ -2741,6 +2742,8 @@ out_msgpool:
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
 	mempool_destroy(osdc->req_mempool);
+out_map:
+	ceph_osdmap_destroy(osdc->osdmap);
 out:
 	return err;
 }
@@ -2760,10 +2763,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	}
 	mutex_unlock(&osdc->request_mutex);
 
-	if (osdc->osdmap) {
-		ceph_osdmap_destroy(osdc->osdmap);
-		osdc->osdmap = NULL;
-	}
+	ceph_osdmap_destroy(osdc->osdmap);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7d4a5b43085e..cde52e94732f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -707,6 +707,23 @@ bad:
 /*
  * osd map
  */
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+	struct ceph_osdmap *map;
+
+	map = kzalloc(sizeof(*map), GFP_NOIO);
+	if (!map)
+		return NULL;
+
+	map->pg_pools = RB_ROOT;
+	map->pool_max = -1;
+	map->pg_temp = RB_ROOT;
+	map->primary_temp = RB_ROOT;
+	mutex_init(&map->crush_scratch_mutex);
+
+	return map;
+}
+
 void ceph_osdmap_destroy(struct ceph_osdmap *map)
 {
 	dout("osdmap_destroy %p\n", map);
@@ -1230,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
 	struct ceph_osdmap *map;
 	int ret;
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
+	map = ceph_osdmap_alloc();
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	map->pg_temp = RB_ROOT;
-	map->primary_temp = RB_ROOT;
-	mutex_init(&map->crush_scratch_mutex);
-
 	ret = osdmap_decode(p, end, map);
 	if (ret) {
 		ceph_osdmap_destroy(map);
-- 
cgit v1.2.3


From 42c1b1240326cbea86f15f5d4ce565d8b54be31f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:25 +0200
Subject: libceph: handle_one_map()

Separate osdmap handling from decoding and iterating over a bag of maps
in a fresh MOSDMap message.  This sets up the scene for the updated OSD
client.

Of particular importance here is the addition of pi->was_full, which
can be used to answer "did this pool go full -> not-full in this map?".
This is the key bit for supporting pool quotas.

We won't be able to downgrade map_sem for much longer, so drop
downgrade_write().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mon_client.h |   1 +
 include/linux/ceph/osdmap.h     |   2 +
 net/ceph/mon_client.c           |   8 ++
 net/ceph/osd_client.c           | 186 ++++++++++++++++++++++++++++------------
 4 files changed, 141 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 330d045e4092..c14e9d861cda 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -115,6 +115,7 @@ extern const char *ceph_sub_str[];
 bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
 			bool continuous);
 void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
 
 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8468c734d712..821e16fff39a 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -45,6 +45,8 @@ struct ceph_pg_pool_info {
 	s64 write_tier; /* wins for read+write ops */
 	u64 flags; /* CEPH_POOL_FLAG_* */
 	char *name;
+
+	bool was_full;  /* for handle_one_map() */
 };
 
 static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index a426a4b03e75..98bfbe1f6807 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -376,6 +376,14 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 }
 EXPORT_SYMBOL(ceph_monc_got_map);
 
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
+{
+	mutex_lock(&monc->mutex);
+	__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+EXPORT_SYMBOL(ceph_monc_renew_subs);
+
 /*
  * Register interest in the next osdmap
  */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 9c35fd84a410..4227c55226c3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1245,6 +1245,21 @@ static bool __pool_full(struct ceph_pg_pool_info *pi)
 	return pi->flags & CEPH_POOL_FLAG_FULL;
 }
 
+static bool have_pool_full(struct ceph_osd_client *osdc)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pi =
+		    rb_entry(n, struct ceph_pg_pool_info, node);
+
+		if (__pool_full(pi))
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * Returns whether a request should be blocked from being sent
  * based on the current osdmap and osd_client settings.
@@ -1639,6 +1654,26 @@ static void __send_queued(struct ceph_osd_client *osdc)
 	}
 }
 
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+	bool continuous = false;
+
+	WARN_ON(!osdc->osdmap->epoch);
+
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
+		dout("%s osdc %p continuous\n", __func__, osdc);
+		continuous = true;
+	} else {
+		dout("%s osdc %p onetime\n", __func__, osdc);
+	}
+
+	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+			       osdc->osdmap->epoch + 1, continuous))
+		ceph_monc_renew_subs(&osdc->client->monc);
+}
+
 /*
  * Caller should hold map_sem for read and request_mutex.
  */
@@ -2119,6 +2154,18 @@ out_unlock:
 	up_read(&osdc->map_sem);
 }
 
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pi =
+		    rb_entry(n, struct ceph_pg_pool_info, node);
+
+		pi->was_full = __pool_full(pi);
+	}
+}
+
 static void reset_changed_osds(struct ceph_osd_client *osdc)
 {
 	struct rb_node *p, *n;
@@ -2237,6 +2284,57 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
 	}
 }
 
+static int handle_one_map(struct ceph_osd_client *osdc,
+			  void *p, void *end, bool incremental)
+{
+	struct ceph_osdmap *newmap;
+	struct rb_node *n;
+	bool skipped_map = false;
+	bool was_full;
+
+	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+	set_pool_was_full(osdc);
+
+	if (incremental)
+		newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+	else
+		newmap = ceph_osdmap_decode(&p, end);
+	if (IS_ERR(newmap))
+		return PTR_ERR(newmap);
+
+	if (newmap != osdc->osdmap) {
+		/*
+		 * Preserve ->was_full before destroying the old map.
+		 * For pools that weren't in the old map, ->was_full
+		 * should be false.
+		 */
+		for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+			struct ceph_pg_pool_info *pi =
+			    rb_entry(n, struct ceph_pg_pool_info, node);
+			struct ceph_pg_pool_info *old_pi;
+
+			old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+			if (old_pi)
+				pi->was_full = old_pi->was_full;
+			else
+				WARN_ON(pi->was_full);
+		}
+
+		if (osdc->osdmap->epoch &&
+		    osdc->osdmap->epoch + 1 < newmap->epoch) {
+			WARN_ON(incremental);
+			skipped_map = true;
+		}
+
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = newmap;
+	}
+
+	was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+	kick_requests(osdc, skipped_map, was_full);
+
+	return 0;
+}
 
 /*
  * Process updated osd map.
@@ -2247,27 +2345,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
  */
 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 {
-	void *p, *end, *next;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
 	u32 nr_maps, maplen;
 	u32 epoch;
-	struct ceph_osdmap *newmap = NULL, *oldmap;
-	int err;
 	struct ceph_fsid fsid;
-	bool was_full;
+	bool handled_incremental = false;
+	bool was_pauserd, was_pausewr;
+	bool pauserd, pausewr;
+	int err;
 
-	dout("handle_map have %u\n", osdc->osdmap->epoch);
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+	down_write(&osdc->map_sem);
 
 	/* verify fsid */
 	ceph_decode_need(&p, end, sizeof(fsid), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
 	if (ceph_check_fsid(osdc->client, &fsid) < 0)
-		return;
-
-	down_write(&osdc->map_sem);
+		goto bad;
 
-	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+	was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+	was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+		      ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+		      have_pool_full(osdc);
 
 	/* incremental maps */
 	ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2277,33 +2377,22 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		epoch = ceph_decode_32(&p);
 		maplen = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, maplen, bad);
-		next = p + maplen;
-		if (osdc->osdmap->epoch+1 == epoch) {
+		if (osdc->osdmap->epoch &&
+		    osdc->osdmap->epoch + 1 == epoch) {
 			dout("applying incremental map %u len %d\n",
 			     epoch, maplen);
-			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
+			err = handle_one_map(osdc, p, p + maplen, true);
+			if (err)
 				goto bad;
-			}
-			BUG_ON(!newmap);
-			if (newmap != osdc->osdmap) {
-				ceph_osdmap_destroy(osdc->osdmap);
-				osdc->osdmap = newmap;
-			}
-			was_full = was_full ||
-				ceph_osdmap_flag(osdc->osdmap,
-						 CEPH_OSDMAP_FULL);
-			kick_requests(osdc, 0, was_full);
+			handled_incremental = true;
 		} else {
 			dout("ignoring incremental map %u len %d\n",
 			     epoch, maplen);
 		}
-		p = next;
+		p += maplen;
 		nr_maps--;
 	}
-	if (newmap)
+	if (handled_incremental)
 		goto done;
 
 	/* full maps */
@@ -2322,50 +2411,35 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			     "older than our %u\n", epoch, maplen,
 			     osdc->osdmap->epoch);
 		} else {
-			int skipped_map = 0;
-
 			dout("taking full map %u len %d\n", epoch, maplen);
-			newmap = ceph_osdmap_decode(&p, p+maplen);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
+			err = handle_one_map(osdc, p, p + maplen, false);
+			if (err)
 				goto bad;
-			}
-			BUG_ON(!newmap);
-			oldmap = osdc->osdmap;
-			osdc->osdmap = newmap;
-			if (oldmap) {
-				if (oldmap->epoch + 1 < newmap->epoch)
-					skipped_map = 1;
-				ceph_osdmap_destroy(oldmap);
-			}
-			was_full = was_full ||
-				ceph_osdmap_flag(osdc->osdmap,
-						 CEPH_OSDMAP_FULL);
-			kick_requests(osdc, skipped_map, was_full);
 		}
 		p += maplen;
 		nr_maps--;
 	}
 
 done:
-	downgrade_write(&osdc->map_sem);
-	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
-			  osdc->osdmap->epoch);
-
 	/*
 	 * subscribe to subsequent osdmap updates if full to ensure
 	 * we find out when we are no longer full and stop returning
 	 * ENOSPC.
 	 */
-	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+	pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+		  ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+		  have_pool_full(osdc);
+	if (was_pauserd || was_pausewr || pauserd || pausewr)
+		maybe_request_map(osdc);
 
 	mutex_lock(&osdc->request_mutex);
 	__send_queued(osdc);
 	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+
+	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+			  osdc->osdmap->epoch);
+	up_write(&osdc->map_sem);
 	wake_up_all(&osdc->client->auth_wq);
 	return;
 
-- 
cgit v1.2.3


From 9dd2845ccb40452d4ac943231ea34aade4a02c68 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:26 +0200
Subject: libceph: protect osdc->osd_lru list with a spinlock

OSD client is getting moved from the big per-client lock to a set of
per-session locks.  The big rwlock would only be held for read most of
the time, so a global osdc->osd_lru needs additional protection.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/osd_client.c           | 29 ++++++++++++++++++-----------
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2415dc0cb008..486d681694c4 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -224,6 +224,7 @@ struct ceph_osd_client {
 	struct mutex           request_mutex;
 	struct rb_root         osds;          /* osds */
 	struct list_head       osd_lru;       /* idle osds */
+	spinlock_t             osd_lru_lock;
 	u64                    last_tid;      /* tid of last request */
 	struct rb_root         requests;      /* pending requests */
 	struct list_head       req_lru;	      /* in-flight lru */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b6950c2c6cc4..d1c8e06f1261 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1101,31 +1101,37 @@ static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	}
 }
 
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-			      struct ceph_osd *osd)
+static void __move_osd_to_lru(struct ceph_osd *osd)
 {
-	dout("%s %p\n", __func__, osd);
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 
+	spin_lock(&osdc->osd_lru_lock);
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	spin_unlock(&osdc->osd_lru_lock);
+
 	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
-				  struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-	dout("%s %p\n", __func__, osd);
-
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests))
-		__move_osd_to_lru(osdc, osd);
+		__move_osd_to_lru(osd);
 }
 
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
-	dout("__remove_osd_from_lru %p\n", osd);
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	spin_lock(&osdc->osd_lru_lock);
 	if (!list_empty(&osd->o_osd_lru))
 		list_del_init(&osd->o_osd_lru);
+	spin_unlock(&osdc->osd_lru_lock);
 }
 
 /*
@@ -1199,7 +1205,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 		ceph_msg_revoke(req->r_request);
 
 		list_del_init(&req->r_osd_item);
-		maybe_move_osd_to_lru(osdc, req->r_osd);
+		maybe_move_osd_to_lru(req->r_osd);
 		if (list_empty(&req->r_linger_osd_item))
 			req->r_osd = NULL;
 	}
@@ -1248,7 +1254,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 
 	if (req->r_osd) {
 		list_del_init(&req->r_linger_osd_item);
-		maybe_move_osd_to_lru(osdc, req->r_osd);
+		maybe_move_osd_to_lru(req->r_osd);
 		if (list_empty(&req->r_osd_item))
 			req->r_osd = NULL;
 	}
@@ -2792,6 +2798,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->last_tid = 0;
 	osdc->osds = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->osd_lru);
+	spin_lock_init(&osdc->osd_lru_lock);
 	osdc->requests = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->req_lru);
 	INIT_LIST_HEAD(&osdc->req_unsent);
-- 
cgit v1.2.3


From 5aea3dcd50215fa9563270251ad7323e2f2490ee Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:26 +0200
Subject: libceph: a major OSD client update

This is a major sync up, up to ~Jewel.  The highlights are:

- per-session request trees (vs a global per-client tree)
- per-session locking (vs a global per-client rwlock)
- homeless OSD session
- no ad-hoc global per-client lists
- support for pool quotas
- foundation for watch/notify v2 support
- foundation for map check (pool deletion detection) support

The switchover is incomplete: lingering requests can be setup and
teared down but aren't ever reestablished.  This functionality is
restored with the introduction of the new lingering infrastructure
(ceph_osd_linger_request, linger_work, etc) in a later commit.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/ioctl.c                 |    8 +-
 fs/ceph/xattr.c                 |    8 +-
 include/linux/ceph/osd_client.h |   18 +-
 net/ceph/debugfs.c              |   34 +-
 net/ceph/osd_client.c           | 1164 +++++++++++++++++++--------------------
 5 files changed, 602 insertions(+), 630 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 1831ad6cf066..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	if (copy_from_user(&dl, arg, sizeof(dl)))
 		return -EFAULT;
 
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0) {
-		up_read(&osdc->map_sem);
+		up_read(&osdc->lock);
 		return -EIO;
 	}
 	dl.file_offset -= dl.object_offset;
@@ -217,7 +217,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
 	r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
 	if (r < 0) {
-		up_read(&osdc->map_sem);
+		up_read(&osdc->lock);
 		return r;
 	}
 
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	} else {
 		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
 	}
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 
 	/* send result back to user */
 	if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9410abdef3ce..5afabc4bf4c7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -75,7 +75,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	char buf[128];
 
 	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
 	if (pool_name) {
 		size_t len = strlen(pool_name);
@@ -107,7 +107,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 				ret = -ERANGE;
 		}
 	}
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 	return ret;
 }
 
@@ -141,13 +141,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
 	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
 	const char *pool_name;
 
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
 	if (pool_name)
 		ret = snprintf(val, size, "%s", pool_name);
 	else
 		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 	return ret;
 }
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 486d681694c4..342f22f1f040 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -33,12 +33,13 @@ struct ceph_osd {
 	int o_incarnation;
 	struct rb_node o_node;
 	struct ceph_connection o_con;
-	struct list_head o_requests;
+	struct rb_root o_requests;
 	struct list_head o_linger_requests;
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
 	struct list_head o_keepalive_item;
+	struct mutex lock;
 };
 
 #define CEPH_OSD_SLAB_OPS	2
@@ -144,8 +145,6 @@ struct ceph_osd_request_target {
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
 	struct rb_node  r_node;
-	struct list_head r_req_lru_item;
-	struct list_head r_osd_item;
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd_item;
 	struct ceph_osd *r_osd;
@@ -219,19 +218,16 @@ struct ceph_osd_client {
 	struct ceph_client     *client;
 
 	struct ceph_osdmap     *osdmap;       /* current map */
-	struct rw_semaphore    map_sem;
+	struct rw_semaphore    lock;
 
-	struct mutex           request_mutex;
 	struct rb_root         osds;          /* osds */
 	struct list_head       osd_lru;       /* idle osds */
 	spinlock_t             osd_lru_lock;
-	u64                    last_tid;      /* tid of last request */
-	struct rb_root         requests;      /* pending requests */
-	struct list_head       req_lru;	      /* in-flight lru */
-	struct list_head       req_unsent;    /* unsent/need-resend queue */
-	struct list_head       req_notarget;  /* map to no osd */
 	struct list_head       req_linger;    /* lingering requests */
-	int                    num_requests;
+	struct ceph_osd        homeless_osd;
+	atomic64_t             last_tid;      /* tid of last request */
+	atomic_t               num_requests;
+	atomic_t               num_homeless;
 	struct delayed_work    timeout_work;
 	struct delayed_work    osds_timeout_work;
 #ifdef CONFIG_DEBUG_FS
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 6d3ff713edeb..61dbd9de4650 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -182,21 +182,39 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
 	seq_putc(s, '\n');
 }
 
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		dump_request(s, req);
+	}
+
+	mutex_unlock(&osd->lock);
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
 	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
+	struct rb_node *n;
 
-		req = rb_entry(p, struct ceph_osd_request, r_node);
+	down_read(&osdc->lock);
+	seq_printf(s, "REQUESTS %d homeless %d\n",
+		   atomic_read(&osdc->num_requests),
+		   atomic_read(&osdc->num_homeless));
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 
-		dump_request(s, req);
+		dump_requests(s, osd);
 	}
-	mutex_unlock(&osdc->request_mutex);
+	dump_requests(s, &osdc->homeless_osd);
+
+	up_read(&osdc->lock);
 	return 0;
 }
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d1c8e06f1261..4c856c87b1a9 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -25,16 +25,6 @@ static struct kmem_cache	*ceph_osd_request_cache;
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-					struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -53,6 +43,43 @@ static void __enqueue_request(struct ceph_osd_request *req);
  * channel with an OSD is reset.
  */
 
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+	bool wrlocked = true;
+
+	if (unlikely(down_read_trylock(sem))) {
+		wrlocked = false;
+		up_read(sem);
+	}
+
+	return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	WARN_ON(!(mutex_is_locked(&osd->lock) &&
+		  rwsem_is_locked(&osdc->lock)) &&
+		!rwsem_is_wrlocked(&osdc->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+#endif
+
 /*
  * calculate the mapping of a file extent onto an object, and fill out the
  * request accordingly.  shorten extent as necessary if it crosses an
@@ -336,18 +363,14 @@ static void ceph_osdc_release_request(struct kref *kref)
 	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
 	     req->r_request, req->r_reply);
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-	WARN_ON(!list_empty(&req->r_req_lru_item));
-	WARN_ON(!list_empty(&req->r_osd_item));
 	WARN_ON(!list_empty(&req->r_linger_item));
 	WARN_ON(!list_empty(&req->r_linger_osd_item));
 	WARN_ON(req->r_osd);
 
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
-	if (req->r_reply) {
-		ceph_msg_revoke_incoming(req->r_reply);
+	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	}
 
 	for (which = 0; which < req->r_num_ops; which++)
 		osd_req_op_data_release(req, which);
@@ -418,8 +441,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd_item);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-	INIT_LIST_HEAD(&req->r_osd_item);
 
 	target_init(&req->r_t);
 
@@ -869,141 +890,11 @@ static bool osd_homeless(struct ceph_osd *osd)
 	return osd->o_osd == CEPH_HOMELESS_OSD;
 }
 
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			if (!n->rb_left)
-				return req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
-	struct ceph_osd_client *osdc = req->r_osdc;
-	struct ceph_osd *osd = req->r_osd;
-
-	/*
-	 * Linger requests need to be resent with a new tid to avoid
-	 * the dup op detection logic on the OSDs.  Achieve this with
-	 * a re-register dance instead of open-coding.
-	 */
-	ceph_osdc_get_request(req);
-	if (!list_empty(&req->r_linger_item))
-		__unregister_linger_request(osdc, req);
-	else
-		__unregister_request(osdc, req);
-	__register_request(osdc, req);
-	ceph_osdc_put_request(req);
-
-	/*
-	 * Unless request has been registered as both normal and
-	 * lingering, __unregister{,_linger}_request clears r_osd.
-	 * However, here we need to preserve r_osd to make sure we
-	 * requeue on the same OSD.
-	 */
-	WARN_ON(req->r_osd || !osd);
-	req->r_osd = osd;
-
-	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
-	__enqueue_request(req);
-}
-
-/*
- * Resubmit requests pending on the given osd.
- */
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
-				struct ceph_osd *osd)
-{
-	struct ceph_osd_request *req, *nreq;
-	LIST_HEAD(resend);
-	LIST_HEAD(resend_linger);
-	int err;
-
-	dout("%s osd%d\n", __func__, osd->o_osd);
-	err = __reset_osd(osdc, osd);
-	if (err)
-		return;
-
-	/*
-	 * Build up a list of requests to resend by traversing the
-	 * osd's list of requests.  Requests for a given object are
-	 * sent in tid order, and that is also the order they're
-	 * kept on this list.  Therefore all requests that are in
-	 * flight will be found first, followed by all requests that
-	 * have not yet been sent.  And to resend requests while
-	 * preserving this order we will want to put any sent
-	 * requests back on the front of the osd client's unsent
-	 * list.
-	 *
-	 * So we build a separate ordered list of already-sent
-	 * requests for the affected osd and splice it onto the
-	 * front of the osd client's unsent list.  Once we've seen a
-	 * request that has not yet been sent we're done.  Those
-	 * requests are already sitting right where they belong.
-	 */
-	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-		if (!req->r_sent)
-			break;
-
-		if (!req->r_linger) {
-			dout("%s requeueing %p tid %llu\n", __func__, req,
-			     req->r_tid);
-			list_move_tail(&req->r_req_lru_item, &resend);
-			req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		} else {
-			list_move_tail(&req->r_req_lru_item, &resend_linger);
-		}
-	}
-	list_splice(&resend, &osdc->req_unsent);
-
-	/*
-	 * Both registered and not yet registered linger requests are
-	 * enqueued with a new tid on the same OSD.  We add/move them
-	 * to req_unsent/o_requests at the end to keep things in tid
-	 * order.
-	 */
-	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-				 r_linger_osd_item) {
-		WARN_ON(!list_empty(&req->r_req_lru_item));
-		__kick_linger_request(req);
-	}
-
-	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
-		__kick_linger_request(req);
-}
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static bool osd_registered(struct ceph_osd *osd)
 {
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
+	verify_osdc_locked(osd->o_osdc);
 
-	if (!osd)
-		return;
-	dout("osd_reset osd%d\n", osd->o_osd);
-	osdc = osd->o_osdc;
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	__kick_osd_requests(osdc, osd);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	return !RB_EMPTY_NODE(&osd->o_node);
 }
 
 /*
@@ -1013,17 +904,18 @@ static void osd_init(struct ceph_osd *osd)
 {
 	atomic_set(&osd->o_ref, 1);
 	RB_CLEAR_NODE(&osd->o_node);
-	INIT_LIST_HEAD(&osd->o_requests);
+	osd->o_requests = RB_ROOT;
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	osd->o_incarnation = 1;
+	mutex_init(&osd->lock);
 }
 
 static void osd_cleanup(struct ceph_osd *osd)
 {
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
-	WARN_ON(!list_empty(&osd->o_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
 	WARN_ON(!list_empty(&osd->o_linger_requests));
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
@@ -1077,30 +969,6 @@ static void put_osd(struct ceph_osd *osd)
 
 DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
 
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-	WARN_ON(!list_empty(&osd->o_requests));
-	WARN_ON(!list_empty(&osd->o_linger_requests));
-
-	list_del_init(&osd->o_osd_lru);
-	erase_osd(&osdc->osds, osd);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
-	if (!RB_EMPTY_NODE(&osd->o_node)) {
-		ceph_con_close(&osd->o_con);
-		__remove_osd(osdc, osd);
-		put_osd(osd);
-	}
-}
-
 static void __move_osd_to_lru(struct ceph_osd *osd)
 {
 	struct ceph_osd_client *osdc = osd->o_osdc;
@@ -1117,7 +985,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd)
 
 static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-	if (list_empty(&osd->o_requests) &&
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests))
 		__move_osd_to_lru(osd);
 }
@@ -1134,30 +1002,64 @@ static void __remove_osd_from_lru(struct ceph_osd *osd)
 	spin_unlock(&osdc->osd_lru_lock);
 }
 
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct rb_node *n;
+
+	verify_osdc_wrlocked(osdc);
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	ceph_con_close(&osd->o_con);
+
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		n = rb_next(n); /* unlink_request() */
+
+		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+		unlink_request(osd, req);
+		link_request(&osdc->homeless_osd, req);
+	}
+
+	__remove_osd_from_lru(osd);
+	erase_osd(&osdc->osds, osd);
+	put_osd(osd);
+}
+
 /*
  * reset osd connect
  */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
 {
 	struct ceph_entity_addr *peer_addr;
 
-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-	if (list_empty(&osd->o_requests) &&
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
-		remove_osd(osdc, osd);
+		close_osd(osd);
 		return -ENODEV;
 	}
 
-	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
 	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
 			!ceph_con_opened(&osd->o_con)) {
-		struct ceph_osd_request *req;
+		struct rb_node *n;
 
 		dout("osd addr hasn't changed and connection never opened, "
 		     "letting msgr retry\n");
 		/* touch each r_stamp for handle_timeout()'s benfit */
-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+			struct ceph_osd_request *req =
+			    rb_entry(n, struct ceph_osd_request, r_node);
 			req->r_stamp = jiffies;
+		}
 
 		return -EAGAIN;
 	}
@@ -1169,73 +1071,84 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	return 0;
 }
 
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req)
-{
-	req->r_tid = ++osdc->last_tid;
-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	dout("__register_request %p tid %lld\n", req, req->r_tid);
-	insert_request(&osdc->requests, req);
-	ceph_osdc_get_request(req);
-	osdc->num_requests++;
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+					  bool wrlocked)
 {
-	if (RB_EMPTY_NODE(&req->r_node)) {
-		dout("__unregister_request %p tid %lld not registered\n",
-			req, req->r_tid);
-		return;
-	}
+	struct ceph_osd *osd;
 
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	erase_request(&osdc->requests, req);
-	osdc->num_requests--;
+	if (wrlocked)
+		verify_osdc_wrlocked(osdc);
+	else
+		verify_osdc_locked(osdc);
 
-	if (req->r_osd) {
-		/* make sure the original request isn't in flight. */
-		ceph_msg_revoke(req->r_request);
+	if (o != CEPH_HOMELESS_OSD)
+		osd = lookup_osd(&osdc->osds, o);
+	else
+		osd = &osdc->homeless_osd;
+	if (!osd) {
+		if (!wrlocked)
+			return ERR_PTR(-EAGAIN);
 
-		list_del_init(&req->r_osd_item);
-		maybe_move_osd_to_lru(req->r_osd);
-		if (list_empty(&req->r_linger_osd_item))
-			req->r_osd = NULL;
+		osd = create_osd(osdc, o);
+		insert_osd(&osdc->osds, osd);
+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+			      &osdc->osdmap->osd_addr[osd->o_osd]);
 	}
 
-	list_del_init(&req->r_req_lru_item);
-	ceph_osdc_put_request(req);
+	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+	return osd;
 }
 
 /*
- * Cancel a previously queued request message
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
  */
-static void __cancel_request(struct ceph_osd_request *req)
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-	if (req->r_sent && req->r_osd) {
-		ceph_msg_revoke(req->r_request);
-		req->r_sent = 0;
-	}
+	verify_osd_locked(osd);
+	WARN_ON(!req->r_tid || req->r_osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	if (!osd_homeless(osd))
+		__remove_osd_from_lru(osd);
+	else
+		atomic_inc(&osd->o_osdc->num_homeless);
+
+	get_osd(osd);
+	insert_request(&osd->o_requests, req);
+	req->r_osd = osd;
 }
 
-static void __register_linger_request(struct ceph_osd_client *osdc,
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
+{
+	verify_osd_locked(osd);
+	WARN_ON(req->r_osd != osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	req->r_osd = NULL;
+	erase_request(&osd->o_requests, req);
+	put_osd(osd);
+
+	if (!osd_homeless(osd))
+		maybe_move_osd_to_lru(osd);
+	else
+		atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static void __register_linger_request(struct ceph_osd *osd,
 				    struct ceph_osd_request *req)
 {
 	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
 	WARN_ON(!req->r_linger);
 
 	ceph_osdc_get_request(req);
-	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	if (req->r_osd)
-		list_add_tail(&req->r_linger_osd_item,
-			      &req->r_osd->o_linger_requests);
+	list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger);
+	list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests);
+	__remove_osd_from_lru(osd);
+	req->r_osd = osd;
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -1255,7 +1168,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 	if (req->r_osd) {
 		list_del_init(&req->r_linger_osd_item);
 		maybe_move_osd_to_lru(req->r_osd);
-		if (list_empty(&req->r_osd_item))
+		if (RB_EMPTY_ROOT(&req->r_osd->o_requests))
 			req->r_osd = NULL;
 	}
 	ceph_osdc_put_request(req);
@@ -1291,11 +1204,20 @@ static bool have_pool_full(struct ceph_osd_client *osdc)
 	return false;
 }
 
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+	struct ceph_pg_pool_info *pi;
+
+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+	if (!pi)
+		return false;
+
+	return __pool_full(pi);
+}
+
 /*
  * Returns whether a request should be blocked from being sent
  * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
  */
 static bool target_should_be_paused(struct ceph_osd_client *osdc,
 				    const struct ceph_osd_request_target *t,
@@ -1421,87 +1343,6 @@ out:
 	return ct_res;
 }
 
-static void __enqueue_request(struct ceph_osd_request *req)
-{
-	struct ceph_osd_client *osdc = req->r_osdc;
-
-	dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
-	     req->r_osd ? req->r_osd->o_osd : -1);
-
-	if (req->r_osd) {
-		__remove_osd_from_lru(req->r_osd);
-		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
-		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
-	} else {
-		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
-	}
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.  Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
-			 struct ceph_osd_request *req, int force_resend)
-{
-	enum calc_target_result ct_res;
-	int err;
-
-	dout("map_request %p tid %lld\n", req, req->r_tid);
-
-	ct_res = calc_target(osdc, &req->r_t, NULL, force_resend);
-	switch (ct_res) {
-	case CALC_TARGET_POOL_DNE:
-		list_move(&req->r_req_lru_item, &osdc->req_notarget);
-		return -EIO;
-	case CALC_TARGET_NO_ACTION:
-		return 0;  /* no change */
-	default:
-		BUG_ON(ct_res != CALC_TARGET_NEED_RESEND);
-	}
-
-	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-	     req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd,
-	     req->r_osd ? req->r_osd->o_osd : -1);
-
-	if (req->r_osd) {
-		__cancel_request(req);
-		list_del_init(&req->r_osd_item);
-		list_del_init(&req->r_linger_osd_item);
-		req->r_osd = NULL;
-	}
-
-	req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd);
-	if (!req->r_osd && req->r_t.osd >= 0) {
-		err = -ENOMEM;
-		req->r_osd = create_osd(osdc, req->r_t.osd);
-		if (!req->r_osd) {
-			list_move(&req->r_req_lru_item, &osdc->req_notarget);
-			goto out;
-		}
-
-		dout("map_request osd %p is osd%d\n", req->r_osd,
-		     req->r_osd->o_osd);
-		insert_osd(&osdc->osds, req->r_osd);
-
-		ceph_con_open(&req->r_osd->o_con,
-			      CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd,
-			      &osdc->osdmap->osd_addr[req->r_osd->o_osd]);
-	}
-
-	__enqueue_request(req);
-	err = 1;   /* osd or pg changed */
-
-out:
-	return err;
-}
-
 static void setup_request_data(struct ceph_osd_request *req,
 			       struct ceph_msg *msg)
 {
@@ -1648,8 +1489,16 @@ static void send_request(struct ceph_osd_request *req)
 {
 	struct ceph_osd *osd = req->r_osd;
 
+	verify_osd_locked(osd);
 	WARN_ON(osd->o_osd != req->r_t.osd);
 
+	/*
+	 * We may have a previously queued request message hanging
+	 * around.  Cancel it to avoid corrupting the msgr.
+	 */
+	if (req->r_sent)
+		ceph_msg_revoke(req->r_request);
+
 	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
 	if (req->r_attempts)
 		req->r_flags |= CEPH_OSD_FLAG_RETRY;
@@ -1671,24 +1520,11 @@ static void send_request(struct ceph_osd_request *req)
 	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
 }
 
-/*
- * Send any requests in the queue (req_unsent).
- */
-static void __send_queued(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd_request *req, *tmp;
-
-	dout("__send_queued\n");
-	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
-		list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
-		send_request(req);
-	}
-}
-
 static void maybe_request_map(struct ceph_osd_client *osdc)
 {
 	bool continuous = false;
 
+	verify_osdc_locked(osdc);
 	WARN_ON(!osdc->osdmap->epoch);
 
 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
@@ -1705,38 +1541,121 @@ static void maybe_request_map(struct ceph_osd_client *osdc)
 		ceph_monc_renew_subs(&osdc->client->monc);
 }
 
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				     struct ceph_osd_request *req,
-				     bool nofail)
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 {
-	int rc;
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd *osd;
+	bool need_send = false;
+	bool promoted = false;
 
-	__register_request(osdc, req);
-	req->r_sent = 0;
-	req->r_got_reply = 0;
-	rc = __map_request(osdc, req, 0);
-	if (rc < 0) {
-		if (nofail) {
-			dout("osdc_start_request failed map, "
-				" will retry %lld\n", req->r_tid);
-			rc = 0;
-		} else {
-			__unregister_request(osdc, req);
-		}
-		return rc;
+	WARN_ON(req->r_tid || req->r_got_reply);
+	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+	calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+	if (IS_ERR(osd)) {
+		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+		goto promote;
 	}
 
-	if (req->r_osd == NULL) {
-		dout("send_request %p no up osds in pg\n", req);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
+		dout("req %p pausewr\n", req);
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+		   ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
+		dout("req %p pauserd\n", req);
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+		   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+				     CEPH_OSD_FLAG_FULL_FORCE)) &&
+		   (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+		    pool_full(osdc, req->r_t.base_oloc.pool))) {
+		dout("req %p full/pool_full\n", req);
+		pr_warn_ratelimited("FULL or reached pool quota\n");
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if (!osd_homeless(osd)) {
+		need_send = true;
 	} else {
-		__send_queued(osdc);
+		maybe_request_map(osdc);
 	}
 
-	return 0;
+	mutex_lock(&osd->lock);
+	/*
+	 * Assign the tid atomically with send_request() to protect
+	 * multiple writes to the same object from racing with each
+	 * other, resulting in out of order ops on the OSDs.
+	 */
+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
+	link_request(osd, req);
+	if (need_send)
+		send_request(req);
+	mutex_unlock(&osd->lock);
+
+	if (promoted)
+		downgrade_write(&osdc->lock);
+	return;
+
+promote:
+	up_read(&osdc->lock);
+	down_write(&osdc->lock);
+	wrlocked = true;
+	promoted = true;
+	goto again;
+}
+
+static void account_request(struct ceph_osd_request *req)
+{
+	unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+	if (req->r_flags & CEPH_OSD_FLAG_READ) {
+		WARN_ON(req->r_flags & mask);
+		req->r_flags |= CEPH_OSD_FLAG_ACK;
+	} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		WARN_ON(!(req->r_flags & mask));
+	else
+		WARN_ON(1);
+
+	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+	atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
+	ceph_osdc_get_request(req);
+	account_request(req);
+	__submit_request(req, wrlocked);
+}
+
+static void __finish_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd *osd = req->r_osd;
+
+	verify_osd_locked(osd);
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+	unlink_request(osd, req);
+	atomic_dec(&osdc->num_requests);
+
+	/*
+	 * If an OSD has failed or returned and a request has been sent
+	 * twice, it's possible to get a reply and end up here while the
+	 * request message is queued for delivery.  We will ignore the
+	 * reply, so not a big deal, but better to try and catch it.
+	 */
+	ceph_msg_revoke(req->r_request);
+	ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+	__finish_request(req);
+	ceph_osdc_put_request(req);
 }
 
 static void __complete_request(struct ceph_osd_request *req)
@@ -1747,6 +1666,13 @@ static void __complete_request(struct ceph_osd_request *req)
 		complete_all(&req->r_completion);
 }
 
+static void cancel_request(struct ceph_osd_request *req)
+{
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+	finish_request(req);
+}
+
 /*
  * Timeout callback, called every N seconds.  When 1 or more OSD
  * requests has been active for more than N seconds, we send a keepalive
@@ -1758,44 +1684,49 @@ static void handle_timeout(struct work_struct *work)
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_options *opts = osdc->client->options;
-	struct ceph_osd_request *req;
-	struct ceph_osd *osd;
-	struct list_head slow_osds;
-	dout("timeout\n");
-	down_read(&osdc->map_sem);
-
-	ceph_monc_request_next_osdmap(&osdc->client->monc);
+	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+	LIST_HEAD(slow_osds);
+	struct rb_node *n, *p;
 
-	mutex_lock(&osdc->request_mutex);
+	dout("%s osdc %p\n", __func__, osdc);
+	down_write(&osdc->lock);
 
 	/*
 	 * ping osds that are a bit slow.  this ensures that if there
 	 * is a break in the TCP connection we will notice, and reopen
 	 * a connection with that osd (from the fault callback).
 	 */
-	INIT_LIST_HEAD(&slow_osds);
-	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies,
-				req->r_stamp + opts->osd_keepalive_timeout))
-			break;
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+		bool found = false;
+
+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			if (time_before(req->r_stamp, cutoff)) {
+				dout(" req %p tid %llu on osd%d is laggy\n",
+				     req, req->r_tid, osd->o_osd);
+				found = true;
+			}
+		}
 
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		dout(" tid %llu is slow, will send keepalive on osd%d\n",
-		     req->r_tid, osd->o_osd);
-		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+		if (found)
+			list_move_tail(&osd->o_keepalive_item, &slow_osds);
 	}
+
+	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+		maybe_request_map(osdc);
+
 	while (!list_empty(&slow_osds)) {
-		osd = list_entry(slow_osds.next, struct ceph_osd,
-				 o_keepalive_item);
+		struct ceph_osd *osd = list_first_entry(&slow_osds,
+							struct ceph_osd,
+							o_keepalive_item);
 		list_del_init(&osd->o_keepalive_item);
 		ceph_con_keepalive(&osd->o_con);
 	}
 
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-
+	up_write(&osdc->lock);
 	schedule_delayed_work(&osdc->timeout_work,
 			      osdc->client->options->osd_keepalive_timeout);
 }
@@ -1809,18 +1740,17 @@ static void handle_osds_timeout(struct work_struct *work)
 	struct ceph_osd *osd, *nosd;
 
 	dout("%s osdc %p\n", __func__, osdc);
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-
+	down_write(&osdc->lock);
 	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
 		if (time_before(jiffies, osd->lru_ttl))
 			break;
 
-		remove_osd(osdc, osd);
+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+		WARN_ON(!list_empty(&osd->o_linger_requests));
+		close_osd(osd);
 	}
 
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	up_write(&osdc->lock);
 	schedule_delayed_work(&osdc->osds_timeout_work,
 			      round_jiffies_relative(delay));
 }
@@ -2045,8 +1975,9 @@ static bool done_request(const struct ceph_osd_request *req,
  * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
  *				r_safe_completion	r_safe_completion
  */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
+	struct ceph_osd_client *osdc = osd->o_osdc;
 	struct ceph_osd_request *req;
 	struct MOSDOpReply m;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
@@ -2057,14 +1988,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 	dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	req = lookup_request(&osdc->requests, tid);
+	down_read(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
+		goto out_unlock_osdc;
+	}
+	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
+
+	mutex_lock(&osd->lock);
+	req = lookup_request(&osd->o_requests, tid);
 	if (!req) {
-		dout("%s no tid %llu\n", __func__, tid);
-		goto out_unlock;
+		dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+		goto out_unlock_session;
 	}
-	ceph_osdc_get_request(req);
 
 	ret = decode_MOSDOpReply(msg, &m);
 	if (ret) {
@@ -2083,7 +2019,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
 			     req, req->r_tid, m.retry_attempt,
 			     req->r_attempts - 1);
-			goto out_put;
+			goto out_unlock_session;
 		}
 	} else {
 		WARN_ON(1); /* MOSDOpReply v4 is assumed */
@@ -2092,22 +2028,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	if (!ceph_oloc_empty(&m.redirect.oloc)) {
 		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
 		     m.redirect.oloc.pool);
-		__unregister_request(osdc, req);
+		unlink_request(osd, req);
+		mutex_unlock(&osd->lock);
 
 		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
-
-		/*
-		 * Start redirect requests with nofail=true.  If
-		 * mapping fails, request will end up on the notarget
-		 * list, waiting for the new osdmap (which can take
-		 * a while), even though the original request mapped
-		 * successfully.  In the future we might want to follow
-		 * original request's nofail setting here.
-		 */
-		ret = __ceph_osdc_start_request(osdc, req, true);
-		BUG_ON(ret);
-
-		goto out_put;
+		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+		req->r_tid = 0;
+		__submit_request(req, false);
+		goto out_unlock_osdc;
 	}
 
 	if (m.num_ops != req->r_num_ops) {
@@ -2137,19 +2065,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		req->r_got_reply = true;
 	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
 		dout("req %p tid %llu dup ack\n", req, req->r_tid);
-		goto out_put;
+		goto out_unlock_session;
 	}
 
 	if (done_request(req, &m)) {
-		__unregister_request(osdc, req);
+		__finish_request(req);
 		if (req->r_linger) {
 			WARN_ON(req->r_unsafe_callback);
-			__register_linger_request(osdc, req);
+			__register_linger_request(osd, req);
 		}
 	}
 
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	mutex_unlock(&osd->lock);
+	up_read(&osdc->lock);
 
 	if (done_request(req, &m)) {
 		if (already_acked && req->r_unsafe_callback) {
@@ -2175,14 +2103,13 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 fail_request:
 	req->r_result = -EIO;
-	__unregister_request(osdc, req);
+	__finish_request(req);
 	__complete_request(req);
 	complete_all(&req->r_safe_completion);
-out_put:
-	ceph_osdc_put_request(req);
-out_unlock:
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+out_unlock_session:
+	mutex_unlock(&osd->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
 }
 
 static void set_pool_was_full(struct ceph_osd_client *osdc)
@@ -2197,126 +2124,66 @@ static void set_pool_was_full(struct ceph_osd_client *osdc)
 	}
 }
 
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
 {
-	struct rb_node *p, *n;
+	struct ceph_pg_pool_info *pi;
 
-	dout("%s %p\n", __func__, osdc);
-	for (p = rb_first(&osdc->osds); p; p = n) {
-		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+	if (!pi)
+		return false;
 
-		n = rb_next(p);
-		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-		    memcmp(&osd->o_con.peer_addr,
-			   ceph_osd_addr(osdc->osdmap,
-					 osd->o_osd),
-			   sizeof(struct ceph_entity_addr)) != 0)
-			__reset_osd(osdc, osd);
-	}
+	return pi->was_full && !__pool_full(pi);
 }
 
 /*
- * Requeue requests whose mapping to an OSD has changed.  If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
  */
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
-			  bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+			  bool force_resend,
+			  bool cleared_full,
+			  bool check_pool_cleared_full,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
 {
-	struct ceph_osd_request *req, *nreq;
-	struct rb_node *p;
-	int needmap = 0;
-	int err;
-	bool force_resend_req;
-
-	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
-		force_resend_writes ? " (force resend writes)" : "");
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; ) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-		p = rb_next(p);
-
-		/*
-		 * For linger requests that have not yet been
-		 * registered, move them to the linger list; they'll
-		 * be sent to the osd in the loop below.  Unregister
-		 * the request before re-registering it as a linger
-		 * request to ensure the __map_request() below
-		 * will decide it needs to be sent.
-		 */
-		if (req->r_linger && list_empty(&req->r_linger_item)) {
-			dout("%p tid %llu restart on osd%d\n",
-			     req, req->r_tid,
-			     req->r_osd ? req->r_osd->o_osd : -1);
-			ceph_osdc_get_request(req);
-			__unregister_request(osdc, req);
-			__register_linger_request(osdc, req);
-			ceph_osdc_put_request(req);
-			continue;
-		}
-
-		force_resend_req = force_resend ||
-			(force_resend_writes &&
-				req->r_flags & CEPH_OSD_FLAG_WRITE);
-		err = __map_request(osdc, req, force_resend_req);
-		if (err < 0)
-			continue;  /* error */
-		if (req->r_osd == NULL) {
-			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
-			needmap++;  /* request a newer map */
-		} else if (err > 0) {
-			if (!req->r_linger) {
-				dout("%p tid %llu requeued on osd%d\n", req,
-				     req->r_tid,
-				     req->r_osd ? req->r_osd->o_osd : -1);
-				req->r_flags |= CEPH_OSD_FLAG_RETRY;
-			}
-		}
-	}
-
-	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
-				 r_linger_item) {
-		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
-		err = __map_request(osdc, req,
-				    force_resend || force_resend_writes);
-		dout("__map_request returned %d\n", err);
-		if (err < 0)
-			continue;  /* hrm! */
-		if (req->r_osd == NULL || err > 0) {
-			if (req->r_osd == NULL) {
-				dout("lingering %p tid %llu maps to no osd\n",
-				     req, req->r_tid);
-				/*
-				 * A homeless lingering request makes
-				 * no sense, as it's job is to keep
-				 * a particular OSD connection open.
-				 * Request a newer map and kick the
-				 * request, knowing that it won't be
-				 * resent until we actually get a map
-				 * that can tell us where to send it.
-				 */
-				needmap++;
-			}
-
-			dout("kicking lingering %p tid %llu osd%d\n", req,
-			     req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
-			__register_request(osdc, req);
-			__unregister_linger_request(osdc, req);
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct rb_node *n;
+	bool force_resend_writes;
+
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+		enum calc_target_result ct_res;
+
+		n = rb_next(n); /* unlink_request() */
+
+		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+		ct_res = calc_target(osdc, &req->r_t,
+				     &req->r_last_force_resend, false);
+		switch (ct_res) {
+		case CALC_TARGET_NO_ACTION:
+			force_resend_writes = cleared_full ||
+			    (check_pool_cleared_full &&
+			     pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+			if (!force_resend &&
+			    (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+			     !force_resend_writes))
+				break;
+
+			/* fall through */
+		case CALC_TARGET_NEED_RESEND:
+			unlink_request(osd, req);
+			insert_request(need_resend, req);
+			break;
+		case CALC_TARGET_POOL_DNE:
+			break;
 		}
 	}
-	reset_changed_osds(osdc);
-	mutex_unlock(&osdc->request_mutex);
-
-	if (needmap) {
-		dout("%d requests for down osds, need new map\n", needmap);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-	}
 }
 
 static int handle_one_map(struct ceph_osd_client *osdc,
-			  void *p, void *end, bool incremental)
+			  void *p, void *end, bool incremental,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
 {
 	struct ceph_osdmap *newmap;
 	struct rb_node *n;
@@ -2362,11 +2229,51 @@ static int handle_one_map(struct ceph_osd_client *osdc,
 	}
 
 	was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
-	kick_requests(osdc, skipped_map, was_full);
+	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+		      need_resend, need_resend_linger);
+
+	for (n = rb_first(&osdc->osds); n; ) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		n = rb_next(n); /* close_osd() */
+
+		scan_requests(osd, skipped_map, was_full, true, need_resend,
+			      need_resend_linger);
+		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+		    memcmp(&osd->o_con.peer_addr,
+			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
+			   sizeof(struct ceph_entity_addr)))
+			close_osd(osd);
+	}
 
 	return 0;
 }
 
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(need_resend); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+		struct ceph_osd *osd;
+
+		n = rb_next(n);
+		erase_request(need_resend, req); /* before link_request() */
+
+		WARN_ON(req->r_osd);
+		calc_target(osdc, &req->r_t, NULL, false);
+		osd = lookup_create_osd(osdc, req->r_t.osd, true);
+		link_request(osd, req);
+		if (!req->r_linger) {
+			if (!osd_homeless(osd) && !req->r_t.paused)
+				send_request(req);
+		}
+	}
+}
+
 /*
  * Process updated osd map.
  *
@@ -2381,13 +2288,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	u32 nr_maps, maplen;
 	u32 epoch;
 	struct ceph_fsid fsid;
+	struct rb_root need_resend = RB_ROOT;
+	LIST_HEAD(need_resend_linger);
 	bool handled_incremental = false;
 	bool was_pauserd, was_pausewr;
 	bool pauserd, pausewr;
 	int err;
 
 	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
-	down_write(&osdc->map_sem);
+	down_write(&osdc->lock);
 
 	/* verify fsid */
 	ceph_decode_need(&p, end, sizeof(fsid), bad);
@@ -2412,7 +2321,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		    osdc->osdmap->epoch + 1 == epoch) {
 			dout("applying incremental map %u len %d\n",
 			     epoch, maplen);
-			err = handle_one_map(osdc, p, p + maplen, true);
+			err = handle_one_map(osdc, p, p + maplen, true,
+					     &need_resend, &need_resend_linger);
 			if (err)
 				goto bad;
 			handled_incremental = true;
@@ -2443,7 +2353,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			     osdc->osdmap->epoch);
 		} else {
 			dout("taking full map %u len %d\n", epoch, maplen);
-			err = handle_one_map(osdc, p, p + maplen, false);
+			err = handle_one_map(osdc, p, p + maplen, false,
+					     &need_resend, &need_resend_linger);
 			if (err)
 				goto bad;
 		}
@@ -2464,20 +2375,60 @@ done:
 	if (was_pauserd || was_pausewr || pauserd || pausewr)
 		maybe_request_map(osdc);
 
-	mutex_lock(&osdc->request_mutex);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
+	kick_requests(osdc, &need_resend, &need_resend_linger);
 
 	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
 			  osdc->osdmap->epoch);
-	up_write(&osdc->map_sem);
+	up_write(&osdc->lock);
 	wake_up_all(&osdc->client->auth_wq);
 	return;
 
 bad:
 	pr_err("osdc handle_map corrupt msg\n");
 	ceph_msg_dump(msg);
-	up_write(&osdc->map_sem);
+	up_write(&osdc->lock);
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ */
+static void kick_osd_requests(struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		if (!req->r_linger) {
+			if (!req->r_t.paused)
+				send_request(req);
+		}
+	}
+}
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	down_write(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
+		goto out_unlock;
+	}
+
+	if (!reopen_osd(osd))
+		kick_osd_requests(osd);
+	maybe_request_map(osdc);
+
+out_unlock:
+	up_write(&osdc->lock);
 }
 
 /*
@@ -2680,17 +2631,11 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 			    struct ceph_osd_request *req,
 			    bool nofail)
 {
-	int rc;
-
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-
-	rc = __ceph_osdc_start_request(osdc, req, nofail);
+	down_read(&osdc->lock);
+	submit_request(req, false);
+	up_read(&osdc->lock);
 
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
@@ -2703,13 +2648,12 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
 
-	mutex_lock(&osdc->request_mutex);
+	down_write(&osdc->lock);
 	if (req->r_linger)
 		__unregister_linger_request(osdc, req);
-	__unregister_request(osdc, req);
-	mutex_unlock(&osdc->request_mutex);
-
-	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+	if (req->r_osd)
+		cancel_request(req);
+	up_write(&osdc->lock);
 }
 EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
@@ -2744,32 +2688,40 @@ EXPORT_SYMBOL(ceph_osdc_wait_request);
  */
 void ceph_osdc_sync(struct ceph_osd_client *osdc)
 {
-	struct ceph_osd_request *req;
-	u64 last_tid, next_tid = 0;
+	struct rb_node *n, *p;
+	u64 last_tid = atomic64_read(&osdc->last_tid);
 
-	mutex_lock(&osdc->request_mutex);
-	last_tid = osdc->last_tid;
-	while (1) {
-		req = __lookup_request_ge(osdc, next_tid);
-		if (!req)
-			break;
-		if (req->r_tid > last_tid)
-			break;
+again:
+	down_read(&osdc->lock);
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		mutex_lock(&osd->lock);
+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			if (req->r_tid > last_tid)
+				break;
+
+			if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+				continue;
 
-		next_tid = req->r_tid + 1;
-		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-			continue;
+			ceph_osdc_get_request(req);
+			mutex_unlock(&osd->lock);
+			up_read(&osdc->lock);
+			dout("%s waiting on req %p tid %llu last_tid %llu\n",
+			     __func__, req, req->r_tid, last_tid);
+			wait_for_completion(&req->r_safe_completion);
+			ceph_osdc_put_request(req);
+			goto again;
+		}
 
-		ceph_osdc_get_request(req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("sync waiting on tid %llu (last is %llu)\n",
-		     req->r_tid, last_tid);
-		wait_for_completion(&req->r_safe_completion);
-		mutex_lock(&osdc->request_mutex);
-		ceph_osdc_put_request(req);
+		mutex_unlock(&osd->lock);
 	}
-	mutex_unlock(&osdc->request_mutex);
-	dout("sync done (thru tid %llu)\n", last_tid);
+
+	up_read(&osdc->lock);
+	dout("%s done last_tid %llu\n", __func__, last_tid);
 }
 EXPORT_SYMBOL(ceph_osdc_sync);
 
@@ -2793,18 +2745,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
 	dout("init\n");
 	osdc->client = client;
-	init_rwsem(&osdc->map_sem);
-	mutex_init(&osdc->request_mutex);
-	osdc->last_tid = 0;
+	init_rwsem(&osdc->lock);
 	osdc->osds = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->osd_lru);
 	spin_lock_init(&osdc->osd_lru_lock);
-	osdc->requests = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->req_lru);
-	INIT_LIST_HEAD(&osdc->req_unsent);
-	INIT_LIST_HEAD(&osdc->req_notarget);
 	INIT_LIST_HEAD(&osdc->req_linger);
-	osdc->num_requests = 0;
+	osd_init(&osdc->homeless_osd);
+	osdc->homeless_osd.o_osdc = osdc;
+	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
 	spin_lock_init(&osdc->event_lock);
@@ -2861,13 +2809,19 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	cancel_delayed_work_sync(&osdc->timeout_work);
 	cancel_delayed_work_sync(&osdc->osds_timeout_work);
 
-	mutex_lock(&osdc->request_mutex);
+	down_write(&osdc->lock);
 	while (!RB_EMPTY_ROOT(&osdc->osds)) {
 		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
 						struct ceph_osd, o_node);
-		remove_osd(osdc, osd);
+		close_osd(osd);
 	}
-	mutex_unlock(&osdc->request_mutex);
+	up_write(&osdc->lock);
+	WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+	osd_cleanup(&osdc->homeless_osd);
+
+	WARN_ON(!list_empty(&osdc->osd_lru));
+	WARN_ON(atomic_read(&osdc->num_requests));
+	WARN_ON(atomic_read(&osdc->num_homeless));
 
 	ceph_osdmap_destroy(osdc->osdmap);
 	mempool_destroy(osdc->req_mempool);
@@ -2982,19 +2936,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
+	struct ceph_osd_client *osdc = osd->o_osdc;
 	int type = le16_to_cpu(msg->hdr.type);
 
-	if (!osd)
-		goto out;
-	osdc = osd->o_osdc;
-
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 		ceph_osdc_handle_map(osdc, msg);
 		break;
 	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg);
+		handle_reply(osd, msg);
 		break;
 	case CEPH_MSG_WATCH_NOTIFY:
 		handle_watch_notify(osdc, msg);
@@ -3004,7 +2954,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		pr_err("received unknown message type %d %s\n", type,
 		       ceph_msg_type_name(type));
 	}
-out:
+
 	ceph_msg_put(msg);
 }
 
@@ -3019,21 +2969,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc = osd->o_osdc;
-	struct ceph_msg *m;
+	struct ceph_msg *m = NULL;
 	struct ceph_osd_request *req;
 	int front_len = le32_to_cpu(hdr->front_len);
 	int data_len = le32_to_cpu(hdr->data_len);
-	u64 tid;
+	u64 tid = le64_to_cpu(hdr->tid);
 
-	tid = le64_to_cpu(hdr->tid);
-	mutex_lock(&osdc->request_mutex);
-	req = lookup_request(&osdc->requests, tid);
+	down_read(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+		*skip = 1;
+		goto out_unlock_osdc;
+	}
+	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+	mutex_lock(&osd->lock);
+	req = lookup_request(&osd->o_requests, tid);
 	if (!req) {
 		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
 		     osd->o_osd, tid);
-		m = NULL;
 		*skip = 1;
-		goto out;
+		goto out_unlock_session;
 	}
 
 	ceph_msg_revoke_incoming(req->r_reply);
@@ -3045,7 +3001,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
 				 false);
 		if (!m)
-			goto out;
+			goto out_unlock_session;
 		ceph_msg_put(req->r_reply);
 		req->r_reply = m;
 	}
@@ -3056,14 +3012,16 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 			req->r_reply->data_length);
 		m = NULL;
 		*skip = 1;
-		goto out;
+		goto out_unlock_session;
 	}
 
 	m = ceph_msg_get(req->r_reply);
 	dout("get_reply tid %lld %p\n", tid, m);
 
-out:
-	mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+	mutex_unlock(&osd->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
 	return m;
 }
 
@@ -3083,8 +3041,8 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:
-		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-			osd->o_osd);
+		pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+			osd->o_osd, type);
 		*skip = 1;
 		return NULL;
 	}
@@ -3188,5 +3146,5 @@ static const struct ceph_connection_operations osd_con_ops = {
 	.alloc_msg = alloc_msg,
 	.sign_message = osd_sign_message,
 	.check_message_signature = osd_check_message_signature,
-	.fault = osd_reset,
+	.fault = osd_fault,
 };
-- 
cgit v1.2.3


From 922dab6134178cae317ae00de86376cba59f3147 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 26 May 2016 01:15:02 +0200
Subject: libceph, rbd: ceph_osd_linger_request, watch/notify v2

This adds support and switches rbd to a new, more reliable version of
watch/notify protocol.  As with the OSD client update, this is mostly
about getting the right structures linked into the right places so that
reconnects are properly sent when needed.  watch/notify v2 also
requires sending regular pings to the OSDs - send_linger_ping().

A major change from the old watch/notify implementation is the
introduction of ceph_osd_linger_request - linger requests no longer
piggy back on ceph_osd_request.  ceph_osd_event has been merged into
ceph_osd_linger_request.

All the details are now hidden within libceph, the interface consists
of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack().
ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep
the lifetime management simple.

ceph_osdc_notify_ack() accepts an optional data payload, which is
relayed back to the notifier.

Portions of this patch are loosely based on work by Douglas Fuller
<dfuller@redhat.com> and Mike Christie <michaelc@cs.wisc.edu>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  179 ++----
 include/linux/ceph/ceph_fs.h    |    5 +-
 include/linux/ceph/osd_client.h |   97 ++--
 include/linux/ceph/rados.h      |   17 +-
 net/ceph/ceph_strings.c         |   16 +
 net/ceph/debugfs.c              |   36 ++
 net/ceph/osd_client.c           | 1148 ++++++++++++++++++++++++++++++---------
 7 files changed, 1067 insertions(+), 431 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fce23dc908e3..d0834c477f96 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -351,11 +351,11 @@ struct rbd_device {
 	struct rbd_options	*opts;
 
 	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
 
 	struct ceph_file_layout	layout;
 
-	struct ceph_osd_event   *watch_event;
-	struct rbd_obj_request	*watch_request;
+	struct ceph_osd_linger_request *watch_handle;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-					unsigned long timeout)
-{
-	return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = NULL;
@@ -1877,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 	case CEPH_OSD_OP_CALL:
 		rbd_osd_call_callback(obj_request);
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request);
-		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu",
 			obj_request->object_name, (unsigned short) opcode);
@@ -3100,45 +3084,18 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-	struct rbd_obj_request *obj_request;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	int ret;
-
-	obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
-							OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return -ENOMEM;
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-	ret = -ENOMEM;
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-						  obj_request);
-	if (!obj_request->osd_req)
-		goto out;
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-					notify_id, 0, 0);
-	rbd_osd_req_format_read(obj_request);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
-	ret = rbd_obj_request_wait(obj_request);
-out:
-	rbd_obj_request_put(obj_request);
-
-	return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+			 u64 notifier_id, void *data, size_t data_len)
 {
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = arg;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-		rbd_dev->header_oid.name, (unsigned long long)notify_id,
-		(unsigned int)opcode);
+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+	     cookie, notify_id);
 
 	/*
 	 * Until adequate refresh error handling is in place, there is
@@ -3150,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   NULL, 0);
 	if (ret)
 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-						struct rbd_device *rbd_dev,
-						bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct ceph_options *opts = osdc->client->options;
-	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev = arg;
 	int ret;
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return ERR_PTR(-ENOMEM);
-
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, watch);
-	rbd_osd_req_format_write(obj_request);
-
-	if (watch)
-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-	if (ret)
-		goto out;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
-	ret = obj_request->result;
+	ret = rbd_dev_header_watch_sync(rbd_dev);
 	if (ret) {
-		if (watch)
-			rbd_obj_request_end(obj_request);
-		goto out;
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		return;
 	}
 
-	return obj_request;
-
-out:
-	rbd_obj_request_put(obj_request);
-	return ERR_PTR(ret);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3215,57 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_obj_request *obj_request;
-	int ret;
+	struct ceph_osd_linger_request *handle;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
-
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
-
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-	if (IS_ERR(obj_request)) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-		return PTR_ERR(obj_request);
-	}
+	rbd_assert(!rbd_dev->watch_handle);
 
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to it.
-	 * We'll drop that reference after we've unregistered it in
-	 * rbd_dev_header_unwatch_sync().
-	 */
-	rbd_dev->watch_request = obj_request;
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+				 &rbd_dev->header_oloc, rbd_watch_cb,
+				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
+	rbd_dev->watch_handle = handle;
 	return 0;
 }
 
 static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-	struct rbd_obj_request *obj_request;
-
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	rbd_obj_request_end(rbd_dev->watch_request);
-	rbd_obj_request_put(rbd_dev->watch_request);
-	rbd_dev->watch_request = NULL;
+	if (!rbd_dev->watch_handle)
+		return;
 
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-	if (!IS_ERR(obj_request))
-		rbd_obj_request_put(obj_request);
-	else
-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-			 PTR_ERR(obj_request));
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
 
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+	rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -4081,6 +3982,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	init_rwsem(&rbd_dev->header_rwsem);
 
 	ceph_oid_init(&rbd_dev->header_oid);
+	ceph_oloc_init(&rbd_dev->header_oloc);
 
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
@@ -5285,6 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	if (rbd_dev->image_format == 1)
 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
 				       spec->image_name, RBD_SUFFIX);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..3b911ff889dd 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
 
 /* watch-notify operations */
 enum {
-  WATCH_NOTIFY				= 1, /* notifying watcher */
-  WATCH_NOTIFY_COMPLETE			= 2, /* notifier notified when done */
+	CEPH_WATCH_EVENT_NOTIFY		  = 1, /* notifying watcher */
+	CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
+	CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
 };
 
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 342f22f1f040..cd2dcb8939de 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -34,7 +34,7 @@ struct ceph_osd {
 	struct rb_node o_node;
 	struct ceph_connection o_con;
 	struct rb_root o_requests;
-	struct list_head o_linger_requests;
+	struct rb_root o_linger_requests;
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
@@ -108,11 +108,12 @@ struct ceph_osd_req_op {
 		} cls;
 		struct {
 			u64 cookie;
-			u64 ver;
-			u32 prot_ver;
-			u32 timeout;
-			__u8 flag;
+			__u8 op;           /* CEPH_OSD_WATCH_OP_ */
+			u32 gen;
 		} watch;
+		struct {
+			struct ceph_osd_data request_data;
+		} notify_ack;
 		struct {
 			u64 expected_object_size;
 			u64 expected_write_size;
@@ -145,8 +146,6 @@ struct ceph_osd_request_target {
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
 	struct rb_node  r_node;
-	struct list_head r_linger_item;
-	struct list_head r_linger_osd_item;
 	struct ceph_osd *r_osd;
 
 	struct ceph_osd_request_target r_t;
@@ -162,7 +161,6 @@ struct ceph_osd_request {
 
 	int               r_result;
 	bool              r_got_reply;
-	int		  r_linger;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
@@ -181,6 +179,7 @@ struct ceph_osd_request {
 	struct ceph_snap_context *r_snapc;    /* for writes */
 	struct timespec r_mtime;              /* ditto */
 	u64 r_data_offset;                    /* ditto */
+	bool r_linger;                        /* don't resend on failure */
 
 	/* internal */
 	unsigned long r_stamp;                /* jiffies, send or check time */
@@ -195,23 +194,40 @@ struct ceph_request_redirect {
 	struct ceph_object_locator oloc;
 };
 
-struct ceph_osd_event {
-	u64 cookie;
-	int one_shot;
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+				 u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
 	struct ceph_osd_client *osdc;
-	void (*cb)(u64, u64, u8, void *);
-	void *data;
-	struct rb_node node;
-	struct list_head osd_node;
+	u64 linger_id;
+	bool committed;
+
+	struct ceph_osd *osd;
+	struct ceph_osd_request *reg_req;
+	struct ceph_osd_request *ping_req;
+	unsigned long ping_sent;
+
+	struct ceph_osd_request_target t;
+	u32 last_force_resend;
+
+	struct timespec mtime;
+
 	struct kref kref;
-};
+	struct mutex lock;
+	struct rb_node node;            /* osd */
+	struct rb_node osdc_node;       /* osdc */
+	struct list_head scan_item;
+
+	struct completion reg_commit_wait;
+	int reg_commit_error;
+	int last_error;
+
+	u32 register_gen;
 
-struct ceph_osd_event_work {
-	struct work_struct work;
-	struct ceph_osd_event *event;
-        u64 ver;
-        u64 notify_id;
-        u8 opcode;
+	rados_watchcb2_t wcb;
+	rados_watcherrcb_t errcb;
+	void *data;
 };
 
 struct ceph_osd_client {
@@ -223,9 +239,10 @@ struct ceph_osd_client {
 	struct rb_root         osds;          /* osds */
 	struct list_head       osd_lru;       /* idle osds */
 	spinlock_t             osd_lru_lock;
-	struct list_head       req_linger;    /* lingering requests */
 	struct ceph_osd        homeless_osd;
 	atomic64_t             last_tid;      /* tid of last request */
+	u64                    last_linger_id;
+	struct rb_root         linger_requests; /* lingering requests */
 	atomic_t               num_requests;
 	atomic_t               num_homeless;
 	struct delayed_work    timeout_work;
@@ -239,10 +256,6 @@ struct ceph_osd_client {
 	struct ceph_msgpool	msgpool_op;
 	struct ceph_msgpool	msgpool_op_reply;
 
-	spinlock_t		event_lock;
-	struct rb_root		event_tree;
-	u64			event_count;
-
 	struct workqueue_struct	*notify_wq;
 };
 
@@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 				 u16 opcode, const char *name, const void *value,
 				 size_t size, u8 cmp_op, u8 cmp_mode);
-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-					unsigned int which, u16 opcode,
-					u64 cookie, u64 version, int flag);
 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				       unsigned int which,
 				       u64 expected_object_size,
@@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      u32 truncate_seq, u64 truncate_size,
 				      bool use_mempool);
 
-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-					 struct ceph_osd_request *req);
-
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
@@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				struct timespec *mtime,
 				struct page **pages, int nr_pages);
 
-/* watch/notify events */
-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-				  void (*event_cb)(u64, u64, u8, void *),
-				  void *data, struct ceph_osd_event **pevent);
-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+		struct ceph_object_id *oid,
+		struct ceph_object_locator *oloc,
+		rados_watchcb2_t wcb,
+		rados_watcherrcb_t errcb,
+		void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+		      struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+			 struct ceph_object_id *oid,
+			 struct ceph_object_locator *oloc,
+			 u64 notify_id,
+			 u64 cookie,
+			 void *payload,
+			 size_t payload_len);
 #endif
 
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 28740a58f32c..204c8c944703 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -427,7 +427,17 @@ enum {
 	CEPH_OSD_CMPXATTR_MODE_U64    = 2
 };
 
-#define RADOS_NOTIFY_VER	1
+enum {
+	CEPH_OSD_WATCH_OP_UNWATCH = 0,
+	CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+	/* note: use only ODD ids to prevent pre-giant code from
+	   interpreting the op as UNWATCH */
+	CEPH_OSD_WATCH_OP_WATCH = 3,
+	CEPH_OSD_WATCH_OP_RECONNECT = 5,
+	CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
 
 /*
  * an individual object operation.  each may be accompanied by some data
@@ -462,8 +472,9 @@ struct ceph_osd_op {
 	        } __attribute__ ((packed)) snap;
 		struct {
 			__le64 cookie;
-			__le64 ver;
-			__u8 flag;	/* 0 = unwatch, 1 = watch */
+			__le64 ver;     /* no longer used */
+			__u8 op;	/* CEPH_OSD_WATCH_OP_* */
+			__le32 gen;     /* registration generation */
 		} __attribute__ ((packed)) watch;
 		struct {
 			__le64 offset, length;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
 	}
 }
 
+const char *ceph_osd_watch_op_name(int o)
+{
+	switch (o) {
+	case CEPH_OSD_WATCH_OP_UNWATCH:
+		return "unwatch";
+	case CEPH_OSD_WATCH_OP_WATCH:
+		return "watch";
+	case CEPH_OSD_WATCH_OP_RECONNECT:
+		return "reconnect";
+	case CEPH_OSD_WATCH_OP_PING:
+		return "ping";
+	default:
+		return "???";
+	}
+}
+
 const char *ceph_osd_state_name(int s)
 {
 	switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 61dbd9de4650..e64cb8583533 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -177,6 +177,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
 
 		seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
 			   ceph_osd_op_name(op->op));
+		if (op->op == CEPH_OSD_OP_WATCH)
+			seq_printf(s, "-%s",
+				   ceph_osd_watch_op_name(op->watch.op));
 	}
 
 	seq_putc(s, '\n');
@@ -197,6 +200,31 @@ static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
 	mutex_unlock(&osd->lock);
 }
 
+static void dump_linger_request(struct seq_file *s,
+				struct ceph_osd_linger_request *lreq)
+{
+	seq_printf(s, "%llu\t", lreq->linger_id);
+	dump_target(s, &lreq->t);
+
+	seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen,
+		   lreq->committed ? "C" : "", lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		dump_linger_request(s, lreq);
+	}
+
+	mutex_unlock(&osd->lock);
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -214,6 +242,14 @@ static int osdc_show(struct seq_file *s, void *pp)
 	}
 	dump_requests(s, &osdc->homeless_osd);
 
+	seq_puts(s, "LINGER REQUESTS\n");
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		dump_linger_requests(s, osd);
+	}
+	dump_linger_requests(s, &osdc->homeless_osd);
+
 	up_read(&osdc->lock);
 	return 0;
 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ef1bcbe9af2d..ca0a7b58ba4f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -45,6 +45,10 @@ static const struct ceph_connection_operations osd_con_ops;
 
 static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
 static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+			struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+			  struct ceph_osd_linger_request *lreq);
 
 #if 1
 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -74,10 +78,15 @@ static inline void verify_osd_locked(struct ceph_osd *osd)
 		  rwsem_is_locked(&osdc->lock)) &&
 		!rwsem_is_wrlocked(&osdc->lock));
 }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+	WARN_ON(!mutex_is_locked(&lreq->lock));
+}
 #else
 static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
 static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
 static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
 #endif
 
 /*
@@ -322,6 +331,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_STAT:
 		ceph_osd_data_release(&op->raw_data_in);
 		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+		ceph_osd_data_release(&op->notify_ack.request_data);
+		break;
 	default:
 		break;
 	}
@@ -345,6 +357,29 @@ static void target_init(struct ceph_osd_request_target *t)
 	t->osd = CEPH_HOMELESS_OSD;
 }
 
+static void target_copy(struct ceph_osd_request_target *dest,
+			const struct ceph_osd_request_target *src)
+{
+	ceph_oid_copy(&dest->base_oid, &src->base_oid);
+	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+	ceph_oid_copy(&dest->target_oid, &src->target_oid);
+	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+	dest->pgid = src->pgid; /* struct */
+	dest->pg_num = src->pg_num;
+	dest->pg_num_mask = src->pg_num_mask;
+	ceph_osds_copy(&dest->acting, &src->acting);
+	ceph_osds_copy(&dest->up, &src->up);
+	dest->size = src->size;
+	dest->min_size = src->min_size;
+	dest->sort_bitwise = src->sort_bitwise;
+
+	dest->flags = src->flags;
+	dest->paused = src->paused;
+
+	dest->osd = src->osd;
+}
+
 static void target_destroy(struct ceph_osd_request_target *t)
 {
 	ceph_oid_destroy(&t->base_oid);
@@ -357,8 +392,6 @@ static void target_destroy(struct ceph_osd_request_target *t)
 static void request_release_checks(struct ceph_osd_request *req)
 {
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-	WARN_ON(!list_empty(&req->r_linger_item));
-	WARN_ON(!list_empty(&req->r_linger_osd_item));
 	WARN_ON(!list_empty(&req->r_unsafe_item));
 	WARN_ON(req->r_osd);
 }
@@ -419,13 +452,48 @@ static void request_init(struct ceph_osd_request *req)
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
 	RB_CLEAR_NODE(&req->r_node);
-	INIT_LIST_HEAD(&req->r_linger_item);
-	INIT_LIST_HEAD(&req->r_linger_osd_item);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 
 	target_init(&req->r_t);
 }
 
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable.  Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	bool mempool = req->r_mempool;
+	unsigned int num_ops = req->r_num_ops;
+	u64 snapid = req->r_snapid;
+	struct ceph_snap_context *snapc = req->r_snapc;
+	bool linger = req->r_linger;
+	struct ceph_msg *request_msg = req->r_request;
+	struct ceph_msg *reply_msg = req->r_reply;
+
+	dout("%s req %p\n", __func__, req);
+	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+	request_release_checks(req);
+
+	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+	target_destroy(&req->r_t);
+
+	request_init(req);
+	req->r_osdc = osdc;
+	req->r_mempool = mempool;
+	req->r_num_ops = num_ops;
+	req->r_snapid = snapid;
+	req->r_snapc = snapc;
+	req->r_linger = linger;
+	req->r_request = request_msg;
+	req->r_reply = reply_msg;
+}
+
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       unsigned int num_ops,
@@ -681,21 +749,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-				unsigned int which, u16 opcode,
-				u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+				  u64 cookie, u8 watch_opcode)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-						      opcode, 0);
-
-	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+	struct ceph_osd_req_op *op;
 
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
 	op->watch.cookie = cookie;
-	op->watch.ver = version;
-	if (opcode == CEPH_OSD_OP_WATCH && flag)
-		op->watch.flag = (u8)1;
+	op->watch.op = watch_opcode;
+	op->watch.gen = 0;
 }
-EXPORT_SYMBOL(osd_req_op_watch_init);
 
 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				unsigned int which,
@@ -771,11 +837,13 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
 		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
-		dst->watch.ver = cpu_to_le64(src->watch.ver);
-		dst->watch.flag = src->watch.flag;
+		dst->watch.ver = cpu_to_le64(0);
+		dst->watch.op = src->watch.op;
+		dst->watch.gen = cpu_to_le32(src->watch.gen);
+		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
 		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
 		dst->alloc_hint.expected_object_size =
@@ -915,7 +983,7 @@ static void osd_init(struct ceph_osd *osd)
 	atomic_set(&osd->o_ref, 1);
 	RB_CLEAR_NODE(&osd->o_node);
 	osd->o_requests = RB_ROOT;
-	INIT_LIST_HEAD(&osd->o_linger_requests);
+	osd->o_linger_requests = RB_ROOT;
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	osd->o_incarnation = 1;
@@ -926,7 +994,7 @@ static void osd_cleanup(struct ceph_osd *osd)
 {
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
 	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
-	WARN_ON(!list_empty(&osd->o_linger_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
 
@@ -996,7 +1064,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd)
 static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
 	if (RB_EMPTY_ROOT(&osd->o_requests) &&
-	    list_empty(&osd->o_linger_requests))
+	    RB_EMPTY_ROOT(&osd->o_linger_requests))
 		__move_osd_to_lru(osd);
 }
 
@@ -1036,6 +1104,17 @@ static void close_osd(struct ceph_osd *osd)
 		unlink_request(osd, req);
 		link_request(&osdc->homeless_osd, req);
 	}
+	for (n = rb_first(&osd->o_linger_requests); n; ) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		n = rb_next(n); /* unlink_linger() */
+
+		dout(" reassigning lreq %p linger_id %llu\n", lreq,
+		     lreq->linger_id);
+		unlink_linger(osd, lreq);
+		link_linger(&osdc->homeless_osd, lreq);
+	}
 
 	__remove_osd_from_lru(osd);
 	erase_osd(&osdc->osds, osd);
@@ -1052,7 +1131,7 @@ static int reopen_osd(struct ceph_osd *osd)
 	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
 
 	if (RB_EMPTY_ROOT(&osd->o_requests) &&
-	    list_empty(&osd->o_linger_requests)) {
+	    RB_EMPTY_ROOT(&osd->o_linger_requests)) {
 		close_osd(osd);
 		return -ENODEV;
 	}
@@ -1148,52 +1227,6 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 		atomic_dec(&osd->o_osdc->num_homeless);
 }
 
-static void __register_linger_request(struct ceph_osd *osd,
-				    struct ceph_osd_request *req)
-{
-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-	WARN_ON(!req->r_linger);
-
-	ceph_osdc_get_request(req);
-	list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger);
-	list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests);
-	__remove_osd_from_lru(osd);
-	req->r_osd = osd;
-}
-
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-					struct ceph_osd_request *req)
-{
-	WARN_ON(!req->r_linger);
-
-	if (list_empty(&req->r_linger_item)) {
-		dout("%s %p tid %llu not registered\n", __func__, req,
-		     req->r_tid);
-		return;
-	}
-
-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-	list_del_init(&req->r_linger_item);
-
-	if (req->r_osd) {
-		list_del_init(&req->r_linger_osd_item);
-		maybe_move_osd_to_lru(req->r_osd);
-		if (RB_EMPTY_ROOT(&req->r_osd->o_requests))
-			req->r_osd = NULL;
-	}
-	ceph_osdc_put_request(req);
-}
-
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-				  struct ceph_osd_request *req)
-{
-	if (!req->r_linger) {
-		dout("set_request_linger %p\n", req);
-		req->r_linger = 1;
-	}
-}
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
-
 static bool __pool_full(struct ceph_pg_pool_info *pi)
 {
 	return pi->flags & CEPH_POOL_FLAG_FULL;
@@ -1379,6 +1412,10 @@ static void setup_request_data(struct ceph_osd_request *req,
 						  op->xattr.value_len);
 			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
 			break;
+		case CEPH_OSD_OP_NOTIFY_ACK:
+			ceph_osdc_msg_data_add(msg,
+					       &op->notify_ack.request_data);
+			break;
 
 		/* reply */
 		case CEPH_OSD_OP_STAT:
@@ -1683,6 +1720,460 @@ static void cancel_request(struct ceph_osd_request *req)
 	finish_request(req);
 }
 
+/*
+ * lingering requests, watch/notify v2 infrastructure
+ */
+static void linger_release(struct kref *kref)
+{
+	struct ceph_osd_linger_request *lreq =
+	    container_of(kref, struct ceph_osd_linger_request, kref);
+
+	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+	     lreq->reg_req, lreq->ping_req);
+	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+	WARN_ON(!list_empty(&lreq->scan_item));
+	WARN_ON(lreq->osd);
+
+	if (lreq->reg_req)
+		ceph_osdc_put_request(lreq->reg_req);
+	if (lreq->ping_req)
+		ceph_osdc_put_request(lreq->ping_req);
+	target_destroy(&lreq->t);
+	kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
+{
+	if (lreq)
+		kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+	kref_get(&lreq->kref);
+	return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_linger_request *lreq;
+
+	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+	if (!lreq)
+		return NULL;
+
+	kref_init(&lreq->kref);
+	mutex_init(&lreq->lock);
+	RB_CLEAR_NODE(&lreq->node);
+	RB_CLEAR_NODE(&lreq->osdc_node);
+	INIT_LIST_HEAD(&lreq->scan_item);
+	init_completion(&lreq->reg_commit_wait);
+
+	lreq->osdc = osdc;
+	target_init(&lreq->t);
+
+	dout("%s lreq %p\n", __func__, lreq);
+	return lreq;
+}
+
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+
+/*
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
+ */
+static void link_linger(struct ceph_osd *osd,
+			struct ceph_osd_linger_request *lreq)
+{
+	verify_osd_locked(osd);
+	WARN_ON(!lreq->linger_id || lreq->osd);
+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+	     osd->o_osd, lreq, lreq->linger_id);
+
+	if (!osd_homeless(osd))
+		__remove_osd_from_lru(osd);
+	else
+		atomic_inc(&osd->o_osdc->num_homeless);
+
+	get_osd(osd);
+	insert_linger(&osd->o_linger_requests, lreq);
+	lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+			  struct ceph_osd_linger_request *lreq)
+{
+	verify_osd_locked(osd);
+	WARN_ON(lreq->osd != osd);
+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+	     osd->o_osd, lreq, lreq->linger_id);
+
+	lreq->osd = NULL;
+	erase_linger(&osd->o_linger_requests, lreq);
+	put_osd(osd);
+
+	if (!osd_homeless(osd))
+		maybe_move_osd_to_lru(osd);
+	else
+		atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+	verify_osdc_locked(lreq->osdc);
+
+	return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	bool registered;
+
+	down_read(&osdc->lock);
+	registered = __linger_registered(lreq);
+	up_read(&osdc->lock);
+
+	return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(lreq->linger_id);
+
+	linger_get(lreq);
+	lreq->linger_id = ++osdc->last_linger_id;
+	insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_osdc_wrlocked(osdc);
+
+	erase_linger_osdc(&osdc->linger_requests, lreq);
+	linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	WARN_ON(!req->r_linger);
+	cancel_request(req);
+	linger_put(lreq);
+}
+
+struct linger_work {
+	struct work_struct work;
+	struct ceph_osd_linger_request *lreq;
+
+	union {
+		struct {
+			u64 notify_id;
+			u64 notifier_id;
+			void *payload; /* points into @msg front */
+			size_t payload_len;
+
+			struct ceph_msg *msg; /* for ceph_msg_put() */
+		} notify;
+		struct {
+			int err;
+		} error;
+	};
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+				       work_func_t workfn)
+{
+	struct linger_work *lwork;
+
+	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+	if (!lwork)
+		return NULL;
+
+	INIT_WORK(&lwork->work, workfn);
+	lwork->lreq = linger_get(lreq);
+
+	return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	linger_put(lreq);
+	kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_lreq_locked(lreq);
+	queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+	struct linger_work *lwork = container_of(w, struct linger_work, work);
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	if (!linger_registered(lreq)) {
+		dout("%s lreq %p not registered\n", __func__, lreq);
+		goto out;
+	}
+
+	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+	     __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+	     lwork->notify.payload_len);
+	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+		  lwork->notify.notifier_id, lwork->notify.payload,
+		  lwork->notify.payload_len);
+
+out:
+	ceph_msg_put(lwork->notify.msg);
+	lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+	struct linger_work *lwork = container_of(w, struct linger_work, work);
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	if (!linger_registered(lreq)) {
+		dout("%s lreq %p not registered\n", __func__, lreq);
+		goto out;
+	}
+
+	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+	lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+	struct linger_work *lwork;
+
+	lwork = lwork_alloc(lreq, do_watch_error);
+	if (!lwork) {
+		pr_err("failed to allocate error-lwork\n");
+		return;
+	}
+
+	lwork->error.err = lreq->last_error;
+	lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+				       int result)
+{
+	if (!completion_done(&lreq->reg_commit_wait)) {
+		lreq->reg_commit_error = (result <= 0 ? result : 0);
+		complete_all(&lreq->reg_commit_wait);
+	}
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+	     lreq->linger_id, req->r_result);
+	WARN_ON(!__linger_registered(lreq));
+	linger_reg_commit_complete(lreq, req->r_result);
+	lreq->committed = true;
+
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
+}
+
+static int normalize_watch_error(int err)
+{
+	/*
+	 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+	 * notification and a failure to reconnect because we raced with
+	 * the delete appear the same to the user.
+	 */
+	if (err == -ENOENT)
+		err = -ENOTCONN;
+
+	return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+	     lreq, lreq->linger_id, req->r_result, lreq->last_error);
+	if (req->r_result < 0) {
+		if (!lreq->last_error) {
+			lreq->last_error = normalize_watch_error(req->r_result);
+			queue_watch_error(lreq);
+		}
+	}
+
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_request *req = lreq->reg_req;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
+
+	verify_osdc_wrlocked(req->r_osdc);
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+
+	if (req->r_osd)
+		cancel_linger_request(req);
+
+	request_reinit(req);
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+	req->r_flags = lreq->t.flags;
+	req->r_mtime = lreq->mtime;
+
+	mutex_lock(&lreq->lock);
+	if (lreq->committed) {
+		WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+			op->watch.cookie != lreq->linger_id);
+		op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+		op->watch.gen = ++lreq->register_gen;
+		dout("lreq %p reconnect register_gen %u\n", lreq,
+		     op->watch.gen);
+		req->r_callback = linger_reconnect_cb;
+	} else {
+		WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+		dout("lreq %p register\n", lreq);
+		req->r_callback = linger_commit_cb;
+	}
+	mutex_unlock(&lreq->lock);
+
+	req->r_priv = linger_get(lreq);
+	req->r_linger = true;
+
+	submit_request(req, true);
+}
+
+static void linger_ping_cb(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+	     lreq->last_error);
+	if (lreq->register_gen == req->r_ops[0].watch.gen) {
+		if (req->r_result && !lreq->last_error) {
+			lreq->last_error = normalize_watch_error(req->r_result);
+			queue_watch_error(lreq);
+		}
+	} else {
+		dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+		     lreq->register_gen, req->r_ops[0].watch.gen);
+	}
+
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
+}
+
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_request *req = lreq->ping_req;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
+
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
+		dout("%s PAUSERD\n", __func__);
+		return;
+	}
+
+	lreq->ping_sent = jiffies;
+	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+	     __func__, lreq, lreq->linger_id, lreq->ping_sent,
+	     lreq->register_gen);
+
+	if (req->r_osd)
+		cancel_linger_request(req);
+
+	request_reinit(req);
+	target_copy(&req->r_t, &lreq->t);
+
+	WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+		op->watch.cookie != lreq->linger_id ||
+		op->watch.op != CEPH_OSD_WATCH_OP_PING);
+	op->watch.gen = lreq->register_gen;
+	req->r_callback = linger_ping_cb;
+	req->r_priv = linger_get(lreq);
+	req->r_linger = true;
+
+	ceph_osdc_get_request(req);
+	account_request(req);
+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
+	link_request(lreq->osd, req);
+	send_request(req);
+}
+
+static void linger_submit(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd *osd;
+
+	calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+	osd = lookup_create_osd(osdc, lreq->t.osd, true);
+	link_linger(osd, lreq);
+
+	send_linger(lreq);
+}
+
+/*
+ * @lreq has to be both registered and linked.
+ */
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+	if (lreq->ping_req->r_osd)
+		cancel_linger_request(lreq->ping_req);
+	if (lreq->reg_req->r_osd)
+		cancel_linger_request(lreq->reg_req);
+	unlink_linger(lreq->osd, lreq);
+	linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	down_write(&osdc->lock);
+	if (__linger_registered(lreq))
+		__linger_cancel(lreq);
+	up_write(&osdc->lock);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+	int ret;
+
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+	return ret ?: lreq->reg_commit_error;
+}
+
 /*
  * Timeout callback, called every N seconds.  When 1 or more OSD
  * requests has been active for more than N seconds, we send a keepalive
@@ -1720,6 +2211,19 @@ static void handle_timeout(struct work_struct *work)
 				found = true;
 			}
 		}
+		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+			struct ceph_osd_linger_request *lreq =
+			    rb_entry(p, struct ceph_osd_linger_request, node);
+
+			dout(" lreq %p linger_id %llu is served by osd%d\n",
+			     lreq, lreq->linger_id, osd->o_osd);
+			found = true;
+
+			mutex_lock(&lreq->lock);
+			if (lreq->committed && !lreq->last_error)
+				send_linger_ping(lreq);
+			mutex_unlock(&lreq->lock);
+		}
 
 		if (found)
 			list_move_tail(&osd->o_keepalive_item, &slow_osds);
@@ -1756,7 +2260,7 @@ static void handle_osds_timeout(struct work_struct *work)
 			break;
 
 		WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
-		WARN_ON(!list_empty(&osd->o_linger_requests));
+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
 		close_osd(osd);
 	}
 
@@ -2082,7 +2586,8 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		__finish_request(req);
 		if (req->r_linger) {
 			WARN_ON(req->r_unsafe_callback);
-			__register_linger_request(osd, req);
+			dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+			__complete_request(req);
 		}
 	}
 
@@ -2093,7 +2598,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		if (already_acked && req->r_unsafe_callback) {
 			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
 			req->r_unsafe_callback(req, false);
-		} else {
+		} else if (!req->r_linger) {
 			dout("req %p tid %llu cb\n", req, req->r_tid);
 			__complete_request(req);
 		}
@@ -2145,6 +2650,26 @@ static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
 	return pi->was_full && !__pool_full(pi);
 }
 
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	enum calc_target_result ct_res;
+
+	ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+	if (ct_res == CALC_TARGET_NEED_RESEND) {
+		struct ceph_osd *osd;
+
+		osd = lookup_create_osd(osdc, lreq->t.osd, true);
+		if (osd != lreq->osd) {
+			unlink_linger(lreq->osd, lreq);
+			link_linger(osd, lreq);
+		}
+	}
+
+	return ct_res;
+}
+
 /*
  * Requeue requests whose mapping to an OSD has changed.
  */
@@ -2159,6 +2684,39 @@ static void scan_requests(struct ceph_osd *osd,
 	struct rb_node *n;
 	bool force_resend_writes;
 
+	for (n = rb_first(&osd->o_linger_requests); n; ) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+		enum calc_target_result ct_res;
+
+		n = rb_next(n); /* recalc_linger_target() */
+
+		dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+		     lreq->linger_id);
+		ct_res = recalc_linger_target(lreq);
+		switch (ct_res) {
+		case CALC_TARGET_NO_ACTION:
+			force_resend_writes = cleared_full ||
+			    (check_pool_cleared_full &&
+			     pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+			if (!force_resend && !force_resend_writes)
+				break;
+
+			/* fall through */
+		case CALC_TARGET_NEED_RESEND:
+			/*
+			 * scan_requests() for the previous epoch(s)
+			 * may have already added it to the list, since
+			 * it's not unlinked here.
+			 */
+			if (list_empty(&lreq->scan_item))
+				list_add_tail(&lreq->scan_item, need_resend_linger);
+			break;
+		case CALC_TARGET_POOL_DNE:
+			break;
+		}
+	}
+
 	for (n = rb_first(&osd->o_requests); n; ) {
 		struct ceph_osd_request *req =
 		    rb_entry(n, struct ceph_osd_request, r_node);
@@ -2263,6 +2821,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
 			  struct rb_root *need_resend,
 			  struct list_head *need_resend_linger)
 {
+	struct ceph_osd_linger_request *lreq, *nlreq;
 	struct rb_node *n;
 
 	for (n = rb_first(need_resend); n; ) {
@@ -2280,8 +2839,17 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		if (!req->r_linger) {
 			if (!osd_homeless(osd) && !req->r_t.paused)
 				send_request(req);
+		} else {
+			cancel_linger_request(req);
 		}
 	}
+
+	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+		if (!osd_homeless(lreq->osd))
+			send_linger(lreq);
+
+		list_del_init(&lreq->scan_item);
+	}
 }
 
 /*
@@ -2406,15 +2974,25 @@ static void kick_osd_requests(struct ceph_osd *osd)
 {
 	struct rb_node *n;
 
-	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+	for (n = rb_first(&osd->o_requests); n; ) {
 		struct ceph_osd_request *req =
 		    rb_entry(n, struct ceph_osd_request, r_node);
 
+		n = rb_next(n); /* cancel_linger_request() */
+
 		if (!req->r_linger) {
 			if (!req->r_t.paused)
 				send_request(req);
+		} else {
+			cancel_linger_request(req);
 		}
 	}
+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		send_linger(lreq);
+	}
 }
 
 /*
@@ -2441,193 +3019,77 @@ out_unlock:
 	up_write(&osdc->lock);
 }
 
-/*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
- */
-static void __release_event(struct kref *kref)
-{
-	struct ceph_osd_event *event =
-		container_of(kref, struct ceph_osd_event, kref);
-
-	dout("__release_event %p\n", event);
-	kfree(event);
-}
-
-static void get_event(struct ceph_osd_event *event)
-{
-	kref_get(&event->kref);
-}
-
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
-	kref_put(&event->kref, __release_event);
-}
-EXPORT_SYMBOL(ceph_osdc_put_event);
-
-static void __insert_event(struct ceph_osd_client *osdc,
-			     struct ceph_osd_event *new)
-{
-	struct rb_node **p = &osdc->event_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_event *event = NULL;
-
-	while (*p) {
-		parent = *p;
-		event = rb_entry(parent, struct ceph_osd_event, node);
-		if (new->cookie < event->cookie)
-			p = &(*p)->rb_left;
-		else if (new->cookie > event->cookie)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &osdc->event_tree);
-}
-
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
-					        u64 cookie)
-{
-	struct rb_node **p = &osdc->event_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_event *event = NULL;
-
-	while (*p) {
-		parent = *p;
-		event = rb_entry(parent, struct ceph_osd_event, node);
-		if (cookie < event->cookie)
-			p = &(*p)->rb_left;
-		else if (cookie > event->cookie)
-			p = &(*p)->rb_right;
-		else
-			return event;
-	}
-	return NULL;
-}
-
-static void __remove_event(struct ceph_osd_event *event)
-{
-	struct ceph_osd_client *osdc = event->osdc;
-
-	if (!RB_EMPTY_NODE(&event->node)) {
-		dout("__remove_event removed %p\n", event);
-		rb_erase(&event->node, &osdc->event_tree);
-		ceph_osdc_put_event(event);
-	} else {
-		dout("__remove_event didn't remove %p\n", event);
-	}
-}
-
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-			   void (*event_cb)(u64, u64, u8, void *),
-			   void *data, struct ceph_osd_event **pevent)
-{
-	struct ceph_osd_event *event;
-
-	event = kmalloc(sizeof(*event), GFP_NOIO);
-	if (!event)
-		return -ENOMEM;
-
-	dout("create_event %p\n", event);
-	event->cb = event_cb;
-	event->one_shot = 0;
-	event->data = data;
-	event->osdc = osdc;
-	INIT_LIST_HEAD(&event->osd_node);
-	RB_CLEAR_NODE(&event->node);
-	kref_init(&event->kref);   /* one ref for us */
-	kref_get(&event->kref);    /* one ref for the caller */
-
-	spin_lock(&osdc->event_lock);
-	event->cookie = ++osdc->event_count;
-	__insert_event(osdc, event);
-	spin_unlock(&osdc->event_lock);
-
-	*pevent = event;
-	return 0;
-}
-EXPORT_SYMBOL(ceph_osdc_create_event);
-
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
-{
-	struct ceph_osd_client *osdc = event->osdc;
-
-	dout("cancel_event %p\n", event);
-	spin_lock(&osdc->event_lock);
-	__remove_event(event);
-	spin_unlock(&osdc->event_lock);
-	ceph_osdc_put_event(event); /* caller's */
-}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
-
-
-static void do_event_work(struct work_struct *work)
-{
-	struct ceph_osd_event_work *event_work =
-		container_of(work, struct ceph_osd_event_work, work);
-	struct ceph_osd_event *event = event_work->event;
-	u64 ver = event_work->ver;
-	u64 notify_id = event_work->notify_id;
-	u8 opcode = event_work->opcode;
-
-	dout("do_event_work completing %p\n", event);
-	event->cb(ver, notify_id, opcode, event->data);
-	dout("do_event_work completed %p\n", event);
-	ceph_osdc_put_event(event);
-	kfree(event_work);
-}
-
-
 /*
  * Process osd watch notifications
  */
 static void handle_watch_notify(struct ceph_osd_client *osdc,
 				struct ceph_msg *msg)
 {
-	void *p, *end;
-	u8 proto_ver;
-	u64 cookie, ver, notify_id;
-	u8 opcode;
-	struct ceph_osd_event *event;
-	struct ceph_osd_event_work *event_work;
-
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	struct ceph_osd_linger_request *lreq;
+	struct linger_work *lwork;
+	u8 proto_ver, opcode;
+	u64 cookie, notify_id;
+	u64 notifier_id = 0;
+	void *payload = NULL;
+	u32 payload_len = 0;
 
 	ceph_decode_8_safe(&p, end, proto_ver, bad);
 	ceph_decode_8_safe(&p, end, opcode, bad);
 	ceph_decode_64_safe(&p, end, cookie, bad);
-	ceph_decode_64_safe(&p, end, ver, bad);
+	p += 8; /* skip ver */
 	ceph_decode_64_safe(&p, end, notify_id, bad);
 
-	spin_lock(&osdc->event_lock);
-	event = __find_event(osdc, cookie);
-	if (event) {
-		BUG_ON(event->one_shot);
-		get_event(event);
-	}
-	spin_unlock(&osdc->event_lock);
-	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
-	     cookie, ver, event);
-	if (event) {
-		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
-		if (!event_work) {
-			pr_err("couldn't allocate event_work\n");
-			ceph_osdc_put_event(event);
-			return;
+	if (proto_ver >= 1) {
+		ceph_decode_32_safe(&p, end, payload_len, bad);
+		ceph_decode_need(&p, end, payload_len, bad);
+		payload = p;
+		p += payload_len;
+	}
+
+	if (le16_to_cpu(msg->hdr.version) >= 2)
+		p += 4; /* skip return_code */
+
+	if (le16_to_cpu(msg->hdr.version) >= 3)
+		ceph_decode_64_safe(&p, end, notifier_id, bad);
+
+	down_read(&osdc->lock);
+	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+	if (!lreq) {
+		dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+		     cookie);
+		goto out_unlock_osdc;
+	}
+
+	mutex_lock(&lreq->lock);
+	dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie,
+	     lreq);
+	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+		if (!lreq->last_error) {
+			lreq->last_error = -ENOTCONN;
+			queue_watch_error(lreq);
+		}
+	} else {
+		/* CEPH_WATCH_EVENT_NOTIFY */
+		lwork = lwork_alloc(lreq, do_watch_notify);
+		if (!lwork) {
+			pr_err("failed to allocate notify-lwork\n");
+			goto out_unlock_lreq;
 		}
-		INIT_WORK(&event_work->work, do_event_work);
-		event_work->event = event;
-		event_work->ver = ver;
-		event_work->notify_id = notify_id;
-		event_work->opcode = opcode;
 
-		queue_work(osdc->notify_wq, &event_work->work);
+		lwork->notify.notify_id = notify_id;
+		lwork->notify.notifier_id = notifier_id;
+		lwork->notify.payload = payload;
+		lwork->notify.payload_len = payload_len;
+		lwork->notify.msg = ceph_msg_get(msg);
+		lwork_queue(lwork);
 	}
 
+out_unlock_lreq:
+	mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
 	return;
 
 bad:
@@ -2659,8 +3121,6 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 	struct ceph_osd_client *osdc = req->r_osdc;
 
 	down_write(&osdc->lock);
-	if (req->r_linger)
-		__unregister_linger_request(osdc, req);
 	if (req->r_osd)
 		cancel_request(req);
 	up_write(&osdc->lock);
@@ -2743,6 +3203,198 @@ again:
 }
 EXPORT_SYMBOL(ceph_osdc_sync);
 
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_request *req;
+
+	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return NULL;
+
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+
+	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+
+	return req;
+}
+
+/*
+ * Returns a handle, caller owns a ref.
+ */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+		struct ceph_object_id *oid,
+		struct ceph_object_locator *oloc,
+		rados_watchcb2_t wcb,
+		rados_watcherrcb_t errcb,
+		void *data)
+{
+	struct ceph_osd_linger_request *lreq;
+	int ret;
+
+	lreq = linger_alloc(osdc);
+	if (!lreq)
+		return ERR_PTR(-ENOMEM);
+
+	lreq->wcb = wcb;
+	lreq->errcb = errcb;
+	lreq->data = data;
+
+	ceph_oid_copy(&lreq->t.base_oid, oid);
+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+	lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	lreq->mtime = CURRENT_TIME;
+
+	lreq->reg_req = alloc_linger_request(lreq);
+	if (!lreq->reg_req) {
+		ret = -ENOMEM;
+		goto err_put_lreq;
+	}
+
+	lreq->ping_req = alloc_linger_request(lreq);
+	if (!lreq->ping_req) {
+		ret = -ENOMEM;
+		goto err_put_lreq;
+	}
+
+	down_write(&osdc->lock);
+	linger_register(lreq); /* before osd_req_op_* */
+	osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_WATCH);
+	osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_PING);
+	linger_submit(lreq);
+	up_write(&osdc->lock);
+
+	ret = linger_reg_commit_wait(lreq);
+	if (ret) {
+		linger_cancel(lreq);
+		goto err_put_lreq;
+	}
+
+	return lreq;
+
+err_put_lreq:
+	linger_put(lreq);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ceph_osdc_watch);
+
+/*
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
+ */
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+		      struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_options *opts = osdc->client->options;
+	struct ceph_osd_request *req;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	req->r_mtime = CURRENT_TIME;
+	osd_req_op_watch_init(req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_UNWATCH);
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
+
+	ceph_osdc_start_request(osdc, req, false);
+	linger_cancel(lreq);
+	linger_put(lreq);
+	ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_unwatch);
+
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+				      u64 notify_id, u64 cookie, void *payload,
+				      size_t payload_len)
+{
+	struct ceph_osd_req_op *op;
+	struct ceph_pagelist *pl;
+	int ret;
+
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
+	if (!pl)
+		return -ENOMEM;
+
+	ceph_pagelist_init(pl);
+	ret = ceph_pagelist_encode_64(pl, notify_id);
+	ret |= ceph_pagelist_encode_64(pl, cookie);
+	if (payload) {
+		ret |= ceph_pagelist_encode_32(pl, payload_len);
+		ret |= ceph_pagelist_append(pl, payload, payload_len);
+	} else {
+		ret |= ceph_pagelist_encode_32(pl, 0);
+	}
+	if (ret) {
+		ceph_pagelist_release(pl);
+		return -ENOMEM;
+	}
+
+	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+	op->indata_len = pl->length;
+	return 0;
+}
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+			 struct ceph_object_id *oid,
+			 struct ceph_object_locator *oloc,
+			 u64 notify_id,
+			 u64 cookie,
+			 void *payload,
+			 size_t payload_len)
+{
+	struct ceph_osd_request *req;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_READ;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
+
+	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+					 payload_len);
+	if (ret)
+		goto out_put_req;
+
+	ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
+
 /*
  * Call all pending notify callbacks - for use after a watch is
  * unregistered, to make sure no more callbacks for it will be invoked
@@ -2767,15 +3419,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->osds = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->osd_lru);
 	spin_lock_init(&osdc->osd_lru_lock);
-	INIT_LIST_HEAD(&osdc->req_linger);
 	osd_init(&osdc->homeless_osd);
 	osdc->homeless_osd.o_osdc = osdc;
 	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+	osdc->linger_requests = RB_ROOT;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-	spin_lock_init(&osdc->event_lock);
-	osdc->event_tree = RB_ROOT;
-	osdc->event_count = 0;
 
 	err = -ENOMEM;
 	osdc->osdmap = ceph_osdmap_alloc();
@@ -2838,6 +3487,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	osd_cleanup(&osdc->homeless_osd);
 
 	WARN_ON(!list_empty(&osdc->osd_lru));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
-- 
cgit v1.2.3


From 1907920324f1f3ebb6618344417c03a2863bba01 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:27 +0200
Subject: libceph: support for sending notifies

Implement ceph_osdc_notify() for sending notifies.

Due to the fact that the current messenger can't do read-in into
pagelists (it can only do write-out from them), I had to go with a page
vector for a NOTIFY_COMPLETE payload, for now.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  20 ++++
 include/linux/ceph/rados.h      |   3 +
 net/ceph/debugfs.c              |   5 +-
 net/ceph/osd_client.c           | 232 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 249 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cd2dcb8939de..63054fae4f15 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -114,6 +114,11 @@ struct ceph_osd_req_op {
 		struct {
 			struct ceph_osd_data request_data;
 		} notify_ack;
+		struct {
+			u64 cookie;
+			struct ceph_osd_data request_data;
+			struct ceph_osd_data response_data;
+		} notify;
 		struct {
 			u64 expected_object_size;
 			u64 expected_write_size;
@@ -202,6 +207,7 @@ struct ceph_osd_linger_request {
 	struct ceph_osd_client *osdc;
 	u64 linger_id;
 	bool committed;
+	bool is_watch;                  /* watch or notify */
 
 	struct ceph_osd *osd;
 	struct ceph_osd_request *reg_req;
@@ -220,14 +226,20 @@ struct ceph_osd_linger_request {
 	struct list_head scan_item;
 
 	struct completion reg_commit_wait;
+	struct completion notify_finish_wait;
 	int reg_commit_error;
+	int notify_finish_error;
 	int last_error;
 
 	u32 register_gen;
+	u64 notify_id;
 
 	rados_watchcb2_t wcb;
 	rados_watcherrcb_t errcb;
 	void *data;
+
+	struct page ***preply_pages;
+	size_t *preply_len;
 };
 
 struct ceph_osd_client {
@@ -397,5 +409,13 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
 			 u64 cookie,
 			 void *payload,
 			 size_t payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+		     struct ceph_object_id *oid,
+		     struct ceph_object_locator *oloc,
+		     void *payload,
+		     size_t payload_len,
+		     u32 timeout,
+		     struct page ***preply_pages,
+		     size_t *preply_len);
 #endif
 
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 204c8c944703..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -476,6 +476,9 @@ struct ceph_osd_op {
 			__u8 op;	/* CEPH_OSD_WATCH_OP_* */
 			__le32 gen;     /* registration generation */
 		} __attribute__ ((packed)) watch;
+		struct {
+			__le64 cookie;
+		} __attribute__ ((packed)) notify;
 		struct {
 			__le64 offset, length;
 			__le64 src_offset;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index e64cb8583533..39f91c7250f6 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -206,8 +206,9 @@ static void dump_linger_request(struct seq_file *s,
 	seq_printf(s, "%llu\t", lreq->linger_id);
 	dump_target(s, &lreq->t);
 
-	seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen,
-		   lreq->committed ? "C" : "", lreq->last_error);
+	seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+		   lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+		   lreq->last_error);
 }
 
 static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ca0a7b58ba4f..e6e3ab4223db 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -334,6 +334,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_NOTIFY_ACK:
 		ceph_osd_data_release(&op->notify_ack.request_data);
 		break;
+	case CEPH_OSD_OP_NOTIFY:
+		ceph_osd_data_release(&op->notify.request_data);
+		ceph_osd_data_release(&op->notify.response_data);
+		break;
 	default:
 		break;
 	}
@@ -845,6 +849,9 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 		break;
 	case CEPH_OSD_OP_NOTIFY_ACK:
 		break;
+	case CEPH_OSD_OP_NOTIFY:
+		dst->notify.cookie = cpu_to_le64(src->notify.cookie);
+		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
 		dst->alloc_hint.expected_object_size =
 		    cpu_to_le64(src->alloc_hint.expected_object_size);
@@ -1439,6 +1446,12 @@ static void setup_request_data(struct ceph_osd_request *req,
 			ceph_osdc_msg_data_add(req->r_reply,
 					       &op->cls.response_data);
 			break;
+		case CEPH_OSD_OP_NOTIFY:
+			ceph_osdc_msg_data_add(msg,
+					       &op->notify.request_data);
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->notify.response_data);
+			break;
 		}
 
 		data_len += op->indata_len;
@@ -1771,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc)
 	RB_CLEAR_NODE(&lreq->osdc_node);
 	INIT_LIST_HEAD(&lreq->scan_item);
 	init_completion(&lreq->reg_commit_wait);
+	init_completion(&lreq->notify_finish_wait);
 
 	lreq->osdc = osdc;
 	target_init(&lreq->t);
@@ -1934,6 +1948,7 @@ static void do_watch_notify(struct work_struct *w)
 		goto out;
 	}
 
+	WARN_ON(!lreq->is_watch);
 	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
 	     __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
 	     lwork->notify.payload_len);
@@ -1997,6 +2012,24 @@ static void linger_commit_cb(struct ceph_osd_request *req)
 	linger_reg_commit_complete(lreq, req->r_result);
 	lreq->committed = true;
 
+	if (!lreq->is_watch) {
+		struct ceph_osd_data *osd_data =
+		    osd_req_op_data(req, 0, notify, response_data);
+		void *p = page_address(osd_data->pages[0]);
+
+		WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+			osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+		/* make note of the notify_id */
+		if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+			lreq->notify_id = ceph_decode_64(&p);
+			dout("lreq %p notify_id %llu\n", lreq,
+			     lreq->notify_id);
+		} else {
+			dout("lreq %p no notify_id\n", lreq);
+		}
+	}
+
 	mutex_unlock(&lreq->lock);
 	linger_put(lreq);
 }
@@ -2050,7 +2083,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq)
 	req->r_mtime = lreq->mtime;
 
 	mutex_lock(&lreq->lock);
-	if (lreq->committed) {
+	if (lreq->is_watch && lreq->committed) {
 		WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
 			op->watch.cookie != lreq->linger_id);
 		op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
@@ -2059,7 +2092,10 @@ static void send_linger(struct ceph_osd_linger_request *lreq)
 		     op->watch.gen);
 		req->r_callback = linger_reconnect_cb;
 	} else {
-		WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+		if (!lreq->is_watch)
+			lreq->notify_id = 0;
+		else
+			WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
 		dout("lreq %p register\n", lreq);
 		req->r_callback = linger_commit_cb;
 	}
@@ -2147,7 +2183,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
  */
 static void __linger_cancel(struct ceph_osd_linger_request *lreq)
 {
-	if (lreq->ping_req->r_osd)
+	if (lreq->is_watch && lreq->ping_req->r_osd)
 		cancel_linger_request(lreq->ping_req);
 	if (lreq->reg_req->r_osd)
 		cancel_linger_request(lreq->reg_req);
@@ -2174,6 +2210,15 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
 	return ret ?: lreq->reg_commit_error;
 }
 
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+	int ret;
+
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+	return ret ?: lreq->notify_finish_error;
+}
+
 /*
  * Timeout callback, called every N seconds.  When 1 or more OSD
  * requests has been active for more than N seconds, we send a keepalive
@@ -2220,7 +2265,7 @@ static void handle_timeout(struct work_struct *work)
 			found = true;
 
 			mutex_lock(&lreq->lock);
-			if (lreq->committed && !lreq->last_error)
+			if (lreq->is_watch && lreq->committed && !lreq->last_error)
 				send_linger_ping(lreq);
 			mutex_unlock(&lreq->lock);
 		}
@@ -3032,6 +3077,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
 	u8 proto_ver, opcode;
 	u64 cookie, notify_id;
 	u64 notifier_id = 0;
+	s32 return_code = 0;
 	void *payload = NULL;
 	u32 payload_len = 0;
 
@@ -3049,7 +3095,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
 	}
 
 	if (le16_to_cpu(msg->hdr.version) >= 2)
-		p += 4; /* skip return_code */
+		ceph_decode_32_safe(&p, end, return_code, bad);
 
 	if (le16_to_cpu(msg->hdr.version) >= 3)
 		ceph_decode_64_safe(&p, end, notifier_id, bad);
@@ -3063,13 +3109,38 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
 	}
 
 	mutex_lock(&lreq->lock);
-	dout("%s opcode %d cookie %llu lreq %p\n", __func__, opcode, cookie,
-	     lreq);
+	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+	     opcode, cookie, lreq, lreq->is_watch);
 	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
 		if (!lreq->last_error) {
 			lreq->last_error = -ENOTCONN;
 			queue_watch_error(lreq);
 		}
+	} else if (!lreq->is_watch) {
+		/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+		if (lreq->notify_id && lreq->notify_id != notify_id) {
+			dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+			     lreq->notify_id, notify_id);
+		} else if (!completion_done(&lreq->notify_finish_wait)) {
+			struct ceph_msg_data *data =
+			    list_first_entry_or_null(&msg->data,
+						     struct ceph_msg_data,
+						     links);
+
+			if (data) {
+				if (lreq->preply_pages) {
+					WARN_ON(data->type !=
+							CEPH_MSG_DATA_PAGES);
+					*lreq->preply_pages = data->pages;
+					*lreq->preply_len = data->length;
+				} else {
+					ceph_release_page_vector(data->pages,
+					       calc_pages_for(0, data->length));
+				}
+			}
+			lreq->notify_finish_error = return_code;
+			complete_all(&lreq->notify_finish_wait);
+		}
 	} else {
 		/* CEPH_WATCH_EVENT_NOTIFY */
 		lwork = lwork_alloc(lreq, do_watch_notify);
@@ -3241,6 +3312,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 	if (!lreq)
 		return ERR_PTR(-ENOMEM);
 
+	lreq->is_watch = true;
 	lreq->wcb = wcb;
 	lreq->errcb = errcb;
 	lreq->data = data;
@@ -3395,6 +3467,116 @@ out_put_req:
 }
 EXPORT_SYMBOL(ceph_osdc_notify_ack);
 
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+				  u64 cookie, u32 prot_ver, u32 timeout,
+				  void *payload, size_t payload_len)
+{
+	struct ceph_osd_req_op *op;
+	struct ceph_pagelist *pl;
+	int ret;
+
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+	op->notify.cookie = cookie;
+
+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
+	if (!pl)
+		return -ENOMEM;
+
+	ceph_pagelist_init(pl);
+	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+	ret |= ceph_pagelist_encode_32(pl, timeout);
+	ret |= ceph_pagelist_encode_32(pl, payload_len);
+	ret |= ceph_pagelist_append(pl, payload, payload_len);
+	if (ret) {
+		ceph_pagelist_release(pl);
+		return -ENOMEM;
+	}
+
+	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+	op->indata_len = pl->length;
+	return 0;
+}
+
+/*
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
+ */
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+		     struct ceph_object_id *oid,
+		     struct ceph_object_locator *oloc,
+		     void *payload,
+		     size_t payload_len,
+		     u32 timeout,
+		     struct page ***preply_pages,
+		     size_t *preply_len)
+{
+	struct ceph_osd_linger_request *lreq;
+	struct page **pages;
+	int ret;
+
+	WARN_ON(!timeout);
+	if (preply_pages) {
+		*preply_pages = NULL;
+		*preply_len = 0;
+	}
+
+	lreq = linger_alloc(osdc);
+	if (!lreq)
+		return -ENOMEM;
+
+	lreq->preply_pages = preply_pages;
+	lreq->preply_len = preply_len;
+
+	ceph_oid_copy(&lreq->t.base_oid, oid);
+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+	lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+	lreq->reg_req = alloc_linger_request(lreq);
+	if (!lreq->reg_req) {
+		ret = -ENOMEM;
+		goto out_put_lreq;
+	}
+
+	/* for notify_id */
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages)) {
+		ret = PTR_ERR(pages);
+		goto out_put_lreq;
+	}
+
+	down_write(&osdc->lock);
+	linger_register(lreq); /* before osd_req_op_* */
+	ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+				     timeout, payload, payload_len);
+	if (ret) {
+		linger_unregister(lreq);
+		up_write(&osdc->lock);
+		ceph_release_page_vector(pages, 1);
+		goto out_put_lreq;
+	}
+	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+						 response_data),
+				 pages, PAGE_SIZE, 0, false, true);
+	linger_submit(lreq);
+	up_write(&osdc->lock);
+
+	ret = linger_reg_commit_wait(lreq);
+	if (!ret)
+		ret = linger_notify_finish_wait(lreq);
+	else
+		dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+	linger_cancel(lreq);
+out_put_lreq:
+	linger_put(lreq);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
 /*
  * Call all pending notify callbacks - for use after a watch is
  * unregistered, to make sure no more callbacks for it will be invoked
@@ -3693,19 +3875,51 @@ out_unlock_osdc:
 	return m;
 }
 
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+	struct ceph_msg *m;
+	int type = le16_to_cpu(hdr->type);
+	u32 front_len = le32_to_cpu(hdr->front_len);
+	u32 data_len = le32_to_cpu(hdr->data_len);
+
+	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+	if (!m)
+		return NULL;
+
+	if (data_len) {
+		struct page **pages;
+		struct ceph_osd_data osd_data;
+
+		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+					       GFP_NOIO);
+		if (!pages) {
+			ceph_msg_put(m);
+			return NULL;
+		}
+
+		ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+					 false);
+		ceph_osdc_msg_data_add(m, &osd_data);
+	}
+
+	return m;
+}
+
 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 				  struct ceph_msg_header *hdr,
 				  int *skip)
 {
 	struct ceph_osd *osd = con->private;
 	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
 
 	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 	case CEPH_MSG_WATCH_NOTIFY:
-		return ceph_msg_new(type, front, GFP_NOFS, false);
+		return alloc_msg_with_page_vector(hdr);
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:
-- 
cgit v1.2.3


From b07d3c4bd7270c74e2b6803af8ac8a00cb3e5ed2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:27 +0200
Subject: libceph: support for checking on status of watch

Implement ceph_osdc_watch_check() to be able to check on status of
watch.  Note that the time it takes for a watch/notify event to get
delivered through the notify_wq is taken into account.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  4 ++++
 net/ceph/osd_client.c           | 52 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 63054fae4f15..2ae7cfd82ec9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -213,6 +213,8 @@ struct ceph_osd_linger_request {
 	struct ceph_osd_request *reg_req;
 	struct ceph_osd_request *ping_req;
 	unsigned long ping_sent;
+	unsigned long watch_valid_thru;
+	struct list_head pending_lworks;
 
 	struct ceph_osd_request_target t;
 	u32 last_force_resend;
@@ -417,5 +419,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
 		     u32 timeout,
 		     struct page ***preply_pages,
 		     size_t *preply_len);
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+			  struct ceph_osd_linger_request *lreq);
 #endif
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e6e3ab4223db..5ac6dce74f07 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1746,6 +1746,7 @@ static void linger_release(struct kref *kref)
 	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
 	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
 	WARN_ON(!list_empty(&lreq->scan_item));
+	WARN_ON(!list_empty(&lreq->pending_lworks));
 	WARN_ON(lreq->osd);
 
 	if (lreq->reg_req)
@@ -1783,6 +1784,7 @@ linger_alloc(struct ceph_osd_client *osdc)
 	RB_CLEAR_NODE(&lreq->node);
 	RB_CLEAR_NODE(&lreq->osdc_node);
 	INIT_LIST_HEAD(&lreq->scan_item);
+	INIT_LIST_HEAD(&lreq->pending_lworks);
 	init_completion(&lreq->reg_commit_wait);
 	init_completion(&lreq->notify_finish_wait);
 
@@ -1890,6 +1892,8 @@ static void cancel_linger_request(struct ceph_osd_request *req)
 struct linger_work {
 	struct work_struct work;
 	struct ceph_osd_linger_request *lreq;
+	struct list_head pending_item;
+	unsigned long queued_stamp;
 
 	union {
 		struct {
@@ -1916,6 +1920,7 @@ static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
 		return NULL;
 
 	INIT_WORK(&lwork->work, workfn);
+	INIT_LIST_HEAD(&lwork->pending_item);
 	lwork->lreq = linger_get(lreq);
 
 	return lwork;
@@ -1925,6 +1930,10 @@ static void lwork_free(struct linger_work *lwork)
 {
 	struct ceph_osd_linger_request *lreq = lwork->lreq;
 
+	mutex_lock(&lreq->lock);
+	list_del(&lwork->pending_item);
+	mutex_unlock(&lreq->lock);
+
 	linger_put(lreq);
 	kfree(lwork);
 }
@@ -1935,6 +1944,10 @@ static void lwork_queue(struct linger_work *lwork)
 	struct ceph_osd_client *osdc = lreq->osdc;
 
 	verify_lreq_locked(lreq);
+	WARN_ON(!list_empty(&lwork->pending_item));
+
+	lwork->queued_stamp = jiffies;
+	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
 	queue_work(osdc->notify_wq, &lwork->work);
 }
 
@@ -2116,7 +2129,9 @@ static void linger_ping_cb(struct ceph_osd_request *req)
 	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
 	     lreq->last_error);
 	if (lreq->register_gen == req->r_ops[0].watch.gen) {
-		if (req->r_result && !lreq->last_error) {
+		if (!req->r_result) {
+			lreq->watch_valid_thru = lreq->ping_sent;
+		} else if (!lreq->last_error) {
 			lreq->last_error = normalize_watch_error(req->r_result);
 			queue_watch_error(lreq);
 		}
@@ -3316,6 +3331,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 	lreq->wcb = wcb;
 	lreq->errcb = errcb;
 	lreq->data = data;
+	lreq->watch_valid_thru = jiffies;
 
 	ceph_oid_copy(&lreq->t.base_oid, oid);
 	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
@@ -3577,6 +3593,40 @@ out_put_lreq:
 }
 EXPORT_SYMBOL(ceph_osdc_notify);
 
+/*
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error.  If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
+ */
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+			  struct ceph_osd_linger_request *lreq)
+{
+	unsigned long stamp, age;
+	int ret;
+
+	down_read(&osdc->lock);
+	mutex_lock(&lreq->lock);
+	stamp = lreq->watch_valid_thru;
+	if (!list_empty(&lreq->pending_lworks)) {
+		struct linger_work *lwork =
+		    list_first_entry(&lreq->pending_lworks,
+				     struct linger_work,
+				     pending_item);
+
+		if (time_before(lwork->queued_stamp, stamp))
+			stamp = lwork->queued_stamp;
+	}
+	age = jiffies - stamp;
+	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+	     lreq, lreq->linger_id, age, lreq->last_error);
+	/* we are truncating to msecs, so return a safe upper bound */
+	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+	mutex_unlock(&lreq->lock);
+	up_read(&osdc->lock);
+	return ret;
+}
+
 /*
  * Call all pending notify callbacks - for use after a watch is
  * unregistered, to make sure no more callbacks for it will be invoked
-- 
cgit v1.2.3


From d0b19705e99939f5ae5aa9b57bfe41dd4777d951 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:27 +0200
Subject: libceph: async MON client generic requests

For map check, we are going to need to send CEPH_MSG_MON_GET_VERSION
messages asynchronously and get a callback on completion.  Refactor MON
client to allow firing off generic requests asynchronously and add an
async variant of ceph_monc_get_version().  ceph_monc_do_statfs() is
switched over and remains sync.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |   4 +-
 include/linux/ceph/mon_client.h |  19 ++-
 net/ceph/mon_client.c           | 316 ++++++++++++++++++++++++++--------------
 3 files changed, 228 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d0834c477f96..8eae6f56194d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4896,8 +4896,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
 	if (ret == -ENOENT && tries++ < 1) {
-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-					       &newest_epoch);
+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+					    &newest_epoch);
 		if (ret < 0)
 			return ret;
 
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index c14e9d861cda..19800d9b45f3 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
 	ceph_monc_request_func_t do_request;
 };
 
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
 /*
  * ceph_mon_generic_request is being used for the statfs and
  * mon_get_version requests which are being done a bit differently
  * because we need to get data back to the caller
  */
 struct ceph_mon_generic_request {
+	struct ceph_mon_client *monc;
 	struct kref kref;
 	u64 tid;
 	struct rb_node node;
 	int result;
-	void *buf;
+
 	struct completion completion;
+	ceph_monc_callback_t complete_cb;
+	u64 private_data;          /* r_tid/linger_id */
+
 	struct ceph_msg *request;  /* original request */
 	struct ceph_msg *reply;    /* and reply */
+
+	union {
+		struct ceph_statfs *st;
+		u64 newest;
+	} u;
 };
 
 struct ceph_mon_client {
@@ -124,8 +135,10 @@ extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
 			       struct ceph_statfs *buf);
 
-extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
-				    const char *what, u64 *newest);
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+			  u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+				ceph_monc_callback_t cb, u64 private_data);
 
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 98bfbe1f6807..4e49b2296920 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -493,6 +493,10 @@ static void release_generic_request(struct kref *kref)
 	struct ceph_mon_generic_request *req =
 		container_of(kref, struct ceph_mon_generic_request, kref);
 
+	dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+	     req->reply);
+	WARN_ON(!RB_EMPTY_NODE(&req->node));
+
 	if (req->reply)
 		ceph_msg_put(req->reply);
 	if (req->request)
@@ -503,7 +507,8 @@ static void release_generic_request(struct kref *kref)
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
 {
-	kref_put(&req->kref, release_generic_request);
+	if (req)
+		kref_put(&req->kref, release_generic_request);
 }
 
 static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -511,6 +516,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
 	kref_get(&req->kref);
 }
 
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+	struct ceph_mon_generic_request *req;
+
+	req = kzalloc(sizeof(*req), gfp);
+	if (!req)
+		return NULL;
+
+	req->monc = monc;
+	kref_init(&req->kref);
+	RB_CLEAR_NODE(&req->node);
+	init_completion(&req->completion);
+
+	dout("%s greq %p\n", __func__, req);
+	return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+
+	WARN_ON(req->tid);
+
+	get_generic_request(req);
+	req->tid = ++monc->last_tid;
+	insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+				 struct ceph_mon_generic_request *req)
+{
+	WARN_ON(!req->tid);
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	erase_generic_request(&monc->generic_request_tree, req);
+
+	ceph_msg_revoke(req->request);
+	ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+	__finish_generic_request(req);
+	put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+	if (req->complete_cb)
+		req->complete_cb(req);
+	else
+		complete_all(&req->completion);
+	put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+	struct ceph_mon_generic_request *lookup_req;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+	mutex_lock(&monc->mutex);
+	lookup_req = lookup_generic_request(&monc->generic_request_tree,
+					    req->tid);
+	if (lookup_req) {
+		WARN_ON(lookup_req != req);
+		finish_generic_request(req);
+	}
+
+	mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+	int ret;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	ret = wait_for_completion_interruptible(&req->completion);
+	if (ret)
+		cancel_generic_request(req);
+	else
+		ret = req->result; /* completed */
+
+	return ret;
+}
+
 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 					 struct ceph_msg_header *hdr,
 					 int *skip)
@@ -540,40 +642,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 	return m;
 }
 
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
-				struct ceph_mon_generic_request *req)
-{
-	int err;
-
-	/* register request */
-	req->tid = tid != 0 ? tid : ++monc->last_tid;
-	req->request->hdr.tid = cpu_to_le64(req->tid);
-	insert_generic_request(&monc->generic_request_tree, req);
-	ceph_con_send(&monc->con, ceph_msg_get(req->request));
-	mutex_unlock(&monc->mutex);
-
-	err = wait_for_completion_interruptible(&req->completion);
-
-	mutex_lock(&monc->mutex);
-	erase_generic_request(&monc->generic_request_tree, req);
-
-	if (!err)
-		err = req->result;
-	return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-			      struct ceph_mon_generic_request *req)
-{
-	int err;
-
-	mutex_lock(&monc->mutex);
-	err = __do_generic_request(monc, 0, req);
-	mutex_unlock(&monc->mutex);
-
-	return err;
-}
-
 /*
  * statfs
  */
@@ -584,22 +652,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
 	if (msg->front.iov_len != sizeof(*reply))
 		goto bad;
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
 	req = lookup_generic_request(&monc->generic_request_tree, tid);
-	if (req) {
-		*(struct ceph_statfs *)req->buf = reply->st;
-		req->result = 0;
-		get_generic_request(req);
+	if (!req) {
+		mutex_unlock(&monc->mutex);
+		return;
 	}
+
+	req->result = 0;
+	*req->u.st = reply->st; /* struct */
+	__finish_generic_request(req);
 	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
+
+	complete_generic_request(req);
 	return;
 
 bad:
@@ -614,39 +684,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
 	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs *h;
-	int err;
+	int ret = -ENOMEM;
 
-	req = kzalloc(sizeof(*req), GFP_NOFS);
+	req = alloc_generic_request(monc, GFP_NOFS);
 	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	RB_CLEAR_NODE(&req->node);
-	req->buf = buf;
-	init_completion(&req->completion);
+		goto out;
 
-	err = -ENOMEM;
 	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
 				    true);
 	if (!req->request)
 		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
-				  true);
+
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
 	if (!req->reply)
 		goto out;
 
+	req->u.st = buf;
+
+	mutex_lock(&monc->mutex);
+	register_generic_request(req);
 	/* fill out request */
 	h = req->request->front.iov_base;
 	h->monhdr.have_version = 0;
 	h->monhdr.session_mon = cpu_to_le16(-1);
 	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
+	send_generic_request(monc, req);
+	mutex_unlock(&monc->mutex);
 
-	err = do_generic_request(monc, req);
-
+	ret = wait_generic_request(req);
 out:
 	put_generic_request(req);
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(ceph_monc_do_statfs);
 
@@ -659,7 +728,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 	void *end = p + msg->front_alloc_len;
 	u64 handle;
 
-	dout("%s %p tid %llu\n", __func__, msg, tid);
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
 	ceph_decode_need(&p, end, 2*sizeof(u64), bad);
 	handle = ceph_decode_64(&p);
@@ -668,77 +737,110 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 
 	mutex_lock(&monc->mutex);
 	req = lookup_generic_request(&monc->generic_request_tree, handle);
-	if (req) {
-		*(u64 *)req->buf = ceph_decode_64(&p);
-		req->result = 0;
-		get_generic_request(req);
+	if (!req) {
+		mutex_unlock(&monc->mutex);
+		return;
 	}
+
+	req->result = 0;
+	req->u.newest = ceph_decode_64(&p);
+	__finish_generic_request(req);
 	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
 
+	complete_generic_request(req);
 	return;
+
 bad:
 	pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
 	ceph_msg_dump(msg);
 }
 
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
-			     u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+			ceph_monc_callback_t cb, u64 private_data)
 {
 	struct ceph_mon_generic_request *req;
-	void *p, *end;
-	u64 tid;
-	int err;
 
-	req = kzalloc(sizeof(*req), GFP_NOFS);
+	req = alloc_generic_request(monc, GFP_NOIO);
 	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	RB_CLEAR_NODE(&req->node);
-	req->buf = newest;
-	init_completion(&req->completion);
+		goto err_put_req;
 
 	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
 				    sizeof(u64) + sizeof(u32) + strlen(what),
-				    GFP_NOFS, true);
-	if (!req->request) {
-		err = -ENOMEM;
-		goto out;
-	}
+				    GFP_NOIO, true);
+	if (!req->request)
+		goto err_put_req;
 
-	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
-				  GFP_NOFS, true);
-	if (!req->reply) {
-		err = -ENOMEM;
-		goto out;
-	}
+	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+				  true);
+	if (!req->reply)
+		goto err_put_req;
 
-	p = req->request->front.iov_base;
-	end = p + req->request->front_alloc_len;
+	req->complete_cb = cb;
+	req->private_data = private_data;
 
-	/* fill out request */
 	mutex_lock(&monc->mutex);
-	tid = ++monc->last_tid;
-	ceph_encode_64(&p, tid); /* handle */
-	ceph_encode_string(&p, end, what, strlen(what));
+	register_generic_request(req);
+	{
+		void *p = req->request->front.iov_base;
+		void *const end = p + req->request->front_alloc_len;
+
+		ceph_encode_64(&p, req->tid); /* handle */
+		ceph_encode_string(&p, end, what, strlen(what));
+		WARN_ON(p != end);
+	}
+	send_generic_request(monc, req);
+	mutex_unlock(&monc->mutex);
 
-	err = __do_generic_request(monc, tid, req);
+	return req;
 
-	mutex_unlock(&monc->mutex);
-out:
+err_put_req:
 	put_generic_request(req);
-	return err;
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+			  u64 *newest)
+{
+	struct ceph_mon_generic_request *req;
+	int ret;
+
+	req = __ceph_monc_get_version(monc, what, NULL, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	ret = wait_generic_request(req);
+	if (!ret)
+		*newest = req->u.newest;
+
+	put_generic_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+				ceph_monc_callback_t cb, u64 private_data)
+{
+	struct ceph_mon_generic_request *req;
+
+	req = __ceph_monc_get_version(monc, what, cb, private_data);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	put_generic_request(req);
+	return 0;
 }
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version_async);
 
 /*
  * Resend pending generic requests.
@@ -923,6 +1025,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	ceph_auth_destroy(monc->auth);
 
+	WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
 	ceph_msg_put(monc->m_auth);
 	ceph_msg_put(monc->m_auth_reply);
 	ceph_msg_put(monc->m_subscribe);
-- 
cgit v1.2.3


From 4609245e2670e3698b083bcd9cc69a65b2b6f9a6 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:27 +0200
Subject: libceph: pool deletion detection

This adds the "map check" infrastructure for sending osdmap version
checks on CALC_TARGET_POOL_DNE and completing in-flight requests with
-ENOENT if the target pool doesn't exist or has just been deleted.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |   6 +
 net/ceph/osd_client.c           | 248 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 248 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2ae7cfd82ec9..3e7bf72e4078 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -151,6 +151,7 @@ struct ceph_osd_request_target {
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
 	struct rb_node  r_node;
+	struct rb_node  r_mc_node;          /* map check */
 	struct ceph_osd *r_osd;
 
 	struct ceph_osd_request_target r_t;
@@ -191,6 +192,7 @@ struct ceph_osd_request {
 	int r_attempts;
 	struct ceph_eversion r_replay_version; /* aka reassert_version */
 	u32 r_last_force_resend;
+	u32 r_map_dne_bound;
 
 	struct ceph_osd_req_op r_ops[];
 };
@@ -218,6 +220,7 @@ struct ceph_osd_linger_request {
 
 	struct ceph_osd_request_target t;
 	u32 last_force_resend;
+	u32 map_dne_bound;
 
 	struct timespec mtime;
 
@@ -225,6 +228,7 @@ struct ceph_osd_linger_request {
 	struct mutex lock;
 	struct rb_node node;            /* osd */
 	struct rb_node osdc_node;       /* osdc */
+	struct rb_node mc_node;         /* map check */
 	struct list_head scan_item;
 
 	struct completion reg_commit_wait;
@@ -257,6 +261,8 @@ struct ceph_osd_client {
 	atomic64_t             last_tid;      /* tid of last request */
 	u64                    last_linger_id;
 	struct rb_root         linger_requests; /* lingering requests */
+	struct rb_root         map_checks;
+	struct rb_root         linger_map_checks;
 	atomic_t               num_requests;
 	atomic_t               num_homeless;
 	struct delayed_work    timeout_work;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5ac6dce74f07..ece2d10a1208 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -396,6 +396,7 @@ static void target_destroy(struct ceph_osd_request_target *t)
 static void request_release_checks(struct ceph_osd_request *req)
 {
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
 	WARN_ON(!list_empty(&req->r_unsafe_item));
 	WARN_ON(req->r_osd);
 }
@@ -456,6 +457,7 @@ static void request_init(struct ceph_osd_request *req)
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
 	RB_CLEAR_NODE(&req->r_node);
+	RB_CLEAR_NODE(&req->r_mc_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 
 	target_init(&req->r_t);
@@ -969,6 +971,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request);
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
 DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
 
 static bool osd_homeless(struct ceph_osd *osd)
 {
@@ -1601,10 +1604,13 @@ static void maybe_request_map(struct ceph_osd_client *osdc)
 		ceph_monc_renew_subs(&osdc->client->monc);
 }
 
+static void send_map_check(struct ceph_osd_request *req);
+
 static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
 	struct ceph_osd *osd;
+	enum calc_target_result ct_res;
 	bool need_send = false;
 	bool promoted = false;
 
@@ -1612,7 +1618,10 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
-	calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+	ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+		goto promote;
+
 	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
 	if (IS_ERR(osd)) {
 		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
@@ -1656,6 +1665,9 @@ again:
 		send_request(req);
 	mutex_unlock(&osd->lock);
 
+	if (ct_res == CALC_TARGET_POOL_DNE)
+		send_map_check(req);
+
 	if (promoted)
 		downgrade_write(&osdc->lock);
 	return;
@@ -1699,6 +1711,7 @@ static void __finish_request(struct ceph_osd_request *req)
 	verify_osd_locked(osd);
 	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
 
+	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
 	unlink_request(osd, req);
 	atomic_dec(&osdc->num_requests);
 
@@ -1726,13 +1739,127 @@ static void __complete_request(struct ceph_osd_request *req)
 		complete_all(&req->r_completion);
 }
 
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+	req->r_result = err;
+	__finish_request(req);
+	__complete_request(req);
+	complete_all(&req->r_safe_completion);
+	ceph_osdc_put_request(req);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd_request *lookup_req;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+	if (!lookup_req)
+		return;
+
+	WARN_ON(lookup_req != req);
+	erase_request_mc(&osdc->map_checks, req);
+	ceph_osdc_put_request(req);
+}
+
 static void cancel_request(struct ceph_osd_request *req)
 {
 	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
 
+	cancel_map_check(req);
 	finish_request(req);
 }
 
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osdmap *map = osdc->osdmap;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(!map->epoch);
+
+	if (req->r_attempts) {
+		/*
+		 * We sent a request earlier, which means that
+		 * previously the pool existed, and now it does not
+		 * (i.e., it was deleted).
+		 */
+		req->r_map_dne_bound = map->epoch;
+		dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+		     req->r_tid);
+	} else {
+		dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+		     req, req->r_tid, req->r_map_dne_bound, map->epoch);
+	}
+
+	if (req->r_map_dne_bound) {
+		if (map->epoch >= req->r_map_dne_bound) {
+			/* we had a new enough map */
+			pr_info_ratelimited("tid %llu pool does not exist\n",
+					    req->r_tid);
+			complete_request(req, -ENOENT);
+		}
+	} else {
+		send_map_check(req);
+	}
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+	struct ceph_osd_request *req;
+	u64 tid = greq->private_data;
+
+	WARN_ON(greq->result || !greq->u.newest);
+
+	down_write(&osdc->lock);
+	req = lookup_request_mc(&osdc->map_checks, tid);
+	if (!req) {
+		dout("%s tid %llu dne\n", __func__, tid);
+		goto out_unlock;
+	}
+
+	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+	     req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+	if (!req->r_map_dne_bound)
+		req->r_map_dne_bound = greq->u.newest;
+	erase_request_mc(&osdc->map_checks, req);
+	check_pool_dne(req);
+
+	ceph_osdc_put_request(req);
+out_unlock:
+	up_write(&osdc->lock);
+}
+
+static void send_map_check(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd_request *lookup_req;
+	int ret;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+	if (lookup_req) {
+		WARN_ON(lookup_req != req);
+		return;
+	}
+
+	ceph_osdc_get_request(req);
+	insert_request_mc(&osdc->map_checks, req);
+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+					  map_check_cb, req->r_tid);
+	WARN_ON(ret);
+}
+
 /*
  * lingering requests, watch/notify v2 infrastructure
  */
@@ -1745,6 +1872,7 @@ static void linger_release(struct kref *kref)
 	     lreq->reg_req, lreq->ping_req);
 	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
 	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
 	WARN_ON(!list_empty(&lreq->scan_item));
 	WARN_ON(!list_empty(&lreq->pending_lworks));
 	WARN_ON(lreq->osd);
@@ -1783,6 +1911,7 @@ linger_alloc(struct ceph_osd_client *osdc)
 	mutex_init(&lreq->lock);
 	RB_CLEAR_NODE(&lreq->node);
 	RB_CLEAR_NODE(&lreq->osdc_node);
+	RB_CLEAR_NODE(&lreq->mc_node);
 	INIT_LIST_HEAD(&lreq->scan_item);
 	INIT_LIST_HEAD(&lreq->pending_lworks);
 	init_completion(&lreq->reg_commit_wait);
@@ -1797,6 +1926,7 @@ linger_alloc(struct ceph_osd_client *osdc)
 
 DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
 DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
 
 /*
  * Create linger request <-> OSD session relation.
@@ -2193,6 +2323,23 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
 	send_linger(lreq);
 }
 
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_linger_request *lookup_lreq;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+				       lreq->linger_id);
+	if (!lookup_lreq)
+		return;
+
+	WARN_ON(lookup_lreq != lreq);
+	erase_linger_mc(&osdc->linger_map_checks, lreq);
+	linger_put(lreq);
+}
+
 /*
  * @lreq has to be both registered and linked.
  */
@@ -2202,6 +2349,7 @@ static void __linger_cancel(struct ceph_osd_linger_request *lreq)
 		cancel_linger_request(lreq->ping_req);
 	if (lreq->reg_req->r_osd)
 		cancel_linger_request(lreq->reg_req);
+	cancel_linger_map_check(lreq);
 	unlink_linger(lreq->osd, lreq);
 	linger_unregister(lreq);
 }
@@ -2216,6 +2364,89 @@ static void linger_cancel(struct ceph_osd_linger_request *lreq)
 	up_write(&osdc->lock);
 }
 
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osdmap *map = osdc->osdmap;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(!map->epoch);
+
+	if (lreq->register_gen) {
+		lreq->map_dne_bound = map->epoch;
+		dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+		     lreq, lreq->linger_id);
+	} else {
+		dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+		     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+		     map->epoch);
+	}
+
+	if (lreq->map_dne_bound) {
+		if (map->epoch >= lreq->map_dne_bound) {
+			/* we had a new enough map */
+			pr_info("linger_id %llu pool does not exist\n",
+				lreq->linger_id);
+			linger_reg_commit_complete(lreq, -ENOENT);
+			__linger_cancel(lreq);
+		}
+	} else {
+		send_linger_map_check(lreq);
+	}
+}
+
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+	struct ceph_osd_linger_request *lreq;
+	u64 linger_id = greq->private_data;
+
+	WARN_ON(greq->result || !greq->u.newest);
+
+	down_write(&osdc->lock);
+	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+	if (!lreq) {
+		dout("%s linger_id %llu dne\n", __func__, linger_id);
+		goto out_unlock;
+	}
+
+	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+	     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+	     greq->u.newest);
+	if (!lreq->map_dne_bound)
+		lreq->map_dne_bound = greq->u.newest;
+	erase_linger_mc(&osdc->linger_map_checks, lreq);
+	check_linger_pool_dne(lreq);
+
+	linger_put(lreq);
+out_unlock:
+	up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_linger_request *lookup_lreq;
+	int ret;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+				       lreq->linger_id);
+	if (lookup_lreq) {
+		WARN_ON(lookup_lreq != lreq);
+		return;
+	}
+
+	linger_get(lreq);
+	insert_linger_mc(&osdc->linger_map_checks, lreq);
+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+					  linger_map_check_cb, lreq->linger_id);
+	WARN_ON(ret);
+}
+
 static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
 {
 	int ret;
@@ -2677,10 +2908,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 	return;
 
 fail_request:
-	req->r_result = -EIO;
-	__finish_request(req);
-	__complete_request(req);
-	complete_all(&req->r_safe_completion);
+	complete_request(req, -EIO);
 out_unlock_session:
 	mutex_unlock(&osd->lock);
 out_unlock_osdc:
@@ -2764,6 +2992,7 @@ static void scan_requests(struct ceph_osd *osd,
 
 			/* fall through */
 		case CALC_TARGET_NEED_RESEND:
+			cancel_linger_map_check(lreq);
 			/*
 			 * scan_requests() for the previous epoch(s)
 			 * may have already added it to the list, since
@@ -2773,6 +3002,7 @@ static void scan_requests(struct ceph_osd *osd,
 				list_add_tail(&lreq->scan_item, need_resend_linger);
 			break;
 		case CALC_TARGET_POOL_DNE:
+			check_linger_pool_dne(lreq);
 			break;
 		}
 	}
@@ -2782,7 +3012,7 @@ static void scan_requests(struct ceph_osd *osd,
 		    rb_entry(n, struct ceph_osd_request, r_node);
 		enum calc_target_result ct_res;
 
-		n = rb_next(n); /* unlink_request() */
+		n = rb_next(n); /* unlink_request(), check_pool_dne() */
 
 		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
 		ct_res = calc_target(osdc, &req->r_t,
@@ -2799,10 +3029,12 @@ static void scan_requests(struct ceph_osd *osd,
 
 			/* fall through */
 		case CALC_TARGET_NEED_RESEND:
+			cancel_map_check(req);
 			unlink_request(osd, req);
 			insert_request(need_resend, req);
 			break;
 		case CALC_TARGET_POOL_DNE:
+			check_pool_dne(req);
 			break;
 		}
 	}
@@ -3655,6 +3887,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->homeless_osd.o_osdc = osdc;
 	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
 	osdc->linger_requests = RB_ROOT;
+	osdc->map_checks = RB_ROOT;
+	osdc->linger_map_checks = RB_ROOT;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
 
@@ -3720,6 +3954,8 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 
 	WARN_ON(!list_empty(&osdc->osd_lru));
 	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
-- 
cgit v1.2.3


From 7cca78c9dcd1afa243e46edc31896730df85d2b5 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 28 Apr 2016 16:07:28 +0200
Subject: libceph: replace ceph_monc_request_next_osdmap()

... with a wrapper around maybe_request_map() - no need for two
osdmap-specific functions.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  2 +-
 include/linux/ceph/mon_client.h |  1 -
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/mon_client.c           | 14 --------------
 net/ceph/osd_client.c           |  7 +++++++
 5 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8eae6f56194d..81666a56415e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4902,7 +4902,7 @@ again:
 			return ret;
 
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
 						     newest_epoch,
 						     opts->mount_timeout);
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 19800d9b45f3..1d730993c3f8 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -128,7 +128,6 @@ bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
 void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
 void ceph_monc_renew_subs(struct ceph_mon_client *monc);
 
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 				 unsigned long timeout);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 3e7bf72e4078..19b14862d3e0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -381,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 4e49b2296920..72a910bf7819 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -384,20 +384,6 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_renew_subs);
 
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-	dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
-	mutex_lock(&monc->mutex);
-	if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
-				 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
-		__send_subscribe(monc);
-	mutex_unlock(&monc->mutex);
-}
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
-
 /*
  * Wait for an osdmap with a given epoch.
  *
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ece2d10a1208..55cafd3a2ff0 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3869,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 }
 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
 
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+	down_read(&osdc->lock);
+	maybe_request_map(osdc);
+	up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
 
 /*
  * init, shutdown
-- 
cgit v1.2.3


From 737cc81ead34bcef0b1f6ea8322228e4378cf21a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 26 May 2016 00:05:01 +0200
Subject: libceph: support for subscribing to "mdsmap.<id>" maps

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h    |  2 ++
 include/linux/ceph/mon_client.h |  1 +
 net/ceph/debugfs.c              |  1 +
 net/ceph/mon_client.c           | 18 +++++++++++++-----
 4 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 3b911ff889dd..bae833d0d055 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -208,6 +208,8 @@ struct ceph_mon_subscribe_ack {
 	struct ceph_fsid fsid;
 } __attribute__ ((packed));
 
+#define CEPH_FS_CLUSTER_ID_NONE  -1
+
 /*
  * mdsmap flags
  */
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 1d730993c3f8..e2a92df08b47 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -96,6 +96,7 @@ struct ceph_mon_client {
 		bool want;
 		u32 have; /* epoch */
 	} subs[3];
+	int fs_cluster_id; /* "mdsmap.<id>" sub */
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_file;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 6e434c75cd08..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -128,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
 					CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
 		seq_putc(s, '\n');
 	}
+	seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
 
 	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
 		__u16 op;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 72a910bf7819..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 	BUG_ON(num < 1); /* monmap sub is always there */
 	ceph_encode_32(&p, num);
 	for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
-		const char *s = ceph_sub_str[i];
+		char buf[32];
+		int len;
 
 		if (!monc->subs[i].want)
 			continue;
 
-		dout("%s %s start %llu flags 0x%x\n", __func__, s,
+		len = sprintf(buf, "%s", ceph_sub_str[i]);
+		if (i == CEPH_SUB_MDSMAP &&
+		    monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+			len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+		dout("%s %s start %llu flags 0x%x\n", __func__, buf,
 		     le64_to_cpu(monc->subs[i].item.start),
 		     monc->subs[i].item.flags);
-		ceph_encode_string(&p, end, s, strlen(s));
+		ceph_encode_string(&p, end, buf, len);
 		memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
 		p += sizeof(monc->subs[i].item);
 	}
 
-	BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+	BUG_ON(p > end);
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 	ceph_msg_revoke(msg);
@@ -948,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	if (!monc->m_subscribe_ack)
 		goto out_auth;
 
-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
 					 true);
 	if (!monc->m_subscribe)
 		goto out_subscribe_ack;
@@ -974,6 +980,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->generic_request_tree = RB_ROOT;
 	monc->last_tid = 0;
 
+	monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
 	return 0;
 
 out_auth_reply:
-- 
cgit v1.2.3


From 956d39d631dbcf7b57854873a24e309047f2a7f5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 27 Apr 2016 17:48:30 +0800
Subject: ceph: define 'end/complete' in readdir reply as bit flags

Set a flag in readdir request, which indicates that client interprets
'end/complete' as bit flags. So that mds can reply additional flags in
readdir reply.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/dir.c                |  2 ++
 fs/ceph/mds_client.c         |  7 +++++--
 fs/ceph/mds_client.h         |  2 +-
 include/linux/ceph/ceph_fs.h | 12 ++++++++++++
 4 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 68530acea2c8..ebcbd1c946b4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -365,6 +365,8 @@ more:
 		req->r_readdir_cache_idx = fi->readdir_cache_idx;
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
+		req->r_args.readdir.flags =
+				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
 
 		req->r_inode = inode;
 		ihold(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6220d3caf7ab..1c2befcd24fb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,8 +181,11 @@ static int parse_reply_info_dir(void **p, void *end,
 
 	ceph_decode_need(p, end, sizeof(num) + 2, bad);
 	num = ceph_decode_32(p);
-	info->dir_end = ceph_decode_8(p);
-	info->dir_complete = ceph_decode_8(p);
+	{
+		u16 flags = ceph_decode_16(p);
+		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+	}
 	if (num == 0)
 		goto done;
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0b84f9c0afa3..2a865812a41b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -80,7 +80,7 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_dirfrag *dir_dir;
 			size_t			      dir_buf_size;
 			int                           dir_nr;
-			u8                            dir_complete, dir_end;
+			bool			      dir_complete, dir_end;
 			struct ceph_mds_reply_dir_entry  *dir_entries;
 		};
 
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index bae833d0d055..a811c5e98bfa 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -347,6 +347,17 @@ extern const char *ceph_mds_op_name(int op);
 #define CEPH_XATTR_REPLACE (1 << 1)
 #define CEPH_XATTR_REMOVE  (1 << 31)
 
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS	(1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END		(1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE	(1<<8)
+
 union ceph_mds_request_args {
 	struct {
 		__le32 mask;                 /* CEPH_CAP_* */
@@ -364,6 +375,7 @@ union ceph_mds_request_args {
 		__le32 frag;                 /* which dir fragment */
 		__le32 max_entries;          /* how many dentries to grab */
 		__le32 max_bytes;
+		__le16 flags;
 	} __attribute__ ((packed)) readdir;
 	struct {
 		__le32 mode;
-- 
cgit v1.2.3


From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 29 Apr 2016 11:27:30 +0800
Subject: ceph: using hash value to compose dentry offset

If MDS sorts dentries in dirfrag in hash order, we use hash value to
compose dentry offset. dentry offset is:

  (0xff << 52) | ((24 bits hash) << 28) |
  (the nth entry hash hash collision)

This offset is stable across directory fragmentation. This alos means
there is no need to reset readdir offset if directory get fragmented
in the middle of readdir.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/dir.c                | 140 ++++++++++++++++++++++++++++++++-----------
 fs/ceph/inode.c              |  31 ++++++++--
 fs/ceph/mds_client.c         |   1 +
 fs/ceph/mds_client.h         |   4 +-
 fs/ceph/super.h              |   6 +-
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 136 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e954ea2fb710..4850c3624a87 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -69,16 +69,42 @@ out_unlock:
 }
 
 /*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * for f_pos for readdir:
+ * - hash order:
+ *	(0xff << 52) | ((24 bits hash) << 28) |
+ *	(the nth entry has hash collision);
+ * - frag+name order;
+ *	((frag value) << 28) | (the nth entry in frag);
  */
+#define OFFSET_BITS	28
+#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+	loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+	if (hash_order)
+		fpos |= HASH_ORDER;
+	return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+	return (p & HASH_ORDER) == HASH_ORDER;
+}
+
 static unsigned fpos_frag(loff_t p)
 {
-	return p >> 32;
+	return p >> OFFSET_BITS;
 }
+
+static unsigned fpos_hash(loff_t p)
+{
+	return ceph_frag_value(fpos_frag(p));
+}
+
 static unsigned fpos_off(loff_t p)
 {
-	return p & 0xffffffff;
+	return p & OFFSET_MASK;
 }
 
 static int fpos_cmp(loff_t l, loff_t r)
@@ -177,7 +203,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 	u64 idx = 0;
 	int err = 0;
 
-	dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
+	dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
 
 	/* search start position */
 	if (ctx->pos > 2) {
@@ -234,7 +260,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 		spin_unlock(&dentry->d_lock);
 
 		if (emit_dentry) {
-			dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+			dout(" %llx dentry %p %pd %p\n", di->offset,
 			     dentry, dentry, d_inode(dentry));
 			ctx->pos = di->offset;
 			if (!dir_emit(ctx, dentry->d_name.name,
@@ -269,6 +295,16 @@ out:
 	return err;
 }
 
+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+{
+	if (!fi->last_readdir)
+		return true;
+	if (is_hash_order(pos))
+		return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+	else
+		return fi->frag != fpos_frag(pos);
+}
+
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct ceph_file_info *fi = file->private_data;
@@ -276,7 +312,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	unsigned frag = fpos_frag(ctx->pos);
 	int i;
 	int err;
 	u32 ftype;
@@ -317,7 +352,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		err = __dcache_readdir(file, ctx, shared_gen);
 		if (err != -EAGAIN)
 			return err;
-		frag = fpos_frag(ctx->pos);
 	} else {
 		spin_unlock(&ci->i_ceph_lock);
 	}
@@ -325,8 +359,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	/* proceed with a normal readdir */
 more:
 	/* do we have the correct frag content buffered? */
-	if (fi->frag != frag || fi->last_readdir == NULL) {
+	if (need_send_readdir(fi, ctx->pos)) {
 		struct ceph_mds_request *req;
+		unsigned frag;
 		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
@@ -336,6 +371,13 @@ more:
 			fi->last_readdir = NULL;
 		}
 
+		if (is_hash_order(ctx->pos)) {
+			frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+						NULL, NULL);
+		} else {
+			frag = fpos_frag(ctx->pos);
+		}
+
 		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
 		     ceph_vinop(inode), frag, fi->last_name);
 		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -373,19 +415,23 @@ more:
 			ceph_mdsc_put_request(req);
 			return err;
 		}
-		dout("readdir got and parsed readdir result=%d"
-		     " on frag %x, end=%d, complete=%d\n", err, frag,
+		dout("readdir got and parsed readdir result=%d on "
+		     "frag %x, end=%d, complete=%d, hash_order=%d\n",
+		     err, frag,
 		     (int)req->r_reply_info.dir_end,
-		     (int)req->r_reply_info.dir_complete);
-
+		     (int)req->r_reply_info.dir_complete,
+		     (int)req->r_reply_info.hash_order);
 
-		/* note next offset and last dentry name */
 		rinfo = &req->r_reply_info;
 		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
 			frag = le32_to_cpu(rinfo->dir_dir->frag);
-			fi->next_offset = req->r_readdir_offset;
-			/* adjust ctx->pos to beginning of frag */
-			ctx->pos = ceph_make_fpos(frag, fi->next_offset);
+			if (!rinfo->hash_order) {
+				fi->next_offset = req->r_readdir_offset;
+				/* adjust ctx->pos to beginning of frag */
+				ctx->pos = ceph_make_fpos(frag,
+							  fi->next_offset,
+							  false);
+			}
 		}
 
 		fi->frag = frag;
@@ -411,23 +457,25 @@ more:
 			fi->dir_release_count = 0;
 		}
 
-		if (req->r_reply_info.dir_end) {
-			kfree(fi->last_name);
-			fi->last_name = NULL;
-			fi->next_offset = 2;
-		} else {
+		/* note next offset and last dentry name */
+		if (rinfo->dir_nr > 0) {
 			struct ceph_mds_reply_dir_entry *rde =
 					rinfo->dir_entries + (rinfo->dir_nr-1);
+			unsigned next_offset = req->r_reply_info.dir_end ?
+					2 : (fpos_off(rde->offset) + 1);
 			err = note_last_dentry(fi, rde->name, rde->name_len,
-					       fpos_off(rde->offset) + 1);
+					       next_offset);
 			if (err)
 				return err;
+		} else if (req->r_reply_info.dir_end) {
+			fi->next_offset = 2;
+			/* keep last name */
 		}
 	}
 
 	rinfo = &fi->last_readdir->r_reply_info;
 	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
-	     frag, rinfo->dir_nr, ctx->pos,
+	     fi->frag, rinfo->dir_nr, ctx->pos,
 	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
 
 	i = 0;
@@ -470,16 +518,26 @@ more:
 		ctx->pos++;
 	}
 
-	if (fi->last_name) {
+	if (fi->next_offset > 2) {
 		ceph_mdsc_put_request(fi->last_readdir);
 		fi->last_readdir = NULL;
 		goto more;
 	}
 
 	/* more frags? */
-	if (!ceph_frag_is_rightmost(frag)) {
-		frag = ceph_frag_next(frag);
-		ctx->pos = ceph_make_fpos(frag, 2);
+	if (!ceph_frag_is_rightmost(fi->frag)) {
+		unsigned frag = ceph_frag_next(fi->frag);
+		if (is_hash_order(ctx->pos)) {
+			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+							fi->next_offset, true);
+			if (new_pos > ctx->pos)
+				ctx->pos = new_pos;
+			/* keep last_name */
+		} else {
+			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
+			kfree(fi->last_name);
+			fi->last_name = NULL;
+		}
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
@@ -532,14 +590,21 @@ static void reset_readdir(struct ceph_file_info *fi)
 static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
 {
 	struct ceph_mds_reply_info_parsed *rinfo;
+	loff_t chunk_offset;
 	if (new_pos == 0)
 		return true;
-	if (fpos_frag(new_pos) != fi->frag)
+	if (is_hash_order(new_pos)) {
+		/* no need to reset last_name for a forward seek when
+		 * dentries are sotred in hash order */
+	} else if (fi->frag |= fpos_frag(new_pos)) {
 		return true;
+	}
 	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
 	if (!rinfo || !rinfo->dir_nr)
 		return true;
-	return new_pos < rinfo->dir_entries[0].offset;;
+	chunk_offset = rinfo->dir_entries[0].offset;
+	return new_pos < chunk_offset ||
+	       is_hash_order(new_pos) != is_hash_order(chunk_offset);
 }
 
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
@@ -562,17 +627,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 	}
 
 	if (offset >= 0) {
+		if (need_reset_readdir(fi, offset)) {
+			dout("dir_llseek dropping %p content\n", file);
+			reset_readdir(fi);
+		} else if (is_hash_order(offset) && offset > file->f_pos) {
+			/* for hash offset, we don't know if a forward seek
+			 * is within same frag */
+			fi->dir_release_count = 0;
+			fi->readdir_cache_idx = -1;
+		}
+
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = 0;
 			fi->flags &= ~CEPH_F_ATEND;
 		}
 		retval = offset;
-
-		if (need_reset_readdir(fi, offset)) {
-			dout("dir_llseek dropping %p content\n", file);
-			reset_readdir(fi);
-		}
 	}
 out:
 	inode_unlock(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b53c95903aeb..f51b6fd5f570 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1387,6 +1387,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 			     struct ceph_mds_session *session)
 {
 	struct dentry *parent = req->r_dentry;
+	struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
 	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
 	struct qstr dname;
 	struct dentry *dn;
@@ -1394,19 +1395,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	int err = 0, skipped = 0, ret, i;
 	struct inode *snapdir = NULL;
 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-	struct ceph_dentry_info *di;
 	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+	u32 last_hash = 0;
+	u32 fpos_offset;
 	struct ceph_readdir_cache_control cache_ctl = {};
 
 	if (req->r_aborted)
 		return readdir_prepopulate_inodes_only(req, session);
 
+	if (rinfo->hash_order && req->r_path2) {
+		last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+					  req->r_path2, strlen(req->r_path2));
+		last_hash = ceph_frag_value(last_hash);
+	}
+
 	if (rinfo->dir_dir &&
 	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
 		dout("readdir_prepopulate got new frag %x -> %x\n",
 		     frag, le32_to_cpu(rinfo->dir_dir->frag));
 		frag = le32_to_cpu(rinfo->dir_dir->frag);
-		req->r_readdir_offset = 2;
+		if (!rinfo->hash_order)
+			req->r_readdir_offset = 2;
 	}
 
 	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1424,13 +1433,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
 		/* note dir version at start of readdir so we can tell
 		 * if any dentries get dropped */
-		struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
 		req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
 		req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
 		req->r_readdir_cache_idx = 0;
 	}
 
 	cache_ctl.index = req->r_readdir_cache_idx;
+	fpos_offset = req->r_readdir_offset;
 
 	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
@@ -1444,6 +1453,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 		vino.ino = le64_to_cpu(rde->inode.in->ino);
 		vino.snap = le64_to_cpu(rde->inode.in->snapid);
 
+		if (rinfo->hash_order) {
+			u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+						 rde->name, rde->name_len);
+			hash = ceph_frag_value(hash);
+			if (hash != last_hash)
+				fpos_offset = 2;
+			last_hash = hash;
+			rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+		} else {
+			rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+		}
+
 retry_lookup:
 		dn = d_lookup(parent, &dname);
 		dout("d_lookup on parent=%p name=%.*s got %p\n",
@@ -1521,9 +1542,7 @@ retry_lookup:
 			dn = realdn;
 		}
 
-		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-		rde->offset = di->offset;
+		ceph_dentry(dn)->offset = rde->offset;
 
 		update_dentry_lease(dn, rde->lease, req->r_session,
 				    req->r_request_started);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 48def22fc7b9..7ad31283d510 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -185,6 +185,7 @@ static int parse_reply_info_dir(void **p, void *end,
 		u16 flags = ceph_decode_16(p);
 		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
 		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
 	}
 	if (num == 0)
 		goto done;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4ce19d852657..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -81,7 +81,9 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_dirfrag *dir_dir;
 			size_t			      dir_buf_size;
 			int                           dir_nr;
-			bool			      dir_complete, dir_end;
+			bool			      dir_complete;
+			bool			      dir_end;
+			bool			      hash_order;
 			struct ceph_mds_reply_dir_entry  *dir_entries;
 		};
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 0628099ba1f2..c9b671dfff81 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -540,11 +540,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 	return (struct ceph_dentry_info *)dentry->d_fsdata;
 }
 
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
-	return ((loff_t)frag << 32) | (loff_t)off;
-}
-
 /*
  * caps helpers
  */
@@ -949,6 +944,7 @@ extern const struct inode_operations ceph_snapdir_iops;
 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
 	ceph_snapdir_dentry_ops;
 
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
 extern int ceph_handle_snapdir(struct ceph_mds_request *req,
 			       struct dentry *dentry, int err);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index a811c5e98bfa..dfce616002ad 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op);
  */
 #define CEPH_READDIR_FRAG_END		(1<<0)
 #define CEPH_READDIR_FRAG_COMPLETE	(1<<8)
+#define CEPH_READDIR_HASH_ORDER		(1<<9)
 
 union ceph_mds_request_args {
 	struct {
-- 
cgit v1.2.3


From 3b33f692c84c28cc8178aaeeb9264d82b48787f1 Mon Sep 17 00:00:00 2001
From: Zhang Zhuoyu <zhangzhuoyu@cmss.chinamobile.com>
Date: Fri, 25 Mar 2016 05:18:39 -0400
Subject: ceph: make logical calculation functions return bool

This patch makes serverl logical caculation functions return bool to
improve readability due to these particular functions only using 0/1
as their return value.

No functional change.

Signed-off-by: Zhang Zhuoyu <zhangzhuoyu@cmss.chinamobile.com>
---
 fs/ceph/cache.c                | 2 +-
 fs/ceph/dir.c                  | 2 +-
 include/linux/ceph/ceph_frag.h | 4 ++--
 include/linux/ceph/decode.h    | 2 +-
 include/linux/ceph/osdmap.h    | 6 +++---
 net/ceph/ceph_common.c         | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..c052b5bf219b 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 	unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
 		(ci->i_fscache_gen == ci->i_rdcache_gen));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4850c3624a87..f6279a1bd6ec 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -710,7 +710,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 	return dentry;
 }
 
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 {
 	return ceph_ino(inode) == CEPH_INO_ROOT &&
 		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
 	return ceph_frag_make(newbits,
 			 ceph_frag_value(f) | (i << (24 - newbits)));
 }
-static inline int ceph_frag_is_leftmost(__u32 f)
+static inline bool ceph_frag_is_leftmost(__u32 f)
 {
 	return ceph_frag_value(f) == 0;
 }
-static inline int ceph_frag_is_rightmost(__u32 f)
+static inline bool ceph_frag_is_rightmost(__u32 f)
 {
 	return ceph_frag_value(f) == ceph_frag_mask(f);
 }
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..19e9932f3e77 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 /*
  * bounds check input.
  */
-static inline int ceph_has_room(void **p, void *end, size_t n)
+static inline bool ceph_has_room(void **p, void *end, size_t n)
 {
 	return end >= *p && n <= end - *p;
 }
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 821e16fff39a..ddc426b22d81 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -172,19 +172,19 @@ struct ceph_osdmap {
 	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };
 
-static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
 {
 	return osd >= 0 && osd < map->max_osd &&
 	       (map->osd_state[osd] & CEPH_OSD_EXISTS);
 }
 
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
 {
 	return ceph_osd_exists(map, osd) &&
 	       (map->osd_state[osd] & CEPH_OSD_UP);
 }
 
-static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
 {
 	return !ceph_osd_is_up(map, osd);
 }
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
 {
 	return client->monc.monmap && client->monc.monmap->epoch &&
 	       client->osdc.osdmap && client->osdc.osdmap->epoch;
-- 
cgit v1.2.3


From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 26 May 2016 00:04:58 -0400
Subject: add down_write_killable_nested()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/rwsem.h  |  2 ++
 kernel/locking/rwsem.c | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index d1c12d160ace..d37fbb34d06f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -156,6 +156,7 @@ extern void downgrade_write(struct rw_semaphore *sem);
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
 extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);
 
 # define down_write_nest_lock(sem, nest_lock)			\
@@ -176,6 +177,7 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
 # define down_read_nested(sem, subclass)		down_read(sem)
 # define down_write_nest_lock(sem, nest_lock)	down_write(sem)
 # define down_write_nested(sem, subclass)	down_write(sem)
+# define down_write_killable_nested(sem, subclass)	down_write_killable(sem)
 # define down_read_non_owner(sem)		down_read(sem)
 # define up_read_non_owner(sem)			up_read(sem)
 #endif
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c817216c1615..2e853ad93a3a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	rwsem_set_owner(sem);
+	return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
 void up_read_non_owner(struct rw_semaphore *sem)
 {
 	__up_read(sem);
-- 
cgit v1.2.3


From 868b2072f09c8a698df8066ca72d30411dcc57d6 Mon Sep 17 00:00:00 2001
From: Moritz Fischer <moritz.fischer@ettus.com>
Date: Mon, 23 May 2016 11:44:39 -0700
Subject: misc: at24: Fix typo in at24 header file

This commit fixes a simple typo s/mvmem/nvmem in the
example.

Signed-off-by: Moritz Fischer <moritz.fischer@ettus.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/platform_data/at24.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/at24.h b/include/linux/platform_data/at24.h
index dc9a13e5acda..be830b141d83 100644
--- a/include/linux/platform_data/at24.h
+++ b/include/linux/platform_data/at24.h
@@ -26,7 +26,7 @@
  *
  * An example in pseudo code for a setup() callback:
  *
- * void get_mac_addr(struct mvmem_device *nvmem, void *context)
+ * void get_mac_addr(struct nvmem_device *nvmem, void *context)
  * {
  *	u8 *mac_addr = ethernet_pdata->mac_addr;
  *	off_t offset = context;
-- 
cgit v1.2.3


From 50755bc1c305340660bbfa65fdae3ed113d8fe0e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 26 May 2016 15:16:06 -0700
Subject: seqlock: fix raw_read_seqcount_latch()

lockless_dereference() is supposed to take pointer not integer.

Link: http://lkml.kernel.org/r/20160521201448.GA7429@p183.telecom.by
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seqlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e0582106ef4f..7973a821ac58 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -277,7 +277,7 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 
 static inline int raw_read_seqcount_latch(seqcount_t *s)
 {
-	return lockless_dereference(s->sequence);
+	return lockless_dereference(s)->sequence;
 }
 
 /**
@@ -331,7 +331,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
  *	unsigned seq, idx;
  *
  *	do {
- *		seq = lockless_dereference(latch->seq);
+ *		seq = lockless_dereference(latch)->seq;
  *
  *		idx = seq & 0x01;
  *		entry = data_query(latch->data[idx], ...);
-- 
cgit v1.2.3


From d96c84f8d27ce57ff08f12b9654d9f505a8cce6e Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 26 May 2016 15:16:14 -0700
Subject: mm: slub: remove unused virt_to_obj()

It's unused since commit 7ed2f9e66385 ("mm, kasan: SLAB support")

Link: http://lkml.kernel.org/r/1464020961-2242-1-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 665cd0cd18b8..d1faa019c02a 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -111,22 +111,6 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 }
 #endif
 
-
-/**
- * virt_to_obj - returns address of the beginning of object.
- * @s: object's kmem_cache
- * @slab_page: address of slab page
- * @x: address within object memory range
- *
- * Returns address of the beginning of object
- */
-static inline void *virt_to_obj(struct kmem_cache *s,
-				const void *slab_page,
-				const void *x)
-{
-	return (void *)x - ((x - slab_page) % s->size);
-}
-
 void object_err(struct kmem_cache *s, struct page *page,
 		u8 *object, char *reason);
 
-- 
cgit v1.2.3


From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 26 May 2016 15:16:22 -0700
Subject: mm: oom_reaper: remove some bloat

mmput_async is currently used only from the oom_reaper which is defined
only for CONFIG_MMU.  We can save work_struct in mm_struct for
!CONFIG_MMU.

[akpm@linux-foundation.org: fix typo, per Minchan]
Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz
Reported-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 ++
 include/linux/sched.h    | 4 +++-
 kernel/fork.c            | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d553855503e6..ca3e517980a0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -514,7 +514,9 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+#ifdef CONFIG_MMU
 	struct work_struct async_put_work;
+#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23e075dcdfe4..6e42ada26345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2745,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-/* same as above but performs the slow path from the async kontext. Can
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
 extern void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 47887bba944f..5c2c355aa97f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
@@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm)
 		schedule_work(&mm->async_put_work);
 	}
 }
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
-- 
cgit v1.2.3


From 5930122683dff58f0846b0f0405b4bd598a3ba6a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2016 10:19:30 -0400
Subject: switch xattr_handler->set() to passing dentry and inode separately

preparation for similar switch in ->setxattr() (see the next commit for
rationale).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/acl.c                  |  6 +++---
 fs/9p/xattr.c                |  5 +++--
 fs/btrfs/xattr.c             | 12 +++++-------
 fs/ceph/xattr.c              |  7 ++++---
 fs/cifs/xattr.c              |  9 +++++----
 fs/ext2/xattr_security.c     |  7 ++++---
 fs/ext2/xattr_trusted.c      |  7 ++++---
 fs/ext2/xattr_user.c         |  9 +++++----
 fs/ext4/xattr_security.c     |  7 ++++---
 fs/ext4/xattr_trusted.c      |  7 ++++---
 fs/ext4/xattr_user.c         |  9 +++++----
 fs/f2fs/xattr.c              | 12 ++++++------
 fs/gfs2/xattr.c              |  6 +++---
 fs/hfsplus/xattr.c           | 12 ++++++------
 fs/hfsplus/xattr.h           |  2 +-
 fs/hfsplus/xattr_security.c  |  7 ++++---
 fs/hfsplus/xattr_trusted.c   |  7 ++++---
 fs/hfsplus/xattr_user.c      |  7 ++++---
 fs/jffs2/security.c          |  7 ++++---
 fs/jffs2/xattr_trusted.c     |  7 ++++---
 fs/jffs2/xattr_user.c        |  7 ++++---
 fs/jfs/xattr.c               | 14 ++++++--------
 fs/nfs/nfs4proc.c            | 19 +++++++++----------
 fs/ocfs2/xattr.c             | 23 +++++++++++++----------
 fs/orangefs/xattr.c          | 10 ++++++----
 fs/posix_acl.c               |  6 +++---
 fs/reiserfs/xattr_security.c |  9 +++++----
 fs/reiserfs/xattr_trusted.c  |  9 +++++----
 fs/reiserfs/xattr_user.c     |  9 +++++----
 fs/ubifs/xattr.c             |  7 +++----
 fs/xattr.c                   |  6 ++++--
 fs/xfs/xfs_xattr.c           |  9 +++++----
 include/linux/xattr.h        |  4 ++--
 mm/shmem.c                   |  7 ++++---
 34 files changed, 156 insertions(+), 135 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eb3589edf485..0576eaeb60b9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
-			      struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags)
+			      struct dentry *dentry, struct inode *inode,
+			      const char *name, const void *value,
+			      size_t size, int flags)
 {
 	int retval;
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
-	struct inode *inode = d_inode(dentry);
 
 	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 18c62bae9591..a6bd349bab23 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *value, size_t size, int flags)
+				  struct dentry *dentry, struct inode *inode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
 {
 	const char *full_name = xattr_full_name(handler, name);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3bfb252206c7..d1a177a3dbe8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *buffer, size_t size,
-				   int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *buffer,
+				   size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	name = xattr_full_name(handler, name);
 	return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
-					struct dentry *dentry,
+					struct dentry *unused, struct inode *inode,
 					const char *name, const void *value,
 					size_t size, int flags)
 {
 	name = xattr_full_name(handler, name);
-	return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
+	return btrfs_set_prop(inode, name, value, size, flags);
 }
 
 static const struct xattr_handler btrfs_security_xattr_handler = {
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..2baa6939dfe6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1051,12 +1051,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
 }
 
 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *value, size_t size, int flags)
+				  struct dentry *unused, struct inode *inode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
 {
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
-	return __ceph_setxattr(d_inode(dentry), name, value, size, flags);
+	return __ceph_setxattr(inode, name, value, size, flags);
 }
 
 const struct xattr_handler ceph_other_xattr_handler = {
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index c8b77aa24a1d..5e23f64c0804 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -39,8 +39,9 @@
 enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
 
 static int cifs_xattr_set(const struct xattr_handler *handler,
-			  struct dentry *dentry, const char *name,
-			  const void *value, size_t size, int flags)
+			  struct dentry *dentry, struct inode *inode,
+			  const char *name, const void *value,
+			  size_t size, int flags)
 {
 	int rc = -EOPNOTSUPP;
 	unsigned int xid;
@@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
 			if (value &&
 			    pTcon->ses->server->ops->set_acl)
 				rc = pTcon->ses->server->ops->set_acl(pacl,
-						size, d_inode(dentry),
+						size, inode,
 						full_path, CIFS_ACL_DACL);
 			else
 				rc = -EOPNOTSUPP;
 			if (rc == 0) /* force revalidate of the inode */
-				CIFS_I(d_inode(dentry))->time = 0;
+				CIFS_I(inode)->time = 0;
 			kfree(pacl);
 		}
 #endif /* CONFIG_CIFS_ACL */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 7fd3b867ce65..7b9e9c1842d5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_security_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
+			struct dentry *unused, struct inode *inode,
+			const char *name, const void *value,
+			size_t size, int flags)
 {
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 0f85705ff519..65049b71af13 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_trusted_set(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name,
-		       const void *value, size_t size, int flags)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, const void *value,
+		       size_t size, int flags)
 {
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 1fafd27037cc..fb2f992ae763 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_user_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 123a7d010efe..a8921112030d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_security_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
+			struct dentry *unused, struct inode *inode,
+			const char *name, const void *value,
+			size_t size, int flags)
 {
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 60652fa24cbc..c7765c735714 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_trusted_set(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name,
-		       const void *value, size_t size, int flags)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, const void *value,
+		       size_t size, int flags)
 {
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 17a446ffecd3..ca20e423034b 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_user_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER,
 			      name, value, size, flags);
 }
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 00ea56797258..e3decae3acfb 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, const void *value,
+		struct dentry *unused, struct inode *inode,
+		const char *name, const void *value,
 		size_t size, int flags)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
@@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
-	return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+	return f2fs_setxattr(inode, handler->flags, name,
 					value, size, NULL, flags);
 }
 
@@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, const void *value,
+		struct dentry *unused, struct inode *inode,
+		const char *name, const void *value,
 		size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 	if (value == NULL)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f42ab53bd30d..3a2853504084 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 }
 
 static int gfs2_xattr_set(const struct xattr_handler *handler,
-			  struct dentry *dentry, const char *name,
-			  const void *value, size_t size, int flags)
+			  struct dentry *unused, struct inode *inode,
+			  const char *name, const void *value,
+			  size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int ret;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4f118d282a7a..d37bb88dc746 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
 	return len;
 }
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
 		     const void *value, size_t size, int flags,
 		     const char *prefix, size_t prefixlen)
 {
@@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		return -ENOMEM;
 	strcpy(xattr_name, prefix);
 	strcpy(xattr_name + prefixlen, name);
-	res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
-				 flags);
+	res = __hfsplus_setxattr(inode, xattr_name, value, size, flags);
 	kfree(xattr_name);
 	return res;
 }
@@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				const void *buffer, size_t size, int flags)
+				struct dentry *unused, struct inode *inode,
+				const char *name, const void *buffer,
+				size_t size, int flags)
 {
 	/*
 	 * Don't allow setting properly prefixed attributes
@@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
 	 * creates), so we pass the name through unmodified (after
 	 * ensuring it doesn't conflict with another namespace).
 	 */
-	return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+	return __hfsplus_setxattr(inode, name, buffer, size, flags);
 }
 
 const struct xattr_handler hfsplus_xattr_osx_handler = {
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d04ba6f58df2..68f6b539371f 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags);
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
 				   const void *value, size_t size, int flags,
 				   const char *prefix, size_t prefixlen);
 
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index ae2ca8c2e335..37b3efa733ef 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_security_setxattr(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *name,
-				     const void *buffer, size_t size, int flags)
+				     struct dentry *unused, struct inode *inode,
+				     const char *name, const void *buffer,
+				     size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN);
 }
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index eae2947060aa..94519d6c627d 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    const void *buffer, size_t size, int flags)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, const void *buffer,
+				    size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 3c9eec3e4c7b..fae6c0ea0030 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_user_setxattr(const struct xattr_handler *handler,
-				 struct dentry *dentry, const char *name,
-				 const void *buffer, size_t size, int flags)
+				 struct dentry *unused, struct inode *inode,
+				 const char *name, const void *buffer,
+				 size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 3ed9a4b49778..c2332e30f218 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_security_setxattr(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *buffer, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *buffer,
+				   size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 4ebecff1d922..5d6030826c52 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *buffer, size_t size, int flags)
+				  struct dentry *unused, struct inode *inode,
+				  const char *name, const void *buffer,
+				  size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index bce249e1b277..9d027b4abcf9 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_user_setxattr(const struct xattr_handler *handler,
-			       struct dentry *dentry, const char *name,
-			       const void *buffer, size_t size, int flags)
+			       struct dentry *unused, struct inode *inode,
+			       const char *name, const void *buffer,
+			       size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index beb182b503b3..0bf3c33aedff 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set(const struct xattr_handler *handler,
-			 struct dentry *dentry, const char *name,
-			 const void *value, size_t size, int flags)
+			 struct dentry *unused, struct inode *inode,
+			 const char *name, const void *value,
+			 size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	name = xattr_full_name(handler, name);
 	return __jfs_xattr_set(inode, name, value, size, flags);
 }
@@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set_os2(const struct xattr_handler *handler,
-			     struct dentry *dentry, const char *name,
-			     const void *value, size_t size, int flags)
+			     struct dentry *unused, struct inode *inode,
+			     const char *name, const void *value,
+			     size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	if (is_known_namespace(name))
 		return -EOPNOTSUPP;
 	return __jfs_xattr_set(inode, name, value, size, flags);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 084e8570da18..2e802ec47b8a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4993,12 +4993,11 @@ static int nfs4_do_set_security_label(struct inode *inode,
 }
 
 static int
-nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
 {
 	struct nfs4_label ilabel, *olabel = NULL;
 	struct nfs_fattr fattr;
 	struct rpc_cred *cred;
-	struct inode *inode = d_inode(dentry);
 	int status;
 
 	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -6255,11 +6254,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *key,
-				   const void *buf, size_t buflen,
-				   int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *key, const void *buf,
+				   size_t buflen, int flags)
 {
-	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+	return nfs4_proc_set_acl(inode, buf, buflen);
 }
 
 static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
@@ -6277,12 +6276,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *key,
-				     const void *buf, size_t buflen,
-				     int flags)
+				     struct dentry *unused, struct inode *inode,
+				     const char *key, const void *buf,
+				     size_t buflen, int flags)
 {
 	if (security_ismaclabel(key))
-		return nfs4_set_security_label(dentry, buf, buflen);
+		return nfs4_set_security_label(inode, buf, buflen);
 
 	return -EOPNOTSUPP;
 }
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ad16995c9e7a..d2053853951e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    const void *value, size_t size, int flags)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, const void *value,
+				    size_t size, int flags)
 {
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
 			       name, value, size, flags);
 }
 
@@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *value, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
 {
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
 			       name, value, size, flags);
 }
 
@@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				const void *value, size_t size, int flags)
+				struct dentry *unused, struct inode *inode,
+				const char *name, const void *value,
+				size_t size, int flags)
 {
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
 		return -EOPNOTSUPP;
 
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
 			       name, value, size, flags);
 }
 
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 99c19545752c..5893ddde0e4b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -448,13 +448,14 @@ out_unlock:
 }
 
 static int orangefs_xattr_set_default(const struct xattr_handler *handler,
-				      struct dentry *dentry,
+				      struct dentry *unused,
+				      struct inode *inode,
 				      const char *name,
 				      const void *buffer,
 				      size_t size,
 				      int flags)
 {
-	return orangefs_inode_setxattr(dentry->d_inode,
+	return orangefs_inode_setxattr(inode,
 				    ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
 				    name,
 				    buffer,
@@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler,
 }
 
 static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
-				     struct dentry *dentry,
+				     struct dentry *unused,
+				     struct inode *inode,
 				     const char *name,
 				     const void *buffer,
 				     size_t size,
 				     int flags)
 {
-	return orangefs_inode_setxattr(dentry->d_inode,
+	return orangefs_inode_setxattr(inode,
 				    ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
 				    name,
 				    buffer,
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2c60f17e7d92..8a4a266beff3 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 
 static int
 posix_acl_xattr_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	struct inode *inode = d_backing_inode(dentry);
 	struct posix_acl *acl = NULL;
 	int ret;
 
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 86aeb9dd805a..e4cbb7719906 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-security_set(const struct xattr_handler *handler, struct dentry *dentry,
-	     const char *name, const void *buffer, size_t size, int flags)
+security_set(const struct xattr_handler *handler, struct dentry *unused,
+	     struct inode *inode, const char *name, const void *buffer,
+	     size_t size, int flags)
 {
-	if (IS_PRIVATE(d_inode(dentry)))
+	if (IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 31837f031f59..f15a5f9e84ce 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
-	    const char *name, const void *buffer, size_t size, int flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+	    struct inode *inode, const char *name, const void *buffer,
+	    size_t size, int flags)
 {
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index f7c39731684b..dc59df43b2db 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-user_set(const struct xattr_handler *handler, struct dentry *dentry,
-	 const char *name, const void *buffer, size_t size, int flags)
+user_set(const struct xattr_handler *handler, struct dentry *unused,
+	 struct inode *inode, const char *name, const void *buffer,
+	 size_t size, int flags)
 {
-	if (!reiserfs_xattrs_user(dentry->d_sb))
+	if (!reiserfs_xattrs_user(inode->i_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 6c277eb6aef9..b5fc27969e9d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ubifs_xattr_set(const struct xattr_handler *handler,
-			   struct dentry *dentry, const char *name,
-			   const void *value, size_t size, int flags)
+			   struct dentry *dentry, struct inode *inode,
+			   const char *name, const void *value,
+			   size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
 		name, inode->i_ino, dentry, size);
 
diff --git a/fs/xattr.c b/fs/xattr.c
index fc81e771488a..b16d07889700 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -754,7 +754,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->set(handler, dentry, name, value, size, flags);
+	return handler->set(handler, dentry, d_inode(dentry), name, value,
+			    size, flags);
 }
 
 /*
@@ -769,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(handler, dentry, d_inode(dentry), name, NULL,
+			    0, XATTR_REPLACE);
 }
 
 EXPORT_SYMBOL(generic_getxattr);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index d111f691f313..2773b155cb56 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -74,11 +74,12 @@ xfs_forget_acl(
 }
 
 static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
-		const char *name, const void *value, size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
+		struct inode *inode, const char *name, const void *value,
+		size_t size, int flags)
 {
 	int			xflags = handler->flags;
-	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
 
 	/* Convert Linux syscall to XFS internal ATTR flags */
@@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
 	error = xfs_attr_set(ip, (unsigned char *)name,
 				(void *)value, size, xflags);
 	if (!error)
-		xfs_forget_acl(d_inode(dentry), name, xflags);
+		xfs_forget_acl(inode, name, xflags);
 
 	return error;
 }
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 1cc4c578deb9..76beb206741a 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -33,8 +33,8 @@ struct xattr_handler {
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
 	int (*set)(const struct xattr_handler *, struct dentry *dentry,
-		   const char *name, const void *buffer, size_t size,
-		   int flags);
+		   struct inode *inode, const char *name, const void *buffer,
+		   size_t size, int flags);
 };
 
 const char *xattr_full_name(const struct xattr_handler *, const char *);
diff --git a/mm/shmem.c b/mm/shmem.c
index e418a995427d..a36144909b28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2645,10 +2645,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *value, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
 {
-	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
 	return simple_xattr_set(&info->xattrs, name, value, size, flags);
-- 
cgit v1.2.3


From 7ded384a12688c2a86b618da16bc87713404dfcc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2016 15:23:32 -0700
Subject: mm: fix section mismatch warning

The register_page_bootmem_info_node() function needs to be marked __init
in order to avoid a new warning introduced by commit f65e91df25aa ("mm:
use early_pfn_to_nid in register_page_bootmem_info_node").

Otherwise you'll get a warning about how a non-init function calls
early_pfn_to_nid (which is __meminit)

Cc: Yang Shi <yang.shi@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d8a5d4d133..5145620ba48a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -182,7 +182,7 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
 #else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b8ee0806415f..e3cbdcaff2a5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
-void register_page_bootmem_info_node(struct pglist_data *pgdat)
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
 	unsigned long i, pfn, end_pfn, nr_pages;
 	int node = pgdat->node_id;
-- 
cgit v1.2.3


From 5d22fc25d4fc8096d2d7df27ea1893d4e055e764 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2016 15:57:31 -0700
Subject: mm: remove more IS_ERR_VALUE abuses

The do_brk() and vm_brk() return value was "unsigned long" and returned
the starting address on success, and an error value on failure.  The
reasons are entirely historical, and go back to it basically behaving
like the mmap() interface does.

However, nobody actually wanted that interface, and it causes totally
pointless IS_ERR_VALUE() confusion.

What every single caller actually wants is just the simpler integer
return of zero for success and negative error number on failure.

So just convert to that much clearer and more common calling convention,
and get rid of all the IS_ERR_VALUE() uses wrt vm_brk().

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32_aout.c | 17 ++++++++---------
 fs/binfmt_aout.c          | 18 +++++++-----------
 fs/binfmt_elf.c           | 11 +++++------
 include/linux/mm.h        |  2 +-
 mm/mmap.c                 | 16 ++++++++--------
 mm/nommu.c                |  2 +-
 6 files changed, 30 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f5e737ff0022..cb26f18d43af 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -116,12 +116,12 @@ static struct linux_binfmt aout_format = {
 	.min_coredump	= PAGE_SIZE
 };
 
-static unsigned long set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end)
 {
 	start = PAGE_ALIGN(start);
 	end = PAGE_ALIGN(end);
 	if (end <= start)
-		return start;
+		return 0;
 	return vm_brk(start, end - start);
 }
 
@@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 		error = vm_brk(text_addr & PAGE_MASK, map_size);
 
-		if (error != (text_addr & PAGE_MASK))
+		if (error)
 			return error;
 
 		error = read_code(bprm->file, text_addr, 32,
@@ -350,7 +350,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 		if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
 			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			if (IS_ERR_VALUE(error))
+			if (error)
 				return error;
 
 			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -378,7 +378,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 beyond_if:
 	error = set_brk(current->mm->start_brk, current->mm->brk);
-	if (IS_ERR_VALUE(error))
+	if (error)
 		return error;
 
 	set_binfmt(&aout_format);
@@ -441,7 +441,7 @@ static int load_aout_library(struct file *file)
 		}
 #endif
 		retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		if (IS_ERR_VALUE(retval))
+		if (retval)
 			goto out;
 
 		read_code(file, start_addr, N_TXTOFF(ex),
@@ -461,9 +461,8 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		error = vm_brk(start_addr + len, bss - len);
-		retval = error;
-		if (error != start_addr + len)
+		retval = vm_brk(start_addr + len, bss - len);
+		if (retval)
 			goto out;
 	}
 	retval = 0;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 2fab9f130e51..64b331ae3428 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -127,11 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
 {
 	start = PAGE_ALIGN(start);
 	end = PAGE_ALIGN(end);
-	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
+	if (end > start)
+		return vm_brk(start, end - start);
 	}
 	return 0;
 }
@@ -275,7 +272,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		map_size = ex.a_text+ex.a_data;
 #endif
 		error = vm_brk(text_addr & PAGE_MASK, map_size);
-		if (error != (text_addr & PAGE_MASK))
+		if (error)
 			return error;
 
 		error = read_code(bprm->file, text_addr, pos,
@@ -298,7 +295,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 
 		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
 			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			if (IS_ERR_VALUE(error))
+			if (error)
 				return error;
 
 			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -382,7 +379,7 @@ static int load_aout_library(struct file *file)
 			       file);
 		}
 		retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		if (IS_ERR_VALUE(retval))
+		if (retval)
 			goto out;
 
 		read_code(file, start_addr, N_TXTOFF(ex),
@@ -402,9 +399,8 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		error = vm_brk(start_addr + len, bss - len);
-		retval = error;
-		if (error != start_addr + len)
+		retval = vm_brk(start_addr + len, bss - len);
+		if (retval)
 			goto out;
 	}
 	retval = 0;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 938fc4ede764..e158b22ef32f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
 	start = ELF_PAGEALIGN(start);
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
+		int error = vm_brk(start, end - start);
+		if (error)
+			return error;
 	}
 	current->mm->start_brk = current->mm->brk = end;
 	return 0;
@@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 		/* Map the last of the bss segment */
 		error = vm_brk(elf_bss, last_bss - elf_bss);
-		if (BAD_ADDR(error))
+		if (error)
 			goto out;
 	}
 
@@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file)
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
 	if (bss > len) {
 		error = vm_brk(len, bss - len);
-		if (BAD_ADDR(error))
+		if (error)
 			goto out_free_ph;
 	}
 	error = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a00ec816233a..5df5feb49575 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2018,7 +2018,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
 /* These take the mm semaphore themselves */
-extern unsigned long __must_check vm_brk(unsigned long, unsigned long);
+extern int __must_check vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index d3d9a94ca031..de2c1769cc68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static unsigned long do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
@@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+	if (do_brk(oldbrk, newbrk-oldbrk) < 0)
 		goto out;
 
 set_brk:
@@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static unsigned long do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	len = PAGE_ALIGN(len);
 	if (!len)
-		return addr;
+		return 0;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
@@ -2703,13 +2703,13 @@ out:
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	vma->vm_flags |= VM_SOFTDIRTY;
-	return addr;
+	return 0;
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long ret;
+	int ret;
 	bool populate;
 
 	if (down_write_killable(&mm->mmap_sem))
@@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	ret = do_brk(addr, len);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
-	if (populate)
+	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..c2e58880207f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm)
 	}
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
 	return -ENOMEM;
 }
-- 
cgit v1.2.3


From aa00edc1287a693eadc7bc67a3d73555d969b35d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2016 16:03:22 -0700
Subject: make IS_ERR_VALUE() complain about non-pointer-sized arguments

Now that the allmodconfig x86-64 build is clean wrt IS_ERR_VALUE() uses
on integers, add a cast to a pointer and back to the argument, so that
any new mis-uses of IS_ERR_VALUE() will cause warnings like

   warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]

so that we don't re-introduce any bogus uses.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/err.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/err.h b/include/linux/err.h
index 56762ab41713..1e3558845e4c 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -18,7 +18,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)
 
 static inline void * __must_check ERR_PTR(long error)
 {
-- 
cgit v1.2.3


From 3767e255b390d72f9a33c08d9e86c5f21f25860f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2016 11:06:05 -0400
Subject: switch ->setxattr() to passing dentry and inode separately

smack ->d_instantiate() uses ->setxattr(), so to be able to call it before
we'd hashed the new dentry and attached it to inode, we need ->setxattr()
instances getting the inode as an explicit argument rather than obtaining
it from dentry.

Similar change for ->getxattr() had been done in commit ce23e64.  Unlike
->getxattr() (which is used by both selinux and smack instances of
->d_instantiate()) ->setxattr() is used only by smack one and unfortunately
it got missed back then.

Reported-by: Seung-Woo Kim <sw0312.kim@samsung.com>
Tested-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting                    |  7 +++++++
 drivers/staging/lustre/lustre/llite/llite_internal.h |  4 ++--
 drivers/staging/lustre/lustre/llite/xattr.c          |  6 ++----
 fs/bad_inode.c                                       |  4 ++--
 fs/ecryptfs/crypto.c                                 |  9 +++++----
 fs/ecryptfs/ecryptfs_kernel.h                        |  4 ++--
 fs/ecryptfs/inode.c                                  |  7 ++++---
 fs/ecryptfs/mmap.c                                   |  3 ++-
 fs/fuse/dir.c                                        |  6 +++---
 fs/hfs/attr.c                                        |  6 +++---
 fs/hfs/hfs_fs.h                                      |  2 +-
 fs/kernfs/inode.c                                    | 11 ++++++-----
 fs/kernfs/kernfs-internal.h                          |  3 ++-
 fs/libfs.c                                           |  5 +++--
 fs/overlayfs/inode.c                                 |  5 +++--
 fs/overlayfs/overlayfs.h                             |  5 +++--
 fs/xattr.c                                           |  8 ++++----
 include/linux/fs.h                                   |  3 ++-
 include/linux/xattr.h                                |  3 ++-
 security/smack/smack_lsm.c                           |  2 +-
 20 files changed, 59 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 46f3bb7a02f5..a5fb89cac615 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -578,3 +578,10 @@ in your dentry operations instead.
 --
 [mandatory]
 	->atomic_open() calls without O_CREAT may happen in parallel.
+--
+[mandatory]
+	->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
+	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
+	called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
+	->d_instantiate() uses not just ->getxattr() but ->setxattr() as well.
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index ce1f949430f1..3f2f30b6542c 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -976,8 +976,8 @@ static inline __u64 ll_file_maxbytes(struct inode *inode)
 }
 
 /* llite/xattr.c */
-int ll_setxattr(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags);
+int ll_setxattr(struct dentry *dentry, struct inode *inode,
+		const char *name, const void *value, size_t size, int flags);
 ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode,
 		    const char *name, void *buffer, size_t size);
 ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
index ed4de04381c3..608014b0dbcd 100644
--- a/drivers/staging/lustre/lustre/llite/xattr.c
+++ b/drivers/staging/lustre/lustre/llite/xattr.c
@@ -211,11 +211,9 @@ int ll_setxattr_common(struct inode *inode, const char *name,
 	return 0;
 }
 
-int ll_setxattr(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags)
+int ll_setxattr(struct dentry *dentry, struct inode *inode,
+		const char *name, const void *value, size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	LASSERT(inode);
 	LASSERT(name);
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 72e35b721608..3ba385eaa26e 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
 	return -EIO;
 }
 
-static int bad_inode_setxattr(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags)
+static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
+		const char *name, const void *value, size_t size, int flags)
 {
 	return -EIO;
 }
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ebd40f46ed4c..0d8eb3455b34 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
 
 static int
 ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
+				 struct inode *ecryptfs_inode,
 				 char *page_virt, size_t size)
 {
 	int rc;
 
-	rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt,
-			       size, 0);
+	rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode,
+			       ECRYPTFS_XATTR_NAME, page_virt, size, 0);
 	return rc;
 }
 
@@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
 		goto out_free;
 	}
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
-						      size);
+		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode,
+						      virt, size);
 	else
 		rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
 							 virt_len);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 3ec495db7e82..4ba1547bb9ad 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -609,8 +609,8 @@ ssize_t
 ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
 			const char *name, void *value, size_t size);
 int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		  size_t size, int flags);
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+		  const void *value, size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
 #ifdef CONFIG_ECRYPT_FS_MESSAGING
 int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 318b04689d76..9d153b6a1d72 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 }
 
 int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
+		  const char *name, const void *value,
 		  size_t size, int flags)
 {
 	int rc = 0;
@@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	}
 
 	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
-	if (!rc && d_really_is_positive(dentry))
-		fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+	if (!rc && inode)
+		fsstack_copy_attr_all(inode, d_inode(lower_dentry));
 out:
 	return rc;
 }
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 148d11b514fb..9c3437c8a5b1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	if (size < 0)
 		size = 8;
 	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
-	rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+	rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode,
+					 ECRYPTFS_XATTR_NAME,
 					 xattr_virt, size, 0);
 	inode_unlock(lower_inode);
 	if (rc)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b9419058108f..ccd4971cc6c1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 	return fuse_update_attributes(inode, stat, NULL, NULL);
 }
 
-static int fuse_setxattr(struct dentry *entry, const char *name,
-			 const void *value, size_t size, int flags)
+static int fuse_setxattr(struct dentry *unused, struct inode *inode,
+			 const char *name, const void *value,
+			 size_t size, int flags)
 {
-	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	FUSE_ARGS(args);
 	struct fuse_setxattr_in inarg;
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 064f92f17efc..d9a86919fdf6 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -13,10 +13,10 @@
 #include "hfs_fs.h"
 #include "btree.h"
 
-int hfs_setxattr(struct dentry *dentry, const char *name,
-		 const void *value, size_t size, int flags)
+int hfs_setxattr(struct dentry *unused, struct inode *inode,
+		 const char *name, const void *value,
+		 size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
 	struct hfs_find_data fd;
 	hfs_cat_rec rec;
 	struct hfs_cat_file *file;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fa3eed86837c..ee2f385811c8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
-extern int hfs_setxattr(struct dentry *dentry, const char *name,
+extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
 			const void *value, size_t size, int flags);
 extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
 			    const char *name, void *value, size_t size);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 1719649d7ad7..63b925d5ba1e 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -160,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
 	return 0;
 }
 
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
+int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode,
+			const char *name, const void *value,
+			size_t size, int flags)
 {
-	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_iattrs *attrs;
 	void *secdata;
 	int error;
@@ -175,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		error = security_inode_setsecurity(d_inode(dentry), suffix,
+		error = security_inode_setsecurity(inode, suffix,
 						value, size, flags);
 		if (error)
 			return error;
-		error = security_inode_getsecctx(d_inode(dentry),
+		error = security_inode_getsecctx(inode,
 						&secdata, &secdata_len);
 		if (error)
 			return error;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 45c9192c276e..37159235ac10 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
 int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode,
+			const char *name, const void *value,
 			size_t size, int flags);
 int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
 ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
diff --git a/fs/libfs.c b/fs/libfs.c
index 8765ff1adc07..3db2721144c2 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
 	return -EPERM;
 }
 
-static int empty_dir_setxattr(struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags)
+static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode,
+			      const char *name, const void *value,
+			      size_t size, int flags)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c7b31a03dc9c..0ed7c4012437 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name)
 	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
 }
 
-int ovl_setxattr(struct dentry *dentry, const char *name,
-		 const void *value, size_t size, int flags)
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+		 const char *name, const void *value,
+		 size_t size, int flags)
 {
 	int err;
 	struct dentry *upperdentry;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 99ec4b035237..d79577eb3937 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -171,8 +171,9 @@ int ovl_check_d_type_supported(struct path *realpath);
 /* inode.c */
 int ovl_setattr(struct dentry *dentry, struct iattr *attr);
 int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, const char *name,
-		 const void *value, size_t size, int flags);
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+		 const char *name, const void *value,
+		 size_t size, int flags);
 ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
 		     const char *name, void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
diff --git a/fs/xattr.c b/fs/xattr.c
index b16d07889700..4beafc43daa5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 	if (issec)
 		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_op->setxattr) {
-		error = inode->i_op->setxattr(dentry, name, value, size, flags);
+		error = inode->i_op->setxattr(dentry, inode, name, value, size, flags);
 		if (!error) {
 			fsnotify_xattr(dentry);
 			security_inode_post_setxattr(dentry, name, value,
@@ -745,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
  * Find the handler for the prefix and dispatch its set() operation.
  */
 int
-generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+		 const void *value, size_t size, int flags)
 {
 	const struct xattr_handler *handler;
 
@@ -754,8 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->set(handler, dentry, d_inode(dentry), name, value,
-			    size, flags);
+	return handler->set(handler, dentry, inode, name, value, size, flags);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5f61431d8673..62bdb0a6cf2d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1730,7 +1730,8 @@ struct inode_operations {
 			struct inode *, struct dentry *, unsigned int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
-	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+	int (*setxattr) (struct dentry *, struct inode *,
+			 const char *, const void *, size_t, int);
 	ssize_t (*getxattr) (struct dentry *, struct inode *,
 			     const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 76beb206741a..94079bab9243 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -54,7 +54,8 @@ int vfs_removexattr(struct dentry *, const char *);
 
 ssize_t generic_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size);
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
-int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags);
+int generic_setxattr(struct dentry *dentry, struct inode *inode,
+		     const char *name, const void *value, size_t size, int flags);
 int generic_removexattr(struct dentry *dentry, const char *name);
 ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
 			   char **xattr_value, size_t size, gfp_t flags);
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index ff2b8c3cf7a9..6777295f4b2b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3514,7 +3514,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 			 */
 			if (isp->smk_flags & SMK_INODE_CHANGED) {
 				isp->smk_flags &= ~SMK_INODE_CHANGED;
-				rc = inode->i_op->setxattr(dp,
+				rc = inode->i_op->setxattr(dp, inode,
 					XATTR_NAME_SMACKTRANSMUTE,
 					TRANS_TRUE, TRANS_TRUE_SIZE,
 					0);
-- 
cgit v1.2.3


From f4bcbe792b8f434e32487cff9d9e30ab45a3ce02 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Fri, 20 May 2016 07:26:00 -0400
Subject: Pull out string hash to <linux/stringhash.h>

... so they can be used without the rest of <linux/dcache.h>

The hashlen_* macros will make sense next patch.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
---
 include/linux/dcache.h     | 27 +----------------
 include/linux/stringhash.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 26 deletions(-)
 create mode 100644 include/linux/stringhash.h

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 7e9422cb5989..0f9a977c334f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -10,6 +10,7 @@
 #include <linux/cache.h>
 #include <linux/rcupdate.h>
 #include <linux/lockref.h>
+#include <linux/stringhash.h>
 
 struct path;
 struct vfsmount;
@@ -52,9 +53,6 @@ struct qstr {
 };
 
 #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
-#define hashlen_hash(hashlen) ((u32) (hashlen))
-#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
-#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash))
 
 struct dentry_stat_t {
 	long nr_dentry;
@@ -65,29 +63,6 @@ struct dentry_stat_t {
 };
 extern struct dentry_stat_t dentry_stat;
 
-/* Name hashing routines. Initial hash value */
-/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
-#define init_name_hash()		0
-
-/* partial hash update function. Assume roughly 4 bits per character */
-static inline unsigned long
-partial_name_hash(unsigned long c, unsigned long prevhash)
-{
-	return (prevhash + (c << 4) + (c >> 4)) * 11;
-}
-
-/*
- * Finally: cut down the number of bits to a int value (and try to avoid
- * losing bits)
- */
-static inline unsigned long end_name_hash(unsigned long hash)
-{
-	return (unsigned int) hash;
-}
-
-/* Compute the hash for a name string. */
-extern unsigned int full_name_hash(const unsigned char *, unsigned int);
-
 /*
  * Try to keep struct dentry aligned on 64 byte cachelines (this will
  * give reasonable cacheline footprint with larger lines without the
diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h
new file mode 100644
index 000000000000..2eaaaf6d2776
--- /dev/null
+++ b/include/linux/stringhash.h
@@ -0,0 +1,72 @@
+#ifndef __LINUX_STRINGHASH_H
+#define __LINUX_STRINGHASH_H
+
+#include <linux/types.h>
+
+/*
+ * Routines for hashing strings of bytes to a 32-bit hash value.
+ *
+ * These hash functions are NOT GUARANTEED STABLE between kernel
+ * versions, architectures, or even repeated boots of the same kernel.
+ * (E.g. they may depend on boot-time hardware detection or be
+ * deliberately randomized.)
+ *
+ * They are also not intended to be secure against collisions caused by
+ * malicious inputs; much slower hash functions are required for that.
+ *
+ * They are optimized for pathname components, meaning short strings.
+ * Even if a majority of files have longer names, the dynamic profile of
+ * pathname components skews short due to short directory names.
+ * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
+ */
+
+/*
+ * Version 1: one byte at a time.  Example of use:
+ *
+ * unsigned long hash = init_name_hash;
+ * while (*p)
+ *	hash = partial_name_hash(tolower(*p++), hash);
+ * hash = end_name_hash(hash);
+ *
+ * Although this is designed for bytes, fs/hfsplus/unicode.c
+ * abuses it to hash 16-bit values.
+ */
+
+/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
+#define init_name_hash()		0
+
+/* partial hash update function. Assume roughly 4 bits per character */
+static inline unsigned long
+partial_name_hash(unsigned long c, unsigned long prevhash)
+{
+	return (prevhash + (c << 4) + (c >> 4)) * 11;
+}
+
+/*
+ * Finally: cut down the number of bits to a int value (and try to avoid
+ * losing bits)
+ */
+static inline unsigned long end_name_hash(unsigned long hash)
+{
+	return (unsigned int)hash;
+}
+
+/*
+ * Version 2: One word (32 or 64 bits) at a time.
+ * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
+ * exists, which describes major Linux platforms like x86 and ARM), then
+ * this computes a different hash function much faster.
+ *
+ * If not set, this falls back to a wrapper around the preceding.
+ */
+extern unsigned int full_name_hash(const unsigned char *, unsigned int);
+
+/*
+ * A hash_len is a u64 with the hash of a string in the low
+ * half and the length in the high half.
+ */
+#define hashlen_hash(hashlen) ((u32)(hashlen))
+#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))
+
+#endif	/* __LINUX_STRINGHASH_H */
-- 
cgit v1.2.3


From fcfd2fbf22d2587196890103d41e3d554c47da0e Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Fri, 20 May 2016 08:41:37 -0400
Subject: fs/namei.c: Add hashlen_string() function

We'd like to make more use of the highly-optimized dcache hash functions
throughout the kernel, rather than have every subsystem create its own,
and a function that hashes basic null-terminated strings is required
for that.

(The name is to emphasize that it returns both hash and length.)

It's actually useful in the dcache itself, specifically d_alloc_name().
Other uses in the next patch.

full_name_hash() is also tweaked to make it more generally useful:
1) Take a "char *" rather than "unsigned char *" argument, to
   be consistent with hash_name().
2) Handle zero-length inputs.  If we want more callers, we don't want
   to make them worry about corner cases.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
---
 fs/dcache.c                |  3 +--
 fs/namei.c                 | 51 +++++++++++++++++++++++++++++++++++++++++-----
 include/linux/stringhash.h |  8 ++++++--
 3 files changed, 53 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index d5ecc6e477da..19b751806789 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1653,8 +1653,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 	struct qstr q;
 
 	q.name = name;
-	q.len = strlen(name);
-	q.hash = full_name_hash(q.name, q.len);
+	q.hash_len = hashlen_string(name);
 	return d_alloc(parent, &q);
 }
 EXPORT_SYMBOL(d_alloc_name);
diff --git a/fs/namei.c b/fs/namei.c
index 42f8ca038254..dd98d43a54f8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1822,19 +1822,20 @@ static inline unsigned long mix_hash(unsigned long hash)
 
 #endif
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const char *name, unsigned int len)
 {
 	unsigned long a, hash = 0;
 
 	for (;;) {
+		if (!len)
+			goto done;
 		a = load_unaligned_zeropad(name);
 		if (len < sizeof(unsigned long))
 			break;
 		hash = mix_hash(hash + a);
 		name += sizeof(unsigned long);
 		len -= sizeof(unsigned long);
-		if (!len)
-			goto done;
 	}
 	hash += a & bytemask_from_count(len);
 done:
@@ -1842,6 +1843,29 @@ done:
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const char *name)
+{
+	unsigned long a, adata, mask, hash, len;
+	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = mix_hash(hash + a);
+		len += sizeof(unsigned long);
+		a = load_unaligned_zeropad(name+len);
+	} while (!has_zero(a, &adata, &constants));
+
+	adata = prep_zero_mask(a, adata, &constants);
+	mask = create_zero_mask(adata);
+	hash += a & zero_bytemask(mask);
+	len += find_zero(mask);
+
+	return hashlen_create(fold_hash(hash), len);
+}
+EXPORT_SYMBOL(hashlen_string);
+
 /*
  * Calculate the length and hash of the path component, and
  * return the "hash_len" as the result.
@@ -1872,15 +1896,32 @@ static inline u64 hash_name(const char *name)
 
 #else
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
 	while (len--)
-		hash = partial_name_hash(*name++, hash);
+		hash = partial_name_hash((unsigned char)*name++, hash);
 	return end_name_hash(hash);
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hash_string(const char *name)
+{
+	unsigned long hash = init_name_hash();
+	unsigned long len = 0, c;
+
+	c = (unsigned char)*name;
+	do {
+		len++;
+		hash = partial_name_hash(c, hash);
+		c = (unsigned char)name[len];
+	} while (c);
+	return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(hash_string);
+
 /*
  * We know there's a real path component here of at least
  * one character.
diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h
index 2eaaaf6d2776..451771d9b9c0 100644
--- a/include/linux/stringhash.h
+++ b/include/linux/stringhash.h
@@ -1,7 +1,8 @@
 #ifndef __LINUX_STRINGHASH_H
 #define __LINUX_STRINGHASH_H
 
-#include <linux/types.h>
+#include <linux/compiler.h>	/* For __pure */
+#include <linux/types.h>	/* For u32, u64 */
 
 /*
  * Routines for hashing strings of bytes to a 32-bit hash value.
@@ -59,7 +60,7 @@ static inline unsigned long end_name_hash(unsigned long hash)
  *
  * If not set, this falls back to a wrapper around the preceding.
  */
-extern unsigned int full_name_hash(const unsigned char *, unsigned int);
+extern unsigned int __pure full_name_hash(const char *, unsigned int);
 
 /*
  * A hash_len is a u64 with the hash of a string in the low
@@ -69,4 +70,7 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int);
 #define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
 #define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+extern u64 __pure hashlen_string(const char *name);
+
 #endif	/* __LINUX_STRINGHASH_H */
-- 
cgit v1.2.3


From 917ea166f4672ec085f2cccc135c7c0eec72282c Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Fri, 20 May 2016 13:31:33 -0400
Subject: <linux/sunrpc/svcauth.h>: Define hash_str() in terms of
 hashlen_string()

Finally, the first use of previous two patches: eliminate the
separate ad-hoc string hash functions in the sunrpc code.

Now hash_str() is a wrapper around hash_string(), and hash_mem() is
likewise a wrapper around full_name_hash().

Note that sunrpc code *does* call hash_mem() with a zero length, which
is why the previous patch needed to handle that in full_name_hash().
(Thanks, Bruce, for finding that!)

This also eliminates the only caller of hash_long which asks for
more than 32 bits of output.

The comment about the quality of hashlen_string() and full_name_hash()
is jumping the gun by a few patches; they aren't very impressive now,
but will be improved greatly later in the series.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
Tested-by: J. Bruce Fields <bfields@redhat.com>
Acked-by: J. Bruce Fields <bfields@redhat.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: linux-nfs@vger.kernel.org
---
 include/linux/sunrpc/svcauth.h | 40 +++++++++-------------------------------
 1 file changed, 9 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index c00f53a4ccdd..91d5a5d6f52b 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -16,6 +16,7 @@
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/gss_api.h>
 #include <linux/hash.h>
+#include <linux/stringhash.h>
 #include <linux/cred.h>
 
 struct svc_cred {
@@ -165,41 +166,18 @@ extern int svcauth_unix_set_client(struct svc_rqst *rqstp);
 extern int unix_gid_cache_create(struct net *net);
 extern void unix_gid_cache_destroy(struct net *net);
 
-static inline unsigned long hash_str(char *name, int bits)
+/*
+ * The <stringhash.h> functions are good enough that we don't need to
+ * use hash_32() on them; just extracting the high bits is enough.
+ */
+static inline unsigned long hash_str(char const *name, int bits)
 {
-	unsigned long hash = 0;
-	unsigned long l = 0;
-	int len = 0;
-	unsigned char c;
-	do {
-		if (unlikely(!(c = *name++))) {
-			c = (char)len; len = -1;
-		}
-		l = (l << 8) | c;
-		len++;
-		if ((len & (BITS_PER_LONG/8-1))==0)
-			hash = hash_long(hash^l, BITS_PER_LONG);
-	} while (len);
-	return hash >> (BITS_PER_LONG - bits);
+	return hashlen_hash(hashlen_string(name)) >> (32 - bits);
 }
 
-static inline unsigned long hash_mem(char *buf, int length, int bits)
+static inline unsigned long hash_mem(char const *buf, int length, int bits)
 {
-	unsigned long hash = 0;
-	unsigned long l = 0;
-	int len = 0;
-	unsigned char c;
-	do {
-		if (len == length) {
-			c = (char)len; len = -1;
-		} else
-			c = *buf++;
-		l = (l << 8) | c;
-		len++;
-		if ((len & (BITS_PER_LONG/8-1))==0)
-			hash = hash_long(hash^l, BITS_PER_LONG);
-	} while (len);
-	return hash >> (BITS_PER_LONG - bits);
+	return full_name_hash(buf, length) >> (32 - bits);
 }
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 92d567740f2ab5937b2c23bee94ea4b284bb1f98 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Thu, 26 May 2016 22:22:01 -0400
Subject: Change hash_64() return value to 32 bits

That's all that's ever asked for, and it makes the return
type of hash_long() consistent.

It also allows (upcoming patch) an optimized implementation
of hash_64 on 32-bit machines.

I tried adding a BUILD_BUG_ON to ensure the number of bits requested
was never more than 32 (most callers use a compile-time constant), but
adding <linux/bug.h> to <linux/hash.h> breaks the tools/perf compiler
unless tools/perf/MANIFEST is updated, and understanding that code base
well enough to update it is too much trouble.  I did the rest of an
allyesconfig build with such a check, and nothing tripped.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
---
 include/linux/hash.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hash.h b/include/linux/hash.h
index 79c52fa81cac..f967dedb10e2 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -48,7 +48,7 @@
 #define GOLDEN_RATIO_32 0x61C88647
 #define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
-static __always_inline u64 hash_64(u64 val, unsigned int bits)
+static __always_inline u32 hash_64(u64 val, unsigned int bits)
 {
 	u64 hash = val;
 
@@ -72,7 +72,7 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits)
 #endif
 
 	/* High bits are more random, so use them. */
-	return hash >> (64 - bits);
+	return (u32)(hash >> (64 - bits));
 }
 
 static inline u32 hash_32(u32 val, unsigned int bits)
@@ -84,7 +84,7 @@ static inline u32 hash_32(u32 val, unsigned int bits)
 	return hash >> (32 - bits);
 }
 
-static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
+static inline u32 hash_ptr(const void *ptr, unsigned int bits)
 {
 	return hash_long((unsigned long)ptr, bits);
 }
-- 
cgit v1.2.3


From ef703f49a6c5b909a85149bb6625c4ed0d697186 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Thu, 26 May 2016 23:00:23 -0400
Subject: Eliminate bad hash multipliers from hash_32() and  hash_64()

The "simplified" prime multipliers made very bad hash functions, so get rid
of them.  This completes the work of 689de1d6ca.

To avoid the inefficiency which was the motivation for the "simplified"
multipliers, hash_64() on 32-bit systems is changed to use a different
algorithm.  It makes two calls to hash_32() instead.

drivers/media/usb/dvb-usb-v2/af9015.c uses the old GOLDEN_RATIO_PRIME_32
for some horrible reason, so it inherits a copy of the old definition.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
Cc: Antti Palosaari <crope@iki.fi>
Cc: Mauro Carvalho Chehab <m.chehab@samsung.com>
---
 drivers/media/usb/dvb-usb-v2/af9015.c |  2 +
 include/linux/hash.h                  | 87 ++++++++++++++---------------------
 2 files changed, 36 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/usb/dvb-usb-v2/af9015.c b/drivers/media/usb/dvb-usb-v2/af9015.c
index 95a7388e89d4..09e0f58f6bb7 100644
--- a/drivers/media/usb/dvb-usb-v2/af9015.c
+++ b/drivers/media/usb/dvb-usb-v2/af9015.c
@@ -398,6 +398,8 @@ error:
 }
 
 #define AF9015_EEPROM_SIZE 256
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
 
 /* hash (and dump) eeprom */
 static int af9015_eeprom_hash(struct dvb_usb_device *d)
diff --git a/include/linux/hash.h b/include/linux/hash.h
index f967dedb10e2..613cfde3a1e0 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -3,85 +3,65 @@
 /* Fast hashing routine for ints,  longs and pointers.
    (C) 2002 Nadia Yvette Chambers, IBM */
 
-/*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
- * Chuck Lever verified the effectiveness of this technique:
- * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
- *
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
- */
-
 #include <asm/types.h>
 #include <linux/compiler.h>
 
-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
-/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
-
+/*
+ * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and
+ * fs/inode.c.  It's not actually prime any more (the previous primes
+ * were actively bad for hashing), but the name remains.
+ */
 #if BITS_PER_LONG == 32
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
 #define hash_long(val, bits) hash_32(val, bits)
 #elif BITS_PER_LONG == 64
 #define hash_long(val, bits) hash_64(val, bits)
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
 #else
 #error Wordsize not 32 or 64
 #endif
 
 /*
- * The above primes are actively bad for hashing, since they are
- * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
- * real problems. Besides, the "prime" part is pointless for the
- * multiplicative hash.
+ * This hash multiplies the input by a large odd number and takes the
+ * high bits.  Since multiplication propagates changes to the most
+ * significant end only, it is essential that the high bits of the
+ * product be used for the hash value.
+ *
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
  *
  * Although a random odd number will do, it turns out that the golden
  * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
- * properties.
+ * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
  *
- * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
- * (See Knuth vol 3, section 6.4, exercise 9.)
+ * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
+ * which is very slightly easier to multiply by and makes no
+ * difference to the hash distribution.
  */
 #define GOLDEN_RATIO_32 0x61C88647
 #define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
-static __always_inline u32 hash_64(u64 val, unsigned int bits)
-{
-	u64 hash = val;
-
-#if BITS_PER_LONG == 64
-	hash = hash * GOLDEN_RATIO_64;
-#else
-	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
-	u64 n = hash;
-	n <<= 18;
-	hash -= n;
-	n <<= 33;
-	hash -= n;
-	n <<= 3;
-	hash += n;
-	n <<= 3;
-	hash -= n;
-	n <<= 4;
-	hash += n;
-	n <<= 2;
-	hash += n;
-#endif
 
-	/* High bits are more random, so use them. */
-	return (u32)(hash >> (64 - bits));
+static inline u32 __hash_32(u32 val)
+{
+	return val * GOLDEN_RATIO_32;
 }
 
 static inline u32 hash_32(u32 val, unsigned int bits)
 {
-	/* On some cpus multiply is faster, on others gcc will do shifts */
-	u32 hash = val * GOLDEN_RATIO_PRIME_32;
-
 	/* High bits are more random, so use them. */
-	return hash >> (32 - bits);
+	return __hash_32(val) >> (32 - bits);
+}
+
+static __always_inline u32 hash_64(u64 val, unsigned int bits)
+{
+#if BITS_PER_LONG == 64
+	/* 64x64-bit multiply is efficient on all 64-bit processors */
+	return val * GOLDEN_RATIO_64 >> (64 - bits);
+#else
+	/* Hash 64 bits using only 32x32-bit multiply. */
+	return hash_32((u32)val ^ __hash_32(val >> 32), bits);
+#endif
 }
 
 static inline u32 hash_ptr(const void *ptr, unsigned int bits)
@@ -89,6 +69,7 @@ static inline u32 hash_ptr(const void *ptr, unsigned int bits)
 	return hash_long((unsigned long)ptr, bits);
 }
 
+/* This really should be called fold32_ptr; it does no hashing to speak of. */
 static inline u32 hash32_ptr(const void *ptr)
 {
 	unsigned long val = (unsigned long)ptr;
-- 
cgit v1.2.3


From 468a9428521e7d00fb21250af363eb94dc1d6861 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@sciencehorizons.net>
Date: Thu, 26 May 2016 22:11:51 -0400
Subject: <linux/hash.h>: Add support for architecture-specific functions

This is just the infrastructure; there are no users yet.

This is modelled on CONFIG_ARCH_RANDOM; a CONFIG_ symbol declares
the existence of <asm/hash.h>.

That file may define its own versions of various functions, and define
HAVE_* symbols (no CONFIG_ prefix!) to suppress the generic ones.

Included is a self-test (in lib/test_hash.c) that verifies the basics.
It is NOT in general required that the arch-specific functions compute
the same thing as the generic, but if a HAVE_* symbol is defined with
the value 1, then equality is tested.

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: Andreas Schwab <schwab@linux-m68k.org>
Cc: Philippe De Muyter <phdm@macq.eu>
Cc: linux-m68k@lists.linux-m68k.org
Cc: Alistair Francis <alistai@xilinx.com>
Cc: Michal Simek <michal.simek@xilinx.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: uclinux-h8-devel@lists.sourceforge.jp
---
 arch/Kconfig         |   8 ++
 fs/namei.c           |   6 +-
 include/linux/hash.h |  27 +++++-
 lib/Kconfig.debug    |  11 +++
 lib/Makefile         |   1 +
 lib/test_hash.c      | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 299 insertions(+), 4 deletions(-)
 create mode 100644 lib/test_hash.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5e7e17..96406e4db995 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -589,6 +589,14 @@ config HAVE_STACK_VALIDATION
 	  Architecture supports the 'objtool check' host tool command, which
 	  performs compile-time stack metadata validation.
 
+config HAVE_ARCH_HASH
+	bool
+	default n
+	help
+	  If this is set, the architecture provides an <asm/hash.h>
+	  file which provides platform-specific implementations of some
+	  functions in <linux/hash.h> or fs/namei.c.
+
 #
 # ABI hall of shame
 #
diff --git a/fs/namei.c b/fs/namei.c
index a49cbd7efcaa..968dae025230 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1788,7 +1788,11 @@ static int walk_component(struct nameidata *nd, int flags)
 
 #include <asm/word-at-a-time.h>
 
-#ifdef CONFIG_64BIT
+#ifdef HASH_MIX
+
+/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
+
+#elif defined(CONFIG_64BIT)
 /*
  * Register pressure in the mixing function is an issue, particularly
  * on 32-bit x86, but almost any function requires one state value and
diff --git a/include/linux/hash.h b/include/linux/hash.h
index 613cfde3a1e0..ad6fa21d977b 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -41,19 +41,40 @@
 #define GOLDEN_RATIO_32 0x61C88647
 #define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
+#ifdef CONFIG_HAVE_ARCH_HASH
+/* This header may use the GOLDEN_RATIO_xx constants */
+#include <asm/hash.h>
+#endif
 
-static inline u32 __hash_32(u32 val)
+/*
+ * The _generic versions exist only so lib/test_hash.c can compare
+ * the arch-optimized versions with the generic.
+ *
+ * Note that if you change these, any <asm/hash.h> that aren't updated
+ * to match need to have their HAVE_ARCH_* define values updated so the
+ * self-test will not false-positive.
+ */
+#ifndef HAVE_ARCH__HASH_32
+#define __hash_32 __hash_32_generic
+#endif
+static inline u32 __hash_32_generic(u32 val)
 {
 	return val * GOLDEN_RATIO_32;
 }
 
-static inline u32 hash_32(u32 val, unsigned int bits)
+#ifndef HAVE_ARCH_HASH_32
+#define hash_32 hash_32_generic
+#endif
+static inline u32 hash_32_generic(u32 val, unsigned int bits)
 {
 	/* High bits are more random, so use them. */
 	return __hash_32(val) >> (32 - bits);
 }
 
-static __always_inline u32 hash_64(u64 val, unsigned int bits)
+#ifndef HAVE_ARCH_HASH_64
+#define hash_64 hash_64_generic
+#endif
+static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
 {
 #if BITS_PER_LONG == 64
 	/* 64x64-bit multiply is efficient on all 64-bit processors */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1e9a607534ca..18ec69ba8eb6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1815,6 +1815,17 @@ config TEST_RHASHTABLE
 
 	  If unsure, say N.
 
+config TEST_HASH
+	tristate "Perform selftest on hash functions"
+	default n
+	help
+	  Enable this option to test the kernel's integer (<linux/hash,h>)
+	  and string (<linux/stringhash.h>) hash functions on boot
+	  (or module load).
+
+	  This is intended to help people writing architecture-specific
+	  optimized versions.  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436c97..f80b1a1b3afd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
diff --git a/lib/test_hash.c b/lib/test_hash.c
new file mode 100644
index 000000000000..c9549c8b4909
--- /dev/null
+++ b/lib/test_hash.c
@@ -0,0 +1,250 @@
+/*
+ * Test cases for <linux/hash.h> and <linux/stringhash.h>
+ * This just verifies that various ways of computing a hash
+ * produce the same thing and, for cases where a k-bit hash
+ * value is requested, is of the requested size.
+ *
+ * We fill a buffer with a 255-byte null-terminated string,
+ * and use both full_name_hash() and hashlen_string() to hash the
+ * substrings from i to j, where 0 <= i < j < 256.
+ *
+ * The returned values are used to check that __hash_32() and
+ * __hash_32_generic() compute the same thing.  Likewise hash_32()
+ * and hash_64().
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n"
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/stringhash.h>
+#include <linux/printk.h>
+
+/* 32-bit XORSHIFT generator.  Seed must not be zero. */
+static u32 __init __attribute_const__
+xorshift(u32 seed)
+{
+	seed ^= seed << 13;
+	seed ^= seed >> 17;
+	seed ^= seed << 5;
+	return seed;
+}
+
+/* Given a non-zero x, returns a non-zero byte. */
+static u8 __init __attribute_const__
+mod255(u32 x)
+{
+	x = (x & 0xffff) + (x >> 16);	/* 1 <= x <= 0x1fffe */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0x2fd */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0x100 */
+	x = (x & 0xff) + (x >> 8);	/* 1 <= x <= 0xff */
+	return x;
+}
+
+/* Fill the buffer with non-zero bytes. */
+static void __init
+fill_buf(char *buf, size_t len, u32 seed)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		seed = xorshift(seed);
+		buf[i] = mod255(seed);
+	}
+}
+
+/*
+ * Test the various integer hash functions.  h64 (or its low-order bits)
+ * is the integer to hash.  hash_or accumulates the OR of the hash values,
+ * which are later checked to see that they cover all the requested bits.
+ *
+ * Because these functions (as opposed to the string hashes) are all
+ * inline, the code being tested is actually in the module, and you can
+ * recompile and re-test the module without rebooting.
+ */
+static bool __init
+test_int_hash(unsigned long long h64, u32 hash_or[2][33])
+{
+	int k;
+	u32 h0 = (u32)h64, h1, h2;
+
+	/* Test __hash32 */
+	hash_or[0][0] |= h1 = __hash_32(h0);
+#ifdef HAVE_ARCH__HASH_32
+	hash_or[1][0] |= h2 = __hash_32_generic(h0);
+#if HAVE_ARCH__HASH_32 == 1
+	if (h1 != h2) {
+		pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x",
+			h0, h1, h2);
+		return false;
+	}
+#endif
+#endif
+
+	/* Test k = 1..32 bits */
+	for (k = 1; k <= 32; k++) {
+		u32 const m = ((u32)2 << (k-1)) - 1;	/* Low k bits set */
+
+		/* Test hash_32 */
+		hash_or[0][k] |= h1 = hash_32(h0, k);
+		if (h1 > m) {
+			pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m);
+			return false;
+		}
+#ifdef HAVE_ARCH_HASH_32
+		h2 = hash_32_generic(h0, k);
+#if HAVE_ARCH_HASH_32 == 1
+		if (h1 != h2) {
+			pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() "
+				" = %#x", h0, k, h1, h2);
+			return false;
+		}
+#else
+		if (h2 > m) {
+			pr_err("hash_32_generic(%#x, %d) = %#x > %#x",
+				h0, k, h1, m);
+			return false;
+		}
+#endif
+#endif
+		/* Test hash_64 */
+		hash_or[1][k] |= h1 = hash_64(h64, k);
+		if (h1 > m) {
+			pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m);
+			return false;
+		}
+#ifdef HAVE_ARCH_HASH_64
+		h2 = hash_64_generic(h64, k);
+#if HAVE_ARCH_HASH_64 == 1
+		if (h1 != h2) {
+			pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() "
+				"= %#x", h64, k, h1, h2);
+			return false;
+		}
+#else
+		if (h2 > m) {
+			pr_err("hash_64_generic(%#llx, %d) = %#x > %#x",
+				h64, k, h1, m);
+			return false;
+		}
+#endif
+#endif
+	}
+
+	(void)h2;	/* Suppress unused variable warning */
+	return true;
+}
+
+#define SIZE 256	/* Run time is cubic in SIZE */
+
+static int __init
+test_hash_init(void)
+{
+	char buf[SIZE+1];
+	u32 string_or = 0, hash_or[2][33] = { 0 };
+	unsigned tests = 0;
+	unsigned long long h64 = 0;
+	int i, j;
+
+	fill_buf(buf, SIZE, 1);
+
+	/* Test every possible non-empty substring in the buffer. */
+	for (j = SIZE; j > 0; --j) {
+		buf[j] = '\0';
+
+		for (i = 0; i <= j; i++) {
+			u64 hashlen = hashlen_string(buf+i);
+			u32 h0 = full_name_hash(buf+i, j-i);
+
+			/* Check that hashlen_string gets the length right */
+			if (hashlen_len(hashlen) != j-i) {
+				pr_err("hashlen_string(%d..%d) returned length"
+					" %u, expected %d",
+					i, j, hashlen_len(hashlen), j-i);
+				return -EINVAL;
+			}
+			/* Check that the hashes match */
+			if (hashlen_hash(hashlen) != h0) {
+				pr_err("hashlen_string(%d..%d) = %08x != "
+					"full_name_hash() = %08x",
+					i, j, hashlen_hash(hashlen), h0);
+				return -EINVAL;
+			}
+
+			string_or |= h0;
+			h64 = h64 << 32 | h0;	/* For use with hash_64 */
+			if (!test_int_hash(h64, hash_or))
+				return -EINVAL;
+			tests++;
+		} /* i */
+	} /* j */
+
+	/* The OR of all the hash values should cover all the bits */
+	if (~string_or) {
+		pr_err("OR of all string hash results = %#x != %#x",
+			string_or, -1u);
+		return -EINVAL;
+	}
+	if (~hash_or[0][0]) {
+		pr_err("OR of all __hash_32 results = %#x != %#x",
+			hash_or[0][0], -1u);
+		return -EINVAL;
+	}
+#ifdef HAVE_ARCH__HASH_32
+#if HAVE_ARCH__HASH_32 != 1	/* Test is pointless if results match */
+	if (~hash_or[1][0]) {
+		pr_err("OR of all __hash_32_generic results = %#x != %#x",
+			hash_or[1][0], -1u);
+		return -EINVAL;
+	}
+#endif
+#endif
+
+	/* Likewise for all the i-bit hash values */
+	for (i = 1; i <= 32; i++) {
+		u32 const m = ((u32)2 << (i-1)) - 1;	/* Low i bits set */
+
+		if (hash_or[0][i] != m) {
+			pr_err("OR of all hash_32(%d) results = %#x "
+				"(%#x expected)", i, hash_or[0][i], m);
+			return -EINVAL;
+		}
+		if (hash_or[1][i] != m) {
+			pr_err("OR of all hash_64(%d) results = %#x "
+				"(%#x expected)", i, hash_or[1][i], m);
+			return -EINVAL;
+		}
+	}
+
+	/* Issue notices about skipped tests. */
+#ifndef HAVE_ARCH__HASH_32
+	pr_info("__hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH__HASH_32 != 1
+	pr_info("__hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_32
+	pr_info("hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_32 != 1
+	pr_info("hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_64
+	pr_info("hash_64() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_64 != 1
+	pr_info("hash_64() is arch-specific; not compared to generic.");
+#endif
+
+	pr_notice("%u tests passed.", tests);
+
+	return 0;
+}
+
+static void __exit test_hash_exit(void)
+{
+}
+
+module_init(test_hash_init);	/* Does everything */
+module_exit(test_hash_exit);	/* Does nothing */
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From dd5f1b049dc139876801db3cdd0f20d21fd428cc Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 2 Jun 2016 09:00:28 +0100
Subject: irqchip/gic-v3: Fix ICC_SGI1R_EL1.INTID decoding mask

The INTID mask is wrong, and is made a signed value, which has
nteresting effects in the KVM emulation. Let's sanitize it.

Cc: stable@vger.kernel.org
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index bfbd707de390..85a8c2acdef5 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -305,7 +305,7 @@
 #define ICC_SGI1R_AFFINITY_1_SHIFT	16
 #define ICC_SGI1R_AFFINITY_1_MASK	(0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
 #define ICC_SGI1R_SGI_ID_SHIFT		24
-#define ICC_SGI1R_SGI_ID_MASK		(0xff << ICC_SGI1R_SGI_ID_SHIFT)
+#define ICC_SGI1R_SGI_ID_MASK		(0xfULL << ICC_SGI1R_SGI_ID_SHIFT)
 #define ICC_SGI1R_AFFINITY_2_SHIFT	32
 #define ICC_SGI1R_AFFINITY_2_MASK	(0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
 #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT	40
-- 
cgit v1.2.3


From fab0cdc30d81694d2d5524b24e42c43414971719 Mon Sep 17 00:00:00 2001
From: Andrew Jones <drjones@redhat.com>
Date: Thu, 12 May 2016 10:46:34 +0200
Subject: irqchip/gic-v3: Fix copy+paste mistakes in defines

ICC_SGI1R_AFFINITY_{2,3}_MASK are unused, which is good
because they were defined with the wrong shifts.

Signed-off-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 85a8c2acdef5..dc493e0f0ff7 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -307,10 +307,10 @@
 #define ICC_SGI1R_SGI_ID_SHIFT		24
 #define ICC_SGI1R_SGI_ID_MASK		(0xfULL << ICC_SGI1R_SGI_ID_SHIFT)
 #define ICC_SGI1R_AFFINITY_2_SHIFT	32
-#define ICC_SGI1R_AFFINITY_2_MASK	(0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_AFFINITY_2_MASK	(0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT)
 #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT	40
 #define ICC_SGI1R_AFFINITY_3_SHIFT	48
-#define ICC_SGI1R_AFFINITY_3_MASK	(0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_AFFINITY_3_MASK	(0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT)
 
 #include <asm/arch_gicv3.h>
 
-- 
cgit v1.2.3