From 1deaf9d19776916cd1c83191503bf327319a301a Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Thu, 14 Jan 2016 15:16:56 -0800
Subject: fs/notify/inode_mark.c: use list_next_entry in fsnotify_unmount_inodes

To make the intention clearer, use list_next_entry instead of list_entry.

Signed-off-by: Geliang Tang
Reviewed-by: Jan Kara
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/notify/inode_mark.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
 			break;
 		}
 		spin_unlock(&next_i->i_lock);
-		next_i = list_entry(next_i->i_sb_list.next,
-				    struct inode, i_sb_list);
+		next_i = list_next_entry(next_i, i_sb_list);
 	}

 	/*
-- cgit v1.2.3

From c510eff6bebaa244e577b8f499e86606b5e5d4c7 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Thu, 14 Jan 2016 15:16:59 -0800
Subject: fsnotify: destroy marks with call_srcu instead of dedicated thread

At the time that this code was originally written, call_srcu didn't exist, so this thread was required to ensure that we waited for that SRCU grace period to settle before finally freeing the object.

It does exist now however, and we can much more efficiently use call_srcu to handle this. That also allows us to potentially use srcu_barrier to ensure that all of the callbacks have run before proceeding.

In order to conserve space, we union the rcu_head with the g_list.

This will be necessary for nfsd, which will allocate marks from a dedicated slabcache. We have to be able to ensure that all of the objects are destroyed before destroying the cache. That's fairly simple now, since srcu_barrier lets us wait for all outstanding callbacks to complete first.

Signed-off-by: Jeff Layton
Cc: Eric Paris
Reviewed-by: Jan Kara
Cc: "Paul E. McKenney"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/notify/mark.c                 | 66 +++++++++-------------------------------
 include/linux/fsnotify_backend.h |  5 ++-
 2 files changed, 18 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..cfcbf114676e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
 #include "fsnotify.h"

 struct srcu_struct fsnotify_mark_srcu;
-static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);

 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 	atomic_dec(&group->num_marks);
 }

+static void
+fsnotify_mark_free_rcu(struct rcu_head *rcu)
+{
+	struct fsnotify_mark *mark;
+
+	mark = container_of(rcu, struct fsnotify_mark, g_rcu);
+	fsnotify_put_mark(mark);
+}
+
 /*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a call_srcu
+ * callback. Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); /* * Some groups like to know that marks are being freed. This is a @@ -385,11 +388,7 @@ err: spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); - + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); return ret; } @@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } - -static int fsnotify_mark_destroy(void *ignored) -{ - struct fsnotify_mark *mark, *next; - struct list_head private_destroy_list; - - for (;;) { - spin_lock(&destroy_lock); - /* exchange the list head */ - list_replace_init(&destroy_list, &private_destroy_list); - spin_unlock(&destroy_lock); - - synchronize_srcu(&fsnotify_mark_srcu); - - list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { - list_del_init(&mark->g_list); - fsnotify_put_mark(mark); - } - - wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); - } - - return 0; -} - -static int __init fsnotify_mark_init(void) -{ - struct task_struct *thread; - - thread = kthread_run(fsnotify_mark_destroy, NULL, - "fsnotify_mark"); - if (IS_ERR(thread)) - panic("unable to start fsnotify mark destruction thread."); - - return 0; -} -device_initcall(fsnotify_mark_init); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 533c4408529a..6b7e89f45aa4 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -220,7 +220,10 @@ struct fsnotify_mark { /* List of marks by group->i_fsnotify_marks. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ - struct list_head g_list; + union { + struct list_head g_list; + struct rcu_head g_rcu; + }; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [obj_lock] */ -- cgit v1.2.3 From 2886357b24662fcf7dc138c9d5e3166e99a55750 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Jan 2016 15:17:05 -0800 Subject: logfs: fix logfs build errors and dependencies Fix build errors that happen when CONFIG_LOGFS=y and CONFIG_MTD=m: fs/built-in.o: In function `logfs_mount': super.c:(.text+0x92a6f): undefined reference to `logfs_get_sb_mtd' fs/built-in.o: In function `logfs_get_sb_bdev': (.text+0x93530): undefined reference to `logfs_get_sb_mtd' This patch avoids the error by changing the dependencies of logfs in a way that we can no longer configure logfs as built-in when the MTD core is a loadable module, while leaving the dependency to require at least one of MTD or BLOCK to be enabled. 
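To see why the new expression forbids the broken combination, recall Kconfig's tristate algebra: n/m/y act as 0/1/2, "||" is max, "&&" is min, "!x" is 2 - x, and "depends on EXPR" caps a symbol at the value of EXPR. The following stand-alone C toy model (an illustration, not kernel code) evaluates both the old and the new dependency for the failing combination MTD=m, BLOCK=y; the old cap comes out y (allowing the broken built-in logfs), the new one m:

	#include <stdio.h>

	/* Toy model of Kconfig tristate logic: n=0, m=1, y=2.
	 * "||" is max(), "&&" is min(), "!x" is 2 - x, and
	 * "depends on EXPR" caps the symbol at the value of EXPR.
	 */
	enum { n = 0, m = 1, y = 2 };

	static int tri_or(int a, int b)  { return a > b ? a : b; }
	static int tri_and(int a, int b) { return a < b ? a : b; }
	static int tri_not(int a)        { return 2 - a; }

	int main(void)
	{
		int MTD = m, BLOCK = y;	/* the combination that broke the build */

		/* old: depends on (MTD || BLOCK) */
		int old_cap = tri_or(MTD, BLOCK);
		/* new: depends on MTD || (!MTD && BLOCK) */
		int new_cap = tri_or(MTD, tri_and(tri_not(MTD), BLOCK));

		printf("old cap: %d (LOGFS=y allowed -> link error)\n", old_cap);
		printf("new cap: %d (LOGFS limited to m or n)\n", new_cap);
		return 0;
	}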
Signed-off-by: Arnd Bergmann
Signed-off-by: Randy Dunlap
Cc: Michal Marek
Cc: Peter Chen
Cc: Randy Dunlap
Cc: Joern Engel
Cc: Prasad Joshi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/logfs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0221..2b4503163930 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
 config LOGFS
 	tristate "LogFS file system"
-	depends on (MTD || BLOCK)
+	depends on MTD || (!MTD && BLOCK)
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select CRC32
-- cgit v1.2.3

From 3eb5bdf0f47726e3ef0a875e0cce3cca462ee70d Mon Sep 17 00:00:00 2001
From: "Norton.Zhu"
Date: Thu, 14 Jan 2016 15:17:09 -0800
Subject: ocfs2: optimize bad declarations and redundant assignment

In ocfs2_parse_options(), a) it's better to declare the (small) variables outside of the while loop; b) 'option' will be set by match_int(), so 'option = 0;' makes no sense; if match_int() fails, the code just goes to bail and returns.

Signed-off-by: Norton.Zhu
Reviewed-by: Joseph Qi
Cc: Gang He
Cc: Mark Fasheh
Acked-by: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/super.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..64dcd787fa80 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
 	int status, user_stack = 0;
 	char *p;
 	u32 tmp;
+	int token, option;
+	substring_t args[MAX_OPT_ARGS];

 	trace_ocfs2_parse_options(is_remount, options ? options : "(none)");

@@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 	}

 	while ((p = strsep(&options, ",")) != NULL) {
-		int token, option;
-		substring_t args[MAX_OPT_ARGS];
-
 		if (!*p)
 			continue;

@@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->atime_quantum = option;
 			break;
 		case Opt_slot:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
@@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 				mopt->slot = (s16)option;
 			break;
 		case Opt_commit:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
@@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->commit_interval = HZ * option;
 			break;
 		case Opt_localalloc:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
-- cgit v1.2.3

From d6364627eff4c9a2035cebb3aec7415705ba117b Mon Sep 17 00:00:00 2001
From: John Haxby
Date: Thu, 14 Jan 2016 15:17:12 -0800
Subject: ocfs2: return non-zero st_blocks for inline data

Some versions of tar assume that files with st_blocks == 0 do not contain any data and will skip reading them entirely. See also commit 9206c561554c ("ext4: return non-zero st_blocks for inline data").

Signed-off-by: John Haxby
Reviewed-by: Mark Fasheh
Cc: Joel Becker
Acked-by: Gang He
Reviewed-by: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/file.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..d63127932509 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
 	}

 	generic_fillattr(inode, stat);
+	/*
+	 * If there is inline data in the inode, the inode will normally not
+	 * have data blocks allocated (it may have an external xattr block).
+	 * Report at least one sector for such files, so tools like tar, rsync,
+	 * others don't incorrectly think the file is completely sparse.
+	 */
+	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		stat->blocks += (stat->size + 511)>>9;

 	/* We set the blksize from the cluster size for performance */
 	stat->blksize = osb->s_clustersize;
-- cgit v1.2.3

From a84ac334dcb44c76f0b051513a6c27a2d747f883 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Thu, 14 Jan 2016 15:17:15 -0800
Subject: ocfs2: o2hb: increase unsteady iterations

When running multiple xattr tests of ocfs2-test on a three-node cluster, mount sometimes failed with the following message.

o2hb: Unable to stabilize heartbeart on region D18B775E758D4D80837E8CF3D086AD4A (xvdb)

Stabilizing the heartbeat depends on the order in which cluster nodes mount ocfs2 and on how fast the TCP connections are established. So increase the unsteady iterations to leave more time for it.

Signed-off-by: Junxiao Bi
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 709fbbd44c65..a3cc6d2fc896 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	}
 	++live_threshold;
 	atomic_set(&reg->hr_steady_iterations, live_threshold);
-	/* unsteady_iterations is double the steady_iterations */
-	atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+	/* unsteady_iterations is triple the steady_iterations */
+	atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));

 	hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
 			      reg->hr_item.ci_name);
-- cgit v1.2.3

From 30bee898f86506893883ffb8db20d8101a29b5f5 Mon Sep 17 00:00:00 2001
From: Xue jiufei
Date: Thu, 14 Jan 2016 15:17:18 -0800
Subject: ocfs2/dlm: fix a race between purge and migration

We found a race between purge and migration when doing code review. Node A puts a lockres on the purge list before receiving the migrate message from node B, which is the master. Node A then calls dlm_mig_lockres_handler to handle this message.
dlm_mig_lockres_handler
	dlm_lookup_lockres
	>>>>>> race window: dlm_run_purge_list may run and send a deref
	       message to the master, waiting for the response
	spin_lock(&res->spinlock);
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	dlm_mig_lockres_handler returns

>>>>>> dlm_thread receives the response from the master for the deref message and triggers the BUG, because the lockres has the state DLM_LOCK_RES_MIGRATING, with the following message:

dlm_purge_lockres:209 ERROR: 6633EB681FA7474A9C280A4E1A836F0F: res M0000000000000000030c0300000000 in use after deref

Signed-off-by: Jiufei Xue
Reviewed-by: Joseph Qi
Reviewed-by: Yiwen Jiang
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmrecovery.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..86fb53614bf4 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 	char *buf = NULL;
 	struct dlm_work_item *item = NULL;
 	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;

 	if (!dlm_grab(dlm))
 		return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 	/* lookup the lock to see if we have a secondary queue for this
 	 * already... just add the locks in and this will have its owner
 	 * and RECOVERY flag changed when it completes. */
-	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+			hash);
 	if (res) {
 		/* this will get a ref on res */
 		/* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 				  mres->lockname_len, mres->lockname);
 			ret = -EFAULT;
 			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
 			dlm_lockres_put(res);
 			goto leave;
 		}
 		res->state |= DLM_LOCK_RES_MIGRATING;
 	}
 	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
 } else {
+	spin_unlock(&dlm->spinlock);
 	/* need to allocate, just like if it was
 	 * mastered here normally */
 	res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
-- cgit v1.2.3

From 9e62dc096ef39ccf45fc47e8612f3c5d361af038 Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Thu, 14 Jan 2016 15:17:21 -0800
Subject: ocfs2: constify ocfs2_extent_tree_operations structures

The ocfs2_extent_tree_operations structures are never modified, so declare them as const.

Done with the help of Coccinelle.
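The pattern being applied generalizes; here is a minimal stand-alone C sketch (hypothetical names, not the ocfs2 structures) of why const-qualifying a never-modified ops table is attractive: the table can be placed in read-only memory, and accidental writes become compile errors.

	#include <stdio.h>

	struct ops {
		void (*hello)(void);
	};

	static void say_hello(void)
	{
		puts("hello");
	}

	/* const: the table can live in .rodata; writes are compile errors */
	static const struct ops my_ops = {
		.hello = say_hello,
	};

	struct holder {
		const struct ops *ops;	/* users hold a pointer-to-const */
	};

	int main(void)
	{
		struct holder h = { .ops = &my_ops };

		h.ops->hello();
		/* my_ops.hello = NULL; -- error: assignment of read-only variable */
		return 0;
	}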
Signed-off-by: Julia Lawall Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 12 ++++++------ fs/ocfs2/alloc.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d6526dc..2a0aeb77ec42 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db162..f3dc1b0dfffc 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; -- cgit v1.2.3 From b5560143385e18b4109ad6951c7719705e3dd995 Mon Sep 17 00:00:00 2001 
From: jiangyiwen
Date: Thu, 14 Jan 2016 15:17:23 -0800
Subject: ocfs2/dlm: wait until DLM_LOCK_RES_SETREF_INPROG is cleared in dlm_deref_lockres_worker

Commit f3f854648de6 ("ocfs2_dlm: Ensure correct ordering of set/clear refmap bit on lockres") still leaves a race, so the ordering cannot be guaranteed to be exactly correct.

Node1:  umount, migrate lockres to Node2
Node2:  migrate finished, send migrate request to Node3
Node3:  received migrate request, create a migration_mle, respond to Node2
Node2:  set DLM_LOCK_RES_SETREF_INPROG and send assert master to Node3
Node3:  delete migration_mle in assert_master_handler; Node3 umounts without
        responding yet
Node3:  dlm_thread purges this lockres, sends a drop deref message to Node2
Node2:  finds the DLM_LOCK_RES_SETREF_INPROG flag set and dispatches
        dlm_deref_lockres_worker to clear the refmap; but
        dlm_deref_lockres_worker only waits for DLM_LOCK_RES_SETREF_INPROG
        to be cleared if the node is in the refmap, so the worker completes
        successfully
Node3:  purge lockres, send assert master response to Node1, and finish umount
Node2:  set Node3 in the refmap; it will never be cleared, which leads to the
        umount hang

So wait in dlm_deref_lockres_worker until DLM_LOCK_RES_SETREF_INPROG is cleared, regardless of whether the node is in the refmap.

Signed-off-by: Yiwen Jiang
Reviewed-by: Joseph Qi
Reviewed-by: Junxiao Bi
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84f2f8079466..1fc172f89872 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 	spin_lock(&res->spinlock);
 	BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 	if (test_bit(node, res->refmap)) {
-		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 		dlm_lockres_clear_refmap_bit(dlm, res, node);
 		cleared = 1;
 	}
-- cgit v1.2.3

From 72865d92300a6ad20a8851887aeac4f41fd2c579 Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Thu, 14 Jan 2016 15:17:27 -0800
Subject: ocfs2: clean up redundant NULL check before iput

Since iput() takes care of the NULL check itself, a NULL check before calling it is redundant. So clean them up.
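The same idiom in miniature, as a stand-alone C sketch (put_thing() is hypothetical, standing in for iput()): once the release function tolerates NULL itself, every "if (x) put(x)" at the call sites collapses to a plain "put(x)".

	#include <stdlib.h>

	struct thing {
		int refcount;
	};

	/* Like iput(): accepts NULL, so callers need no check of their own. */
	static void put_thing(struct thing *t)
	{
		if (!t)
			return;
		if (--t->refcount == 0)
			free(t);
	}

	static void example(struct thing *maybe_null)
	{
		/* before: if (maybe_null) put_thing(maybe_null); */
		put_thing(maybe_null);	/* after: the NULL check is redundant */
	}

	int main(void)
	{
		example(NULL);	/* safe */
		return 0;
	}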
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 3 +-- fs/ocfs2/ioctl.c | 4 +--- fs/ocfs2/journal.c | 10 +++------- fs/ocfs2/localalloc.c | 10 +++------- fs/ocfs2/namei.c | 3 +-- fs/ocfs2/slot_map.c | 3 +-- fs/ocfs2/super.c | 3 +-- 7 files changed, 11 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2a0aeb77ec42..a3ded88718c9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccce60..16b0bb482ea7 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -606,9 +606,7 @@ bail: if (gb_inode) mutex_unlock(&gb_inode->i_mutex); - if (gb_inode) - iput(gb_inode); - + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe5b5..3772a2dbb980 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..e9c99e35f5ea 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -473,8 +472,7 @@ out_mutex: iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index afb81eae2c18..714320e420eb 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1683,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d44c8..47bba1be2a51 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 64dcd787fa80..c62d761d6bf0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1722,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); -- cgit v1.2.3 From c372f2193a2e73d5936bf37259ae63ca388b4cbc Mon Sep 17 00:00:00 2001 
From: Xue jiufei
Date: Thu, 14 Jan 2016 15:17:29 -0800
Subject: ocfs2/dlm: return appropriate value when dlm_grab() returns NULL

dlm_grab() may return NULL when the node is doing unmount. When doing code review, we found that some dlm handlers return an error to the caller when dlm_grab() returns NULL, causing the caller to BUG or hit other problems. Here is an example:

Node 1: receives migration message from node 3, and sends migrate request
        to others
Node 2: starts unmounting
Node 2: receives migrate request from node 1 and calls
        dlm_migrate_request_handler()
Node 2: unmount thread unregisters domain handlers and removes dlm_context
        from dlm_domains; dlm_migrate_request_handler() returns -EINVAL to
        node 1
Node 1: exits migration, neither clearing the migration state nor sending
        the assert master message to node 3, which causes node 3 to hang

Signed-off-by: Jiufei Xue
Reviewed-by: Joseph Qi
Reviewed-by: Yiwen Jiang
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c | 2 +-
 fs/ocfs2/dlm/dlmunlock.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 1fc172f89872..6f0748122117 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3050,7 +3050,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	int ret = 0;

 	if (!dlm_grab(dlm))
-		return -EINVAL;
+		return 0;

 	name = migrate->name;
 	namelen = migrate->namelen;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab68c..1082b2c3014b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
 	}

 	if (!dlm_grab(dlm))
-		return DLM_REJECTED;
+		return DLM_FORWARD;

 	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
 			"Domain %s not fully joined!\n", dlm->name);
-- cgit v1.2.3

From 1247017f43a93eae3d64b7c25f3637dc545f5a47 Mon Sep 17 00:00:00 2001
From: jiangyiwen
Date: Thu, 14 Jan 2016 15:17:33 -0800
Subject: ocfs2: fix slot overwritten if storage link down during mount

The following case will lead to a slot being overwritten.

N1: mount ocfs2 volume, find and allocate slot 0, set osb->slot_num to 0,
    begin writing the slot info to disk
N2: mount ocfs2 volume, wait for the super lock
N1: the block write fails because the storage link is down; unlock the
    super lock
N2: get the super lock, also allocate slot 0, then unlock the super lock
N1: mount fails and then dismounts; since osb->slot_num is 0, try to put
    the invalid slot to disk, which will succeed if the storage link is
    restored. N2's slot info is now overwritten.

Once another node, say N3, mounts, it will find and allocate slot 0 again, which will lead to a mount hang because the journal has already been locked by N2. So when writing the slot info fails, invalidate the slot in advance to avoid overwriting it.
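The shape of the fix, as a hedged stand-alone sketch (the names are hypothetical, not the ocfs2 ones): when persisting an in-memory claim fails, roll the claim back under the same lock that protects it, so a later teardown path cannot push the stale claim to disk.

	#include <pthread.h>
	#include <stdio.h>

	#define SLOT_INVALID (-1)

	struct node_state {
		pthread_mutex_t lock;
		int slot;	/* in-memory claim, normally persisted to disk */
	};

	/* Stand-in for the disk write; pretend the storage link is down. */
	static int write_slot_to_disk(int slot)
	{
		(void)slot;
		return -1;
	}

	static int claim_slot(struct node_state *st, int slot)
	{
		int err;

		pthread_mutex_lock(&st->lock);
		st->slot = slot;	/* claim in memory first */
		pthread_mutex_unlock(&st->lock);

		err = write_slot_to_disk(slot);
		if (err) {
			/*
			 * Persisting failed: invalidate the claim so a later
			 * dismount cannot overwrite a slot that another node
			 * may rightfully have taken in the meantime.
			 */
			pthread_mutex_lock(&st->lock);
			st->slot = SLOT_INVALID;
			pthread_mutex_unlock(&st->lock);
		}
		return err;
	}

	int main(void)
	{
		struct node_state st = { PTHREAD_MUTEX_INITIALIZER, SLOT_INVALID };

		if (claim_slot(&st, 0))
			printf("claim failed, slot stays %d\n", st.slot);
		return 0;
	}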
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Yiwen Jiang
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/slot_map.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 47bba1be2a51..1e09592148ad 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -502,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	trace_ocfs2_find_slot(osb->slot_num);

 	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
-	if (status < 0)
+	if (status < 0) {
 		mlog_errno(status);
+		/*
+		 * if write block failed, invalidate slot to avoid overwrite
+		 * slot during dismount in case another node rightly has mounted
+		 */
+		spin_lock(&osb->osb_lock);
+		ocfs2_invalidate_slot(si, osb->slot_num);
+		osb->slot_num = OCFS2_INVALID_SLOT;
+		spin_unlock(&osb->osb_lock);
+	}

 bail:
 	return status;
-- cgit v1.2.3

From 1cce4df04f37d3f7b969e85528fa54f918a06f03 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues
Date: Thu, 14 Jan 2016 15:17:35 -0800
Subject: ocfs2: do not lock/unlock() inode DLM lock

DLM does not cache locks. So, a blocking lock and unlock will only make performance worse where contention over the locks is high.

Signed-off-by: Goldwyn Rodrigues
Cc: Mark Fasheh
Cc: Joel Becker
Reviewed-by: Junxiao Bi
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlmglue.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 20276e340339..f92612e4b9d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2432,12 +2432,6 @@ bail:
  * done this we have to return AOP_TRUNCATED_PAGE so the aop method
  * that called us can bubble that back up into the VFS who will then
  * immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
  */
 int ocfs2_inode_lock_with_page(struct inode *inode,
 			       struct buffer_head **ret_bh,
@@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
 	if (ret == -EAGAIN) {
 		unlock_page(page);
-		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
-			ocfs2_inode_unlock(inode, ex);
 		ret = AOP_TRUNCATED_PAGE;
 	}
-- cgit v1.2.3

From bef5502de074b6f6fa647b94b73155d675694420 Mon Sep 17 00:00:00 2001
From: xuejiufei
Date: Thu, 14 Jan 2016 15:17:38 -0800
Subject: ocfs2/dlm: ignore cleaning the migration mle that is inuse

We found that the migration source triggers a BUG, with the refcount of the mle already zero before a put, when the target goes down during migration. The situation is as follows:

dlm_migrate_lockres
	dlm_add_migration_mle
	dlm_mark_lockres_migrating
	dlm_get_mle_inuse
	<<<<<< now the refcount of the mle is 2
	dlm_send_one_lockres and wait for the target
	to become the new master
	<<<<<< o2hb detects that the target is down and cleans
	       the migration mle; now the refcount is 1

dlm_migrate_lockres is then woken, and puts the mle twice when it finds that the target has gone down, which triggers the BUG with the following message: "ERROR: bad mle: ".
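A minimal model of the ordering problem (generic refcounting, not the dlm structures; names hypothetical): the waiter must take its extra reference while it still holds the lock that serializes it against the cleanup path. Taking it after the lock is dropped, as the old code did, leaves a window in which cleanup can run first and drop what turns out to be the last reference.

	#include <pthread.h>
	#include <stdlib.h>

	struct obj {
		int refs;		/* protected by lock */
		pthread_mutex_t lock;
	};

	static void obj_put(struct obj *o)
	{
		int free_it;

		pthread_mutex_lock(&o->lock);
		free_it = (--o->refs == 0);
		pthread_mutex_unlock(&o->lock);
		if (free_it)
			free(o);
	}

	static void start_migration(struct obj *o)
	{
		pthread_mutex_lock(&o->lock);
		/*
		 * Fix: take the extra reference here, before the lock is
		 * dropped, so a concurrent cleanup (the o2hb path above)
		 * cannot bring the refcount to zero underneath the waiter.
		 */
		o->refs++;
		pthread_mutex_unlock(&o->lock);

		/* ... send the lockres and wait for the target; every exit
		 * path then drops this reference with obj_put(). */
	}

	int main(void)
	{
		struct obj *o = calloc(1, sizeof(*o));

		pthread_mutex_init(&o->lock, NULL);
		o->refs = 1;
		start_migration(o);
		obj_put(o);	/* the migration's extra reference */
		obj_put(o);	/* the original reference; frees o */
		return 0;
	}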
Signed-off-by: Jiufei Xue
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6f0748122117..8b9d483e94a6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	spin_lock(&dlm->master_lock);
 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
 				    namelen, target, dlm->node_num);
+	/* get an extra reference on the mle.
+	 * otherwise the assert_master from the new
+	 * master will destroy this.
+	 */
+	dlm_get_mle_inuse(mle);
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);

@@ -2554,6 +2559,7 @@ fail:
 	if (mle_added) {
 		dlm_mle_detach_hb_events(dlm, mle);
 		dlm_put_mle(mle);
+		dlm_put_mle_inuse(mle);
 	} else if (mle) {
 		kmem_cache_free(dlm_mle_cache, mle);
 		mle = NULL;
@@ -2571,17 +2577,6 @@ fail:
 	 * ensure that all assert_master work is flushed. */
 	flush_workqueue(dlm->dlm_worker);

-	/* get an extra reference on the mle.
-	 * otherwise the assert_master from the new
-	 * master will destroy this.
-	 * also, make sure that all callers of dlm_get_mle
-	 * take both dlm->spinlock and dlm->master_lock */
-	spin_lock(&dlm->spinlock);
-	spin_lock(&dlm->master_lock);
-	dlm_get_mle_inuse(mle);
-	spin_unlock(&dlm->master_lock);
-	spin_unlock(&dlm->spinlock);
-
 	/* notify new node and send all lock state */
 	/* call send_one_lockres with migration flag.
 	 * this serves as notice to the target node that a
@@ -3312,6 +3307,15 @@ top:
 		    mle->new_master != dead_node)
 			continue;

+		if (mle->new_master == dead_node && mle->inuse) {
+			mlog(ML_NOTICE, "%s: target %u died during "
+					"migration from %u, the MLE is "
+					"still keep used, ignore it!\n",
+					dlm->name, dead_node,
+					mle->master);
+			continue;
+		}
+
 		/* If we have reached this point, this mle needs to be
 		 * removed from the list and freed. */
 		dlm_clean_migration_mle(dlm, mle);
-- cgit v1.2.3

From 32e493265b2be96404aaa478fb2913be29b06887 Mon Sep 17 00:00:00 2001
From: xuejiufei
Date: Thu, 14 Jan 2016 15:17:41 -0800
Subject: ocfs2/dlm: do not insert a new mle when another process is already migrating

When two processes are migrating the same lockres, dlm_add_migration_mle() returns -EEXIST but still inserts a new mle into the hash list. dlm_migrate_lockres() will then detach the old mle and free the new one, which is already in the hash list, and that will corrupt the list.

Signed-off-by: Jiufei Xue
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Reviewed-by: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 8b9d483e94a6..9477d6e1de37 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2549,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	}

 fail:
-	if (oldmle) {
+	if (ret != -EEXIST && oldmle) {
 		/* master is known, detach if not already detached */
 		dlm_mle_detach_hb_events(dlm, oldmle);
 		dlm_put_mle(oldmle);
@@ -3136,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 			mlog(0, "tried to migrate %.*s, but some "
 			     "process beat me to it\n",
 			     namelen, name);
-			ret = -EEXIST;
+			spin_unlock(&tmp->spinlock);
+			return -EEXIST;
 		} else {
 			/* bad. 2 NODES are trying to migrate!
			 */
			mlog(ML_ERROR, "migration error mle: "
-- cgit v1.2.3

From 074a6c655f6da12cb1123c8a84bfd8d781138800 Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Thu, 14 Jan 2016 15:17:44 -0800
Subject: ocfs2: access orphan dinode before delete entry in ocfs2_orphan_del

In ocfs2_orphan_del, the entry is currently found and deleted first, and the orphan dir dinode is accessed afterwards. This is a problem once ocfs2_journal_access_di fails: the entry will have been removed from the orphan dir, but the inode has not actually been deleted successfully. In other words, the file is missing but not actually deleted. So we should access the orphan dinode first, as unlink and rename do.

Signed-off-by: Joseph Qi
Reviewed-by: Jiufei Xue
Cc: Mark Fasheh
Cc: Joel Becker
Reviewed-by: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/namei.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 714320e420eb..ab42c38031b1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2372,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 			(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
 			name, strlen(name));

+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(orphan_dir_inode),
+					 orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	/* find it's spot in the orphan directory */
 	status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
 				  &lookup);
@@ -2387,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 	}

-	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(orphan_dir_inode),
-					 orphan_dir_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	/* do the i_nlink dance! :) */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
-- cgit v1.2.3

From 98e141f266040fea0864c23f1f37375cae5ff02a Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Thu, 14 Jan 2016 15:17:47 -0800
Subject: ocfs2: dlm: remove redundant code

Found this when doing patch review; remove it to make the code clearer and save a little CPU time.

Signed-off-by: Junxiao Bi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmrecovery.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 86fb53614bf4..c5bdf02c213b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2457,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 	 * perhaps later we can genericize this for other waiters. */
 	wake_up(&dlm->migration_wq);

-	if (test_bit(idx, dlm->recovery_map))
-		mlog(0, "domain %s, node %u already added "
-		     "to recovery map!\n", dlm->name, idx);
-	else
-		set_bit(idx, dlm->recovery_map);
+	set_bit(idx, dlm->recovery_map);
 }

 void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
-- cgit v1.2.3

From 3c973b0e71bff472545a2939ede2b567745d0c1c Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Thu, 14 Jan 2016 15:17:50 -0800
Subject: ocfs2/dlm: cleanup redundant lksb flags in dlmcommon.h

lksb flags are defined both in dlmapi.h and dlmcommon.h. So clean them up from dlmcommon.h.
Signed-off-by: Joseph Qi Reviewed-by: Jiufei Xue Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmcommon.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c83ff..68c607e63ff6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -376,17 +376,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, -- cgit v1.2.3 From b2a209ffa605994cbe3c259c8584ba1576d3310c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:05 -0800 Subject: Revert "kernfs: do not account ino_ida allocations to memcg" Currently, all kmem allocations (namely every kmem_cache_alloc, kmalloc, alloc_kmem_pages call) are accounted to memory cgroup automatically. Callers have to explicitly opt out if they don't want/need accounting for some reason. Such a design decision leads to several problems: - kmalloc users are highly sensitive to failures, many of them implicitly rely on the fact that kmalloc never fails, while memcg makes failures quite plausible. - A lot of objects are shared among different containers by design. Accounting such objects to one of containers is just unfair. Moreover, it might lead to pinning a dead memcg along with its kmem caches, which aren't tiny, which might result in noticeable increase in memory consumption for no apparent reason in the long run. - There are tons of short-lived objects. Accounting them to memcg will only result in slight noise and won't change the overall picture, but we still have to pay accounting overhead. For more info, see - http://lkml.kernel.org/r/20151105144002.GB15111%40dhcp22.suse.cz - http://lkml.kernel.org/r/20151106090555.GK29259@esperanza Therefore this patchset switches to the white list policy. Now kmalloc users have to explicitly opt in by passing __GFP_ACCOUNT flag. Currently, the list of accounted objects is quite limited and only includes those allocations that (1) are known to be easily triggered from userspace and (2) can fail gracefully (for the full list see patch no. 6) and it still misses many object types. However, accounting only those objects should be a satisfactory approximation of the behavior we used to have for most sane workloads. This patch (of 6): Revert 499611ed451508a42d1d7d ("kernfs: do not account ino_ida allocations to memcg"). Black-list kmem accounting policy (aka __GFP_NOACCOUNT) turned out to be fragile and difficult to maintain, because there seem to be many more allocations that should not be accounted than those that should be. Besides, false accounting an allocation might result in much worse consequences than not accounting at all, namely increased memory consumption due to pinned dead kmem caches. So it was decided to switch to the white-list policy. This patch reverts bits introducing the black-list policy. The white-list policy will be introduced later in the series. 
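For contrast, a hedged kernel-style sketch of the two policies in allocator terms (the two flags never coexist in one tree: __GFP_NOACCOUNT is removed by this revert, while __GFP_ACCOUNT and GFP_KERNEL_ACCOUNT arrive later in the series):

	#include <linux/slab.h>

	static void accounting_policy_demo(void)
	{
		void *p;

		/* Black-list policy (reverted here): every allocation is
		 * charged to the current memcg unless a call site opts out
		 * explicitly. */
		p = kmalloc(64, GFP_KERNEL | __GFP_NOACCOUNT);
		kfree(p);

		/* White-list policy (introduced later in the series): nothing
		 * is charged unless a call site opts in. */
		p = kmalloc(64, GFP_KERNEL_ACCOUNT);	/* GFP_KERNEL | __GFP_ACCOUNT */
		kfree(p);
	}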
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/dir.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 742bf4a230e8..821973853340 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - /* - * If the ino of the sysfs entry created for a kmem cache gets - * allocated from an ida layer, which is accounted to the memcg that - * owns the cache, the memcg will get pinned forever. So do not account - * ino ida allocations. - */ - ret = ida_simple_get(&root->ino_ida, 1, 0, - GFP_KERNEL | __GFP_NOACCOUNT); + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; kn->ino = ret; -- cgit v1.2.3 From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). 
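The per-filesystem part of the change is mechanical. As a sketch mirroring the hunks below, a hypothetical filesystem "foo" would opt its inode cache into memcg accounting like this:

	#include <linux/fs.h>
	#include <linux/slab.h>

	struct foo_inode_info {
		/* ... fs-private fields ... */
		struct inode vfs_inode;		/* embedded VFS inode, as usual */
	};

	static struct kmem_cache *foo_inode_cachep;

	static void foo_init_once(void *obj)
	{
		struct foo_inode_info *fi = obj;

		inode_init_once(&fi->vfs_inode);
	}

	static int __init foo_init_inodecache(void)
	{
		foo_inode_cachep = kmem_cache_create("foo_inode_cache",
					sizeof(struct foo_inode_info), 0,
					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
					 SLAB_ACCOUNT),	/* new: charge objects to memcg */
					foo_init_once);
		if (foo_inode_cachep == NULL)
			return -ENOMEM;
		return 0;
	}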
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- drivers/staging/lustre/lustre/llite/super25.c | 3 ++- fs/9p/v9fs.c | 2 +- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 2 +- fs/btrfs/inode.c | 3 ++- fs/ceph/super.c | 4 ++-- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 6 +++--- fs/dcache.c | 5 +++-- fs/ecryptfs/main.c | 6 ++++-- fs/efs/super.c | 6 +++--- fs/exofs/super.c | 4 ++-- fs/ext2/super.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 5 +++-- fs/fat/inode.c | 2 +- fs/file.c | 7 ++++--- fs/fuse/inode.c | 4 ++-- fs/gfs2/main.c | 3 ++- fs/hfs/super.c | 4 ++-- fs/hfsplus/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/jfs/super.c | 2 +- fs/logfs/inode.c | 3 ++- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/super.c | 3 ++- fs/ntfs/super.c | 4 ++-- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/super.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 3 ++- fs/qnx4/inode.c | 2 +- fs/qnx6/inode.c | 2 +- fs/reiserfs/super.c | 3 ++- fs/romfs/super.c | 4 ++-- fs/squashfs/super.c | 3 ++- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 4 ++-- fs/udf/super.c | 3 ++- fs/ufs/super.c | 2 +- fs/xfs/kmem.h | 1 + fs/xfs/xfs_super.c | 4 ++-- include/linux/thread_info.h | 5 +++-- ipc/mqueue.c | 2 +- kernel/cred.c | 4 ++-- kernel/delayacct.c | 2 +- kernel/fork.c | 22 +++++++++++++--------- kernel/pid.c | 2 +- mm/nommu.c | 2 +- mm/rmap.c | 6 ++++-- mm/shmem.c | 2 +- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 65 files changed, 114 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 11634fa7ab3c..ad4840f86be1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -767,7 +767,7 @@ static int __init spufs_init(void) ret = -ENOMEM; spufs_inode_cache = kmem_cache_create("spufs_inode_cache", sizeof(struct spufs_inode_info), 0, - SLAB_HWCACHE_ALIGN, spufs_init_once); + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once); if (!spufs_inode_cache) goto out; diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c index 7a9fafc67693..86c371ef71ea 100644 --- a/drivers/staging/lustre/lustre/llite/super25.c +++ b/drivers/staging/lustre/lustre/llite/super25.c @@ -106,7 +106,8 @@ static int __init init_lustre_lite(void) rc = -ENOMEM; ll_inode_cachep = kmem_cache_create("lustre_inode_cache", sizeof(struct ll_inode_info), - 0, SLAB_HWCACHE_ALIGN, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, + NULL); if (ll_inode_cachep == NULL) goto out_cache; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca025019d..072e7599583a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 
4d4a0df8344f..c9fdfb112933 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/super.c b/fs/affs/super.c index 8836df5f1e11..2a6713b6b9f4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129f7d..81afefe7d8a6 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 25250fa87086..cc0e08252913 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..1e5c896f6b79 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/block_dev.c b/fs/block_dev.c index d878e4860fb7..d33071dd683e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -595,7 +595,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b8856e182ae..394017831692 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9161,7 +9161,8 @@ int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada328..ca4d5e8457f1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b7fcb3151103..c4c1169814b2 100644 --- 
a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1092,7 +1092,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b87a3..57e81cbba0fa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 8d38cd07b207..b4539e84e577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -3415,7 +3416,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474bee9..e25b6b06bacf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f7da..cb68dac4f9d3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..6658a50530a0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - 
exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..2a188413a2b0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -203,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..f1b56ff01208 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -966,7 +966,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3bf990b80026..6134832baaaf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1541,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..6aece96df19f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/file.c b/fs/file.c index 1aed0add16a2..1fbc5c0555a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. 
*/ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..4d69d5c0bedc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 1d709d496364..f99f8e94de3f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d421..1ca95c232bb5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae8bf..5d54490a136d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f49be23e78aa..cfaa18c7a337 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..458cf463047b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d8f51ee8126b..f6820ecf0a11 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1322,7 +1322,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = 
kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; diff --git a/fs/inode.c b/fs/inode.c index 4230f66b7410..e491e54d2430 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1883,7 +1883,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 61abdc4920da..bcd2d41b318a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e3176a1..bb080c272149 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf098..900925b5eb8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -898,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 0fce46d62b9c..db9cfc598883 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index cb1789ca1ee6..f975d667c539 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index ce1eb3f9dfe8..1af15fcbe57b 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bdb4dc7b4ecd..48fd8a87affe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1933,7 +1933,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff 
--git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7343844e6b6..7f5d3d9f1c37 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585b53..2f77f8dfb861 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..03768bb3aab1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c62d761d6bf0..faa1365097bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1766,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cda3e..b61b883c8ff8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d0e9b9b6223e..42305ddcbaa0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index f37b3deb01b4..3a67cfb142d8 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -365,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 9728b5499e1d..47bb1de07155 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -625,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff 
--git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..05db7473bcb5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb894e78a821..6b00ca357c58 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -619,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index dded920cbc8f..5e79bfa4f260 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -419,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 07ac18c355e7..d62c423a5a2d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -346,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079537..a233ba913be4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2248,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b445b..9c64a3ca9837 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..442fd52ebffe 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..d1c66e465ca5 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 
b35775752b74..59c9b7bd958d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1714,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..b4c2a485b28a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -56,9 +56,10 @@ extern long do_no_restart_syscall(struct restart_block *parm); #ifdef __KERNEL__ #ifdef CONFIG_DEBUG_STACK_USAGE -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ + __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) #endif /* diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 161a1807e6ef..f4617cf07069 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1438,7 +1438,7 @@ static int __init init_mqueue_fs(void) mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", sizeof(struct mqueue_inode_info), 0, - SLAB_HWCACHE_ALIGN, init_once); + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once); if (mqueue_inode_cachep == NULL) return -ENOMEM; diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..0c0cd8a62285 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d783f..435c14a45118 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/fork.c b/kernel/fork.c index 6774e6b2e96d..51915842f1c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -1848,16 +1848,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + 
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1867,8 +1870,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..f4ad91b746f1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/mm/nommu.c b/mm/nommu.c index 92be862c859b..fbf6f0f1d6c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,7 +560,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); - vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..3c3f1d21f075 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); - anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 5813b7fa85b6..9e60093aca3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3064,7 +3064,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } diff --git a/net/socket.c b/net/socket.c index 91c2de6f5020..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..14f45bf0410c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; -- cgit v1.2.3 From 4a8c7bb59ac85b038c29adf6d32ff56e11fbb267 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 14 Jan 2016 15:18:36 -0800 Subject: mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in 
mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks, getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good, reducing this consumption at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Nadia Yvette Chambers Cc: Naoya Horiguchi Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 30 +++++++++++++++++------------- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f6820ecf0a11..a1cb8fd2289b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -738,7 +738,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3d385c81c153..2696c1f05ed1 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -122,7 +122,7 @@ struct sp_node { struct shared_policy { struct rb_root root; - spinlock_t lock; + rwlock_t lock; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87a177917cb2..d8caff071a30 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2142,12 +2142,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2178,8 +2180,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list.
Caller holds sp->lock for + * writing. + */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2211,13 +2215,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } @@ -2360,7 +2364,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2393,7 +2397,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2405,7 +2409,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2431,7 +2435,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2497,14 +2501,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - spin_unlock(&p->lock); + write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From c261e7d94f0dd33a34b6cf98686e8b9699b62340 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:17 -0800 Subject: mm, proc: account for shmem swap in /proc/pid/smaps Currently, /proc/pid/smaps will always show "Swap: 0 kB" for shmem-backed mappings, even if the mapped portion does contain pages that were swapped out. This is because unlike private anonymous mappings, shmem does not change the pte to a swap entry, but to pte_none when swapping the page out. In the smaps page walk, such a page thus looks like it was never faulted in. This patch changes smaps_pte_entry() to determine the swap status for such pte_none entries for shmem mappings, similarly to how mincore_page() does it. Swapped out shmem pages are thus accounted for. For private mappings of tmpfs files that COWed some of the pages, the swapped out status of the original shmem pages is naturally ignored. If some of the private copies were also swapped out, they are accounted via their page table swap entries, so the resulting reported swap usage is then a sum of both swapped out private copies, and swapped out shmem pages that were not COWed. No double accounting can thus happen. The accounting is arguably still not as precise as for private anonymous mappings, since now we will also count pages that the process in question never accessed, but another process populated them and then let them become swapped out. I believe it is still less confusing and subtle than not showing any swap usage by shmem mappings at all. The swapped out counter might be of interest to users who would like to prevent future swapins during a performance critical operation and pre-fault the pages at their convenience. Especially for larger swapped out regions the cost of swapin is much higher than a fresh page allocation. So a differentiation between pte_none vs.
swapped out is important for those use cases. One downside of this patch is that it makes /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overall complexity is O(n*log(n)). I have measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and timed how long it takes to cat /proc/pid/smaps of this process 100 times. Private anonymous mapping: real 0m0.949s user 0m0.116s sys 0m0.348s Mapping of a /dev/shm/file: real 0m3.831s user 0m0.180s sys 0m3.212s The difference is rather substantial, so the next patch will reduce the cost for shared or read-only mappings. In a less controlled experiment, I've gathered pids of processes on my desktop that have either '/dev/shm/*' or 'SYSV*' in smaps. This included the Chrome browser and some KDE processes. Again, I've run cat /proc/pid/smaps on each 100 times. Before this patch: real 0m9.050s user 0m0.518s sys 0m8.066s After this patch: real 0m9.221s user 0m0.541s sys 0m8.187s This suggests low impact on average systems. Note that this patch doesn't attempt to adjust the SwapPss field for shmem mappings, which would need extra work to determine who else could have the pages mapped. Thus the value stays zero except for COWed swapped out pages in a shmem mapping, which are accounted as usual. Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Jerome Marchand Acked-by: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 5 +++- fs/proc/task_mmu.c | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9f13b6e28676..fdeb5b33349f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -460,7 +460,10 @@ and a page is modified, the file page is replaced by a private anonymous copy. hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. "Swap" shows how much would-be-anonymous memory is also used, but out on swap. -"SwapPss" shows proportional swap share of this mapping. +For shmem mappings, "Swap" includes also the size of the mapped (and not +replaced by copy-on-write) part of the underlying shmem object out on swap. +"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this +does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. "VmFlags" field deserves a separate description.
This member represents the kernel diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187b3b5f242e..85ef60fdf2c0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -451,6 +451,7 @@ struct mem_size_stats { unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -485,6 +486,45 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } } +#ifdef CONFIG_SHMEM +static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return 0; + + if (radix_tree_exceptional_entry(page)) + return PAGE_SIZE; + + page_cache_release(page); + return 0; + +} + +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + while (addr < end) { + mss->swap += smaps_shmem_swap(walk->vma, addr); + addr += PAGE_SIZE; + } + + return 0; +} +#else +static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, + unsigned long addr) +{ + return 0; +} +#endif + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -512,6 +552,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + mss->swap += smaps_shmem_swap(vma, addr); } if (!page) @@ -671,6 +714,14 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); -- cgit v1.2.3 From 6a15a37097c7e02390bb08d83dac433c9f10144f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:20 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for shmem mappings The previous patch has improved swap accounting for shmem mappings, which however made /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overall complexity is O(n*log(n)). We can reduce this significantly for mappings that cannot contain COWed pages, because then we can either use the statistics that the shmem object itself tracks (if the mapping contains the whole object, or the swap usage of the whole object is zero), or use the radix tree iterator, which is much more effective than repeated find_get_entry() calls. This patch therefore introduces a function shmem_swap_usage(vma) and makes /proc/pid/smaps use it when possible. Only for writable private mappings of shmem objects (i.e. tmpfs files) with the shmem object itself (partially) swapped out do we have to resort to the find_get_entry() approach. Hopefully such mappings are relatively uncommon. To demonstrate the difference, I have measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and timed how long it takes to cat /proc/pid/smaps of this process 100 times. Private writable mapping of a /dev/shm/file (the most complex case): real 0m3.831s user 0m0.180s sys 0m3.212s Shared mapping of an almost full mapping of a partially swapped /dev/shm/file (which needs to employ the radix tree iterator).
real 0m1.351s user 0m0.096s sys 0m0.768s Same, but with /dev/shm/file not swapped (so no radix tree walk needed) real 0m0.935s user 0m0.128s sys 0m0.344s Private anonymous mapping: real 0m0.949s user 0m0.116s sys 0m0.348s The cost is now much closer to the private anonymous mapping case, unless the shmem mapping is private and writable. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 +++++++++++++-- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85ef60fdf2c0..5830b2e129ed 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -717,8 +718,25 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) #ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { - mss.check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } } #endif diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 50777b5b1e4c..bd58be5e7a2a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -60,6 +60,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); +extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); + static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { diff --git a/mm/shmem.c b/mm/shmem.c index 9e60093aca3f..e978621de1ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -359,6 +359,76 @@ static int shmem_free_swap(struct address_space *mapping, return 0; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + pgoff_t start, end; + struct radix_tree_iter iter; + void **slot; + struct page *page; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. 
Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + swapped = 0; + + /* Here comes the more involved part */ + start = linear_page_index(vma, vma->vm_start); + end = linear_page_index(vma, vma->vm_end); + + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= end) + break; + + page = radix_tree_deref_slot(slot); + + /* + * This should only be possible to happen at index 0, so we + * don't need to reset the counter, nor do we risk infinite + * restarts. + */ + if (radix_tree_deref_retry(page)) + goto restart; + + if (radix_tree_exceptional_entry(page)) + swapped++; + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3 From 48131e03ca4ed71d73fbe55c311a258c6fa2a090 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:23 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings Following the previous patch, further reduction of /proc/pid/smaps cost is possible for private writable shmem mappings with unpopulated areas where the page walk invokes the .pte_hole function. We can use a radix tree iterator for each such area instead of calling find_get_entry() in a loop. This is possible at the extra maintenance cost of introducing another shmem function shmem_partial_swap_usage(). To demonstrate the difference, I have measured this on a process that creates a private writable 2GB mapping of a partially swapped out /dev/shm/file (which cannot employ the optimizations from the previous patch) and doesn't populate it at all. I timed how long it takes to cat /proc/pid/smaps of this process 100 times. Before this patch: real 0m3.831s user 0m0.180s sys 0m3.212s After this patch: real 0m1.176s user 0m0.180s sys 0m0.684s The time is similar to the case where a radix tree iterator is employed on the whole mapping.
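For reference, a minimal userspace sketch of the measurement setup described in these changelogs (map a tmpfs file, dirty one page per 2MB stride, then time repeated reads of smaps). This is illustrative only and not part of any patch here; the file path, sizes and stride are the assumptions stated in the text:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	const unsigned long len = 2UL << 30;	/* 2GB mapping */
	const unsigned long stride = 2UL << 20;	/* dirty one page per 2MB */
	char buf[4096];
	struct timespec t0, t1;
	unsigned long off;
	int fd, i;

	fd = open("/dev/shm/testfile", O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	for (off = 0; off < len; off += stride)
		p[off] = 1;			/* fault in one page per stride */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < 100; i++) {		/* "cat /proc/pid/smaps 100 times" */
		int sfd = open("/proc/self/smaps", O_RDONLY);
		if (sfd < 0)
			return 1;
		while (read(sfd, buf, sizeof(buf)) > 0)
			;
		close(sfd);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("%.3fs\n", (t1.tv_sec - t0.tv_sec) +
			  (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}

The changelog numbers additionally require part of the file to already be swapped out for the Swap field to be non-zero; the timing itself works either way.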
Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 42 ++++++++++--------------------- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 65 ++++++++++++++++++++++++++++-------------------- 3 files changed, 53 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5830b2e129ed..8a03759bda38 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -488,42 +488,16 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } #ifdef CONFIG_SHMEM -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = find_get_entry(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); - if (!page) - return 0; - - if (radix_tree_exceptional_entry(page)) - return PAGE_SIZE; - - page_cache_release(page); - return 0; - -} - static int smaps_pte_hole(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; - while (addr < end) { - mss->swap += smaps_shmem_swap(walk->vma, addr); - addr += PAGE_SIZE; - } + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); return 0; } -#else -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - return 0; -} #endif static void smaps_pte_entry(pte_t *pte, unsigned long addr, @@ -555,7 +529,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = migration_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - mss->swap += smaps_shmem_swap(vma, addr); + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bd58be5e7a2a..a43f41cb3c43 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -61,6 +61,8 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index e978621de1ef..760d90cf2a41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -361,41 +361,18 @@ static int shmem_free_swap(struct address_space *mapping, /* * Determine (in bytes) how many of the shmem object's pages mapped by the - * given vma is swapped out. + * given offsets are swapped out. * * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. 
*/ -unsigned long shmem_swap_usage(struct vm_area_struct *vma) +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); - struct address_space *mapping = inode->i_mapping; - unsigned long swapped; - pgoff_t start, end; struct radix_tree_iter iter; void **slot; struct page *page; - - /* Be careful as we don't hold info->lock */ - swapped = READ_ONCE(info->swapped); - - /* - * The easier cases are when the shmem object has nothing in swap, or - * the vma maps it whole. Then we can simply use the stats that we - * already track. - */ - if (!swapped) - return 0; - - if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) - return swapped << PAGE_SHIFT; - - swapped = 0; - - /* Here comes the more involved part */ - start = linear_page_index(vma, vma->vm_start); - end = linear_page_index(vma, vma->vm_end); + unsigned long swapped = 0; rcu_read_lock(); @@ -429,6 +406,40 @@ restart: return swapped << PAGE_SHIFT; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, + linear_page_index(vma, vma->vm_start), + linear_page_index(vma, vma->vm_end)); +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3 From eca56ff906bdd0239485e8b47154a6e73dd9a2f3 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:26 -0800 Subject: mm, shmem: add internal shmem resident memory accounting Currently looking at /proc/<pid>/status or statm, there is no way to distinguish shmem pages from pages mapped to a regular file (shmem pages are mapped to /dev/zero), even though their implications in actual memory use are quite different. The internal accounting currently counts shmem pages together with regular files. As a preparation to extend the userspace interfaces, this patch adds MM_SHMEMPAGES counter to mm_rss_stat to account for shmem pages separately from MM_FILEPAGES. The next patch will expose it to userspace - this patch doesn't change the exported values yet, as it adds MM_SHMEMPAGES up to MM_FILEPAGES at the places where MM_FILEPAGES was used before. The only user-visible change after this patch is the OOM killer message that separates the reported "shmem-rss" from "file-rss".
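The counter choice implemented by the new mm_counter()/mm_counter_file() helpers (added in the include/linux/mm.h hunk below) amounts to a simple decision: anonymous pages win first, then swap-backed (shmem) pages, then regular file pages. A standalone model of that logic, with plain booleans standing in for PageAnon() and PageSwapBacked():

#include <stdio.h>

enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, MM_SHMEMPAGES };

/*
 * Model of mm_counter(): the anon check must come first, because
 * anonymous pages are also swap-backed.
 */
static int pick_counter(int page_is_anon, int page_is_swap_backed)
{
	if (page_is_anon)
		return MM_ANONPAGES;	/* private anonymous page */
	if (page_is_swap_backed)
		return MM_SHMEMPAGES;	/* shmem/tmpfs: file-like but swap-backed */
	return MM_FILEPAGES;		/* regular file-backed page */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_counter(1, 0),	/* anon  -> MM_ANONPAGES */
	       pick_counter(0, 1),	/* shmem -> MM_SHMEMPAGES */
	       pick_counter(0, 0));	/* file  -> MM_FILEPAGES */
	return 0;
}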
[vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 5 +---- fs/proc/task_mmu.c | 3 ++- include/linux/mm.h | 18 +++++++++++++++++- include/linux/mm_types.h | 7 ++++--- kernel/events/uprobes.c | 2 +- mm/memory.c | 30 ++++++++++-------------------- mm/oom_kill.c | 5 +++-- mm/rmap.c | 12 +++--------- 8 files changed, 41 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 63b039899a5e..aa34af0a0b26 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -603,10 +603,7 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) else if (is_migration_entry(entry)) { struct page *page = migration_entry_to_page(entry); - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } free_swap_and_cache(entry); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a03759bda38..45eb24145978 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -83,7 +83,8 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad7793788..a8ab1fc0e9bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1361,10 +1361,26 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) atomic_long_dec(&mm->rss_stat.count[member]); } +/* Optimized variant when page is already known not to be PageAnon */ +static inline int mm_counter_file(struct page *page) +{ + if (PageSwapBacked(page)) + return MM_SHMEMPAGES; + return MM_FILEPAGES; +} + +static inline int mm_counter(struct page *page) +{ + if (PageAnon(page)) + return MM_ANONPAGES; + return mm_counter_file(page); +} + static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_ANONPAGES); + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492a114f..207890be93c8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -369,9 +369,10 @@ struct core_state { }; enum { - MM_FILEPAGES, - MM_ANONPAGES, - MM_SWAPENTS, + MM_FILEPAGES, /* Resident file mapping pages */ + MM_ANONPAGES, /* Resident anonymous pages */ + MM_SWAPENTS, /* Anonymous swap entries */ + MM_SHMEMPAGES, /* Resident shared memory pages */ NR_MM_COUNTERS }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..bb0669169716 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/mm/memory.c b/mm/memory.c index 
c387430f06c3..f7026c035940 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -832,10 +832,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -874,10 +871,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; } out_set_pte: @@ -1113,9 +1107,8 @@ again: tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else { + + if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); @@ -1123,8 +1116,8 @@ again: if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); - rss[MM_FILEPAGES]--; } + rss[mm_counter(page)]--; page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1146,11 +1139,7 @@ again: struct page *page; page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; + rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1460,7 +1449,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2097,7 +2086,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); + dec_mm_counter_fast(mm, + mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { @@ -2820,7 +2810,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); } set_pte_at(vma->vm_mm, address, pte, entry); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c12680993ff3..dc490c06941b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES))); + K(get_mm_counter(victim->mm, MM_FILEPAGES)), + K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* diff --git a/mm/rmap.c b/mm/rmap.c index 3c3f1d21f075..622756c16ac8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1364,10 +1364,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { hugetlb_count_sub(1 << compound_order(page), mm); } else { - if (PageAnon(page)) - dec_mm_counter(mm, 
MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); @@ -1377,10 +1374,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * interest anymore. Simply discard the pte, vmscan * will take care of the rest. */ - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { swp_entry_t entry; pte_t swp_pte; @@ -1420,7 +1414,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); } else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); page_remove_rmap(page); page_cache_release(page); -- cgit v1.2.3 From 8cee852ec53fb530f10ccabf1596734209ae336b Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:29 -0800 Subject: mm, procfs: breakdown RSS for anon, shmem and file in /proc/pid/status There are several shortcomings with the accounting of shared memory (SysV shm, shared anonymous mapping, mapping of a tmpfs file). The values in /proc/<pid>/status and <...>/statm don't allow one to distinguish between shmem memory and a shared mapping to a regular file, even though their implications on memory usage are quite different: during reclaim, file mappings can be dropped or written back to disk, while shmem needs a place in swap. Also, to distinguish the memory occupied by anonymous and file mappings, one has to read the /proc/pid/statm file, which has a field for the file mappings (again, including shmem) and total memory occupied by these mappings (i.e. equivalent to VmRSS in the <...>/status file). Getting the value for anonymous mappings only is thus not exactly user-friendly (the statm file is intended to be rather efficiently machine-readable). To address both of these shortcomings, this patch adds a breakdown of VmRSS in /proc/<pid>/status via new fields RssAnon, RssFile and RssShmem, making use of the previous preparatory patch. These fields tell the user the memory occupied by private anonymous pages, mapped regular files and shmem, respectively. Other existing fields in /status and /statm files are left unchanged. The /statm file can be extended in the future, if there's a need for that.
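To make the new interface concrete, a minimal userspace reader of the fields this patch adds could look like the following sketch; only the field names come from the patch, the rest is illustrative:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[256];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* The three fields introduced by this patch. */
		if (!strncmp(line, "RssAnon:", 8) ||
		    !strncmp(line, "RssFile:", 8) ||
		    !strncmp(line, "RssShmem:", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}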
Example (part of) /proc/pid/status output including the new Rss* fields: VmPeak: 2001008 kB VmSize: 2001004 kB VmLck: 0 kB VmPin: 0 kB VmHWM: 5108 kB VmRSS: 5108 kB RssAnon: 92 kB RssFile: 1324 kB RssShmem: 3692 kB VmData: 192 kB VmStk: 136 kB VmExe: 4 kB VmLib: 1784 kB VmPTE: 3928 kB VmPMD: 20 kB VmSwap: 0 kB HugetlbPages: 0 kB [vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 13 +++++++++++-- fs/proc/task_mmu.c | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fdeb5b33349f..ffcd49589ab5 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -169,6 +169,9 @@ read the file /proc/PID/status: VmLck: 0 kB VmHWM: 476 kB VmRSS: 476 kB + RssAnon: 352 kB + RssFile: 120 kB + RssShmem: 4 kB VmData: 156 kB VmStk: 88 kB VmExe: 68 kB @@ -231,7 +234,12 @@ Table 1-2: Contents of the status files (as of 4.1) VmSize total program size VmLck locked memory size VmHWM peak resident set size ("high water mark") - VmRSS size of memory portions + VmRSS size of memory portions. It contains the three + following parts (VmRSS = RssAnon + RssFile + RssShmem) + RssAnon size of resident anonymous memory + RssFile size of resident file mappings + RssShmem size of resident shmem memory (includes SysV shm, + mapping of tmpfs and shared anonymous mappings) VmData size of data, stack, and text segments VmStk size of data, stack, and text segments VmExe size of text segment @@ -266,7 +274,8 @@ Table 1-3: Contents of the statm files (as of 2.6.8-rc3) Field Content size total program size (pages) (same as VmSize in status) resident size of memory portions (pages) (same as VmRSS in status) - shared number of pages that are shared (i.e. backed by a file) + shared number of pages that are shared (i.e. backed by a file, same + as RssFile+RssShmem in status) trs number of pages that are 'code' (not including libs; broken, includes data segment) lrs number of pages of library (always 0 on 2.6) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 45eb24145978..18a30aedfa5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -23,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. 
Any @@ -36,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; @@ -53,6 +57,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -66,6 +73,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, -- cgit v1.2.3 From b832861cca231c0ef27e238b28788045da3262f2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Jan 2016 15:19:44 -0800 Subject: fs/block_dev.c:bdev_write_page(): use blk_queue_enter(..., GFP_NOIO) bdev_write_page() is used by swapout and by writepage where we cannot use __GFP_FS or __GFP_IO. So it is misleading to mention GFP_KERNEL here. blk_queue_enter() only actually looks at __GFP_DIRECT_RECLAIM, so no bugs were harmed in the making of this patch. Cc: Dan Williams Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index d33071dd683e..81c0705558be 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -437,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, GFP_NOIO); if (result) return result; -- cgit v1.2.3 From 84ad5802a33a4964a49b8f7d24d80a214a096b19 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:20:18 -0800 Subject: proc: meminfo: estimate available memory more conservatively The MemAvailable item in /proc/meminfo is to give users a hint of how much memory is allocatable without causing swapping, so it excludes the zones' low watermarks as unavailable to userspace. However, for a userspace allocation, kswapd will actually reclaim until the free pages hit a combination of the high watermark and the page allocator's lowmem protection that keeps a certain amount of DMA and DMA32 memory from userspace as well. Subtract the full amount we know to be unavailable to userspace from the number of free pages when calculating MemAvailable. Signed-off-by: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 9155a5a0d3b9..df4661abadc4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) /* * Estimate the amount of memory available for userspace allocations, * without causing swapping. - * - * Free memory cannot be taken below the low watermark, before the - * system starts swapping. 
From 84ad5802a33a4964a49b8f7d24d80a214a096b19 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 14 Jan 2016 15:20:18 -0800
Subject: proc: meminfo: estimate available memory more conservatively

The MemAvailable item in /proc/meminfo is to give users a hint of how
much memory is allocatable without causing swapping, so it excludes the
zones' low watermarks as unavailable to userspace.

However, for a userspace allocation, kswapd will actually reclaim until
the free pages hit a combination of the high watermark and the page
allocator's lowmem protection that keeps a certain amount of DMA and
DMA32 memory from userspace as well.

Subtract the full amount we know to be unavailable to userspace from
the number of free pages when calculating MemAvailable.

Signed-off-by: Johannes Weiner
Cc: Rik van Riel
Cc: Mel Gorman
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/meminfo.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 9155a5a0d3b9..df4661abadc4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
         /*
          * Estimate the amount of memory available for userspace allocations,
          * without causing swapping.
-         *
-         * Free memory cannot be taken below the low watermark, before the
-         * system starts swapping.
          */
-        available = i.freeram - wmark_low;
+        available = i.freeram - totalreserve_pages;
 
         /*
          * Not all the page cache can be freed, otherwise the system will
--
cgit v1.2.3
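From userspace the change is visible only as a lower MemAvailable figure: free memory is now discounted by totalreserve_pages rather than just the low watermarks, while the reclaimable page-cache portion of the estimate is unchanged. A small sketch (illustrative, not part of the patch) to watch the two numbers side by side:

#include <stdio.h>

int main(void)
{
        char line[128];
        long memfree = 0, avail = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                sscanf(line, "MemFree: %ld kB", &memfree);
                sscanf(line, "MemAvailable: %ld kB", &avail);
        }
        fclose(f);
        printf("MemFree %ld kB, MemAvailable %ld kB\n", memfree, avail);
        return 0;
}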
From 0e41e277971799bad2764ad4e6284817e9a6da5b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 14 Jan 2016 15:21:46 -0800
Subject: mm: /proc/pid/clear_refs: no need to clear VM_SOFTDIRTY in clear_soft_dirty_pmd()

clear_soft_dirty_pmd() is called by clear_refs_write(CLEAR_REFS_SOFT_DIRTY);
VM_SOFTDIRTY was already cleared before walk_page_range().

Signed-off-by: Oleg Nesterov
Acked-by: Kirill A. Shutemov
Acked-by: Cyrill Gorcunov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 18a30aedfa5d..46d9619d0aea 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -881,9 +881,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
         pmd = pmd_wrprotect(pmd);
         pmd = pmd_clear_soft_dirty(pmd);
 
-        if (vma->vm_flags & VM_SOFTDIRTY)
-                vma->vm_flags &= ~VM_SOFTDIRTY;
-
         set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 }
 #else
--
cgit v1.2.3
From 3e89e1c5ea84211b99199c7636a7d73c50c6b512 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker
Date: Thu, 14 Jan 2016 15:21:52 -0800
Subject: hugetlb: make mm and fs code explicitly non-modular

The Kconfig currently controlling compilation of this code is:

config HUGETLBFS
        bool "HugeTLB file system support"

...meaning that it currently is not being built as a module by anyone.

Let's remove the modular code that is essentially orphaned, so that when
reading the driver there is no doubt it is builtin-only.

Since module_init translates to device_initcall in the non-modular case,
the init ordering gets moved to earlier levels when we use the more
appropriate initcalls here.

Originally I had the fs part and the mm part as separate commits, just
by happenstance of the nature of how I detected these non-modular use
cases.  But that can possibly introduce regressions if the patch merge
ordering puts the fs part 1st -- as the 0-day testing reported a splat
at mount time.

Investigating with "initcall_debug" showed that the delta was
init_hugetlbfs_fs being called _before_ hugetlb_init instead of after.
So both the fs change and the mm change are here together.

In addition, it worked before due to luck of link order, since they
were both in the same initcall category.  So we now have the fs part
using fs_initcall, and the mm part using subsys_initcall, which puts it
one bucket earlier.  It now passes the basic sanity test that failed in
earlier 0-day testing.

We delete the MODULE_LICENSE tag and capture that information at the
top of the file alongside author comments, etc.

We don't replace module.h with init.h since the file already has that.
Also note that MODULE_ALIAS is a no-op for non-modular code.

Signed-off-by: Paul Gortmaker
Reported-by: kernel test robot
Cc: Nadia Yvette Chambers
Cc: Alexander Viro
Cc: Naoya Horiguchi
Reviewed-by: Mike Kravetz
Cc: David Rientjes
Cc: Hillf Danton
Acked-by: Davidlohr Bueso
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c | 27 ++-------------------------
 mm/hugetlb.c         | 39 +--------------------------------------
 2 files changed, 3 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a1cb8fd2289b..47789292a582 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
  * Nadia Yvette Chambers, 2002
  *
  * Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/module.h>
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>        /* remove ASAP */
@@ -1202,7 +1202,6 @@ static struct file_system_type hugetlbfs_fs_type = {
         .mount          = hugetlbfs_mount,
         .kill_sb        = kill_litter_super,
 };
-MODULE_ALIAS_FS("hugetlbfs");
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
 
@@ -1356,26 +1355,4 @@ static int __init init_hugetlbfs_fs(void)
  out2:
         return error;
 }
-
-static void __exit exit_hugetlbfs_fs(void)
-{
-        struct hstate *h;
-        int i;
-
-
-        /*
-         * Make sure all delayed rcu free inodes are flushed before we
-         * destroy cache.
-         */
-        rcu_barrier();
-        kmem_cache_destroy(hugetlbfs_inode_cachep);
-        i = 0;
-        for_each_hstate(h)
-                kern_unmount(hugetlbfs_vfsmount[i++]);
-        unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef6963b577fd..be934df69b85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4,7 +4,6 @@
  */
 #include <linux/list.h>
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
@@ -2549,25 +2548,6 @@ static void hugetlb_unregister_node(struct node *node)
         nhs->hugepages_kobj = NULL;
 }
 
-/*
- * hugetlb module exit: unregister hstate attributes from node devices
- * that have them.
- */
-static void hugetlb_unregister_all_nodes(void)
-{
-        int nid;
-
-        /*
-         * disable node device registrations.
-         */
-        register_hugetlbfs_with_node(NULL, NULL);
-
-        /*
-         * remove hstate attributes from any nodes that have them.
-         */
-        for (nid = 0; nid < nr_node_ids; nid++)
-                hugetlb_unregister_node(node_devices[nid]);
-}
 
 /*
  * Register hstate attributes for a single node device.
@@ -2632,27 +2612,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
         return NULL;
 }
 
-static void hugetlb_unregister_all_nodes(void) { }
-
 static void hugetlb_register_all_nodes(void) { }
 
 #endif
 
-static void __exit hugetlb_exit(void)
-{
-        struct hstate *h;
-
-        hugetlb_unregister_all_nodes();
-
-        for_each_hstate(h) {
-                kobject_put(hstate_kobjs[hstate_index(h)]);
-        }
-
-        kobject_put(hugepages_kobj);
-        kfree(hugetlb_fault_mutex_table);
-}
-module_exit(hugetlb_exit);
-
 static int __init hugetlb_init(void)
 {
         int i;
@@ -2690,7 +2653,7 @@ static int __init hugetlb_init(void)
                 mutex_init(&hugetlb_fault_mutex_table[i]);
         return 0;
 }
-module_init(hugetlb_init);
+subsys_initcall(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
 void __init hugetlb_add_hstate(unsigned int order)
--
cgit v1.2.3
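The fix works because initcall levels run in a fixed order -- core, postcore, arch, subsys, fs, device, late (see include/linux/init.h) -- so a subsys_initcall is guaranteed to have completed before any fs_initcall runs. A hypothetical built-in sketch of that ordering (not from the patch; the demo functions are made up):

#include <linux/init.h>
#include <linux/printk.h>

static int __init demo_mm_part(void)
{
        pr_info("demo: subsys_initcall (like hugetlb_init) runs first\n");
        return 0;
}
subsys_initcall(demo_mm_part);

static int __init demo_fs_part(void)
{
        pr_info("demo: fs_initcall (like init_hugetlbfs_fs) runs second\n");
        return 0;
}
fs_initcall(demo_fs_part);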
From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 14 Jan 2016 15:22:07 -0800
Subject: mm: rework virtual memory accounting

When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which
tests the RLIMIT_DATA value to figure out if we're allowed to assign
new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been
found that RLIMIT_DATA in the form it's implemented now doesn't do
anything useful, because most user-space libraries use the mmap()
syscall for dynamic memory allocations.

Linus suggested to convert the RLIMIT_DATA rlimit into something
suitable for anonymous memory accounting.  But in this patch we go
further, and the changes are bundled together as:

* keep vma counting if CONFIG_PROC_FS=n, will be used for limits
* replace mm->shared_vm with better defined mm->data_vm
* account anonymous executable areas as executable
* account file-backed growsdown/up areas as stack
* drop struct file* argument from vm_stat_account
* enforce RLIMIT_DATA for size of data areas

This way the code looks cleaner: now code/stack/data classification
depends only on vm_flags state:

    VM_EXEC & ~VM_WRITE            -> code (VmExe + VmLib in proc)
    VM_GROWSUP | VM_GROWSDOWN      -> stack (VmStk)
    VM_WRITE & ~VM_SHARED & !stack -> data (VmData)

The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called
"shared", but that might be a strange beast like a readonly-private or
VM_IO area.

- RLIMIT_AS    limits whole address space "VmSize"
- RLIMIT_STACK limits stack "VmStk" (but each vma individually)
- RLIMIT_DATA  now limits "VmData"

Signed-off-by: Konstantin Khlebnikov
Signed-off-by: Cyrill Gorcunov
Cc: Quentin Casasnovas
Cc: Vegard Nossum
Acked-by: Linus Torvalds
Cc: Willy Tarreau
Cc: Andy Lutomirski
Cc: Kees Cook
Cc: Vladimir Davydov
Cc: Pavel Emelyanov
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ia64/kernel/perfmon.c |  3 +--
 fs/proc/task_mmu.c         |  7 +++---
 include/linux/mm.h         | 13 +++-------
 include/linux/mm_types.h   |  2 +-
 kernel/fork.c              |  5 ++--
 mm/debug.c                 |  4 +--
 mm/mmap.c                  | 63 +++++++++++++++++++++++-----------------
 mm/mprotect.c              |  8 ++++--
 mm/mremap.c                |  7 +++---
 9 files changed, 54 insertions(+), 58 deletions(-)

(limited to 'fs')
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 60e02f7747ff..9cd607b06964 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2332,8 +2332,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
          */
         insert_vm_struct(mm, vma);
 
-        vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-                                                        vma_pages(vma));
+        vm_stat_account(vma->vm_mm, vma->vm_flags, vma_pages(vma));
         up_write(&task->mm->mmap_sem);
 
         /*

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 46d9619d0aea..a353b4c6e86e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -23,7 +23,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-        unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem;
+        unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
         unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
         anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
         if (hiwater_rss < mm->hiwater_rss)
                 hiwater_rss = mm->hiwater_rss;
 
-        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
         text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
         lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
         swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -76,7 +75,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                 anon << (PAGE_SHIFT-10),
                 file << (PAGE_SHIFT-10),
                 shmem << (PAGE_SHIFT-10),
-                data << (PAGE_SHIFT-10),
+                mm->data_vm << (PAGE_SHIFT-10),
                 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
                 ptes >> 10,
                 pmds >> 10,
@@ -97,7 +96,7 @@ unsigned long task_statm(struct mm_struct *mm,
                          get_mm_counter(mm, MM_SHMEMPAGES);
         *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
                                                                 >> PAGE_SHIFT;
-        *data = mm->total_vm - mm->shared_vm;
+        *data = mm->data_vm + mm->stack_vm;
         *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
         return mm->total_vm;
 }

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ec9d4559514d..839d9e9a1c38 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1929,7 +1929,9 @@ extern void mm_drop_all_locks(struct mm_struct *mm);
 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
 
-extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
+extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
+extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
+
 extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    unsigned long len,
                                                    unsigned long flags,
@@ -2147,15 +2149,6 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                                unsigned long size, pte_fn_t fn, void *data);
 
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
-#else
-static inline void vm_stat_account(struct mm_struct *mm,
-                        unsigned long flags, struct file *file, long pages)
-{
-        mm->total_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern bool _debug_pagealloc_enabled;

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 207890be93c8..6bc9a0ce2253 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -427,7 +427,7 @@ struct mm_struct {
         unsigned long total_vm;         /* Total pages mapped */
         unsigned long locked_vm;        /* Pages that have PG_mlocked set */
         unsigned long pinned_vm;        /* Refcount permanently increased */
-        unsigned long shared_vm;        /* Shared pages (files) */
+        unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED/GROWSDOWN */
         unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE */
         unsigned long stack_vm;         /* VM_GROWSUP/DOWN */
         unsigned long def_flags;

diff --git a/kernel/fork.c b/kernel/fork.c
index 51915842f1c0..2e391c754ae7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
         RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
 
         mm->total_vm = oldmm->total_vm;
-        mm->shared_vm = oldmm->shared_vm;
+        mm->data_vm = oldmm->data_vm;
         mm->exec_vm = oldmm->exec_vm;
         mm->stack_vm = oldmm->stack_vm;
 
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                 struct file *file;
 
                 if (mpnt->vm_flags & VM_DONTCOPY) {
-                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-                                                        -vma_pages(mpnt));
+                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                         continue;
                 }
                 charge = 0;

diff --git a/mm/debug.c b/mm/debug.c
index 668aa35191ca..5d2072ed8d5e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm)
                 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
                 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
                 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
-                "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+                "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
                 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
                 "start_brk %lx brk %lx start_stack %lx\n"
                 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm)
                 mm_nr_pmds((struct mm_struct *)mm), mm->map_count,
                 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
-                mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+                mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
                 mm->start_code, mm->end_code, mm->start_data, mm->end_data,
                 mm->start_brk, mm->brk, mm->start_stack,
                 mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
diff --git a/mm/mmap.c b/mm/mmap.c
index f32b84ad621a..b3f00b616b81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1220,24 +1220,6 @@ none:
         return NULL;
 }
 
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
-                                                struct file *file, long pages)
-{
-        const unsigned long stack_flags
-                = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
-        mm->total_vm += pages;
-
-        if (file) {
-                mm->shared_vm += pages;
-                if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
-                        mm->exec_vm += pages;
-        } else if (flags & stack_flags)
-                mm->stack_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
 /*
  * If a hint addr is less than mmap_min_addr change hint to be as
  * low as possible but still greater than mmap_min_addr
@@ -1556,7 +1538,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         unsigned long charged = 0;
 
         /* Check against address space limit. */
-        if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                 unsigned long nr_pages;
 
                 /*
@@ -1565,7 +1547,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                  */
                 nr_pages = count_vma_pages_range(mm, addr, addr + len);
 
-                if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+                if (!may_expand_vm(mm, vm_flags,
+                                        (len >> PAGE_SHIFT) - nr_pages))
                         return -ENOMEM;
         }
 
@@ -1664,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
         perf_event_mmap(vma);
 
-        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
         if (vm_flags & VM_LOCKED) {
                 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
                                         vma == get_gate_vma(current->mm)))
@@ -2111,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
         unsigned long new_start, actual_size;
 
         /* address space limit tests */
-        if (!may_expand_vm(mm, grow))
+        if (!may_expand_vm(mm, vma->vm_flags, grow))
                 return -ENOMEM;
 
         /* Stack limit test */
@@ -2208,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                                 spin_lock(&mm->page_table_lock);
                                 if (vma->vm_flags & VM_LOCKED)
                                         mm->locked_vm += grow;
-                                vm_stat_account(mm, vma->vm_flags,
-                                                vma->vm_file, grow);
+                                vm_stat_account(mm, vma->vm_flags, grow);
                                 anon_vma_interval_tree_pre_update_vma(vma);
                                 vma->vm_end = address;
                                 anon_vma_interval_tree_post_update_vma(vma);
@@ -2284,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma,
                                 spin_lock(&mm->page_table_lock);
                                 if (vma->vm_flags & VM_LOCKED)
                                         mm->locked_vm += grow;
-                                vm_stat_account(mm, vma->vm_flags,
-                                                vma->vm_file, grow);
+                                vm_stat_account(mm, vma->vm_flags, grow);
                                 anon_vma_interval_tree_pre_update_vma(vma);
                                 vma->vm_start = address;
                                 vma->vm_pgoff -= grow;
@@ -2399,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 
                 if (vma->vm_flags & VM_ACCOUNT)
                         nr_accounted += nrpages;
-                vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+                vm_stat_account(mm, vma->vm_flags, -nrpages);
                 vma = remove_vma(vma);
         } while (vma);
         vm_unacct_memory(nr_accounted);
@@ -2769,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
         }
 
         /* Check against address space limits *after* clearing old maps... */
-        if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                 return -ENOMEM;
 
         if (mm->map_count > sysctl_max_map_count)
@@ -2804,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
         perf_event_mmap(vma);
         mm->total_vm += len >> PAGE_SHIFT;
+        mm->data_vm += len >> PAGE_SHIFT;
         if (flags & VM_LOCKED)
                 mm->locked_vm += (len >> PAGE_SHIFT);
         vma->vm_flags |= VM_SOFTDIRTY;
@@ -2995,9 +2977,28 @@ out:
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
  */
-int may_expand_vm(struct mm_struct *mm, unsigned long npages)
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 {
-        return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT;
+        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+                return false;
+
+        if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
+                                (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
+                return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+
+        return true;
+}
+
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+        mm->total_vm += npages;
+
+        if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+                mm->exec_vm += npages;
+        else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+                mm->stack_vm += npages;
+        else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+                mm->data_vm += npages;
 }
 
 static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3079,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping(
         if (ret)
                 goto out;
 
-        mm->total_vm += len >> PAGE_SHIFT;
+        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
 
         perf_event_mmap(vma);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..c764402c464f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
          * even if read-only so there is no need to account for them here
          */
         if (newflags & VM_WRITE) {
+                /* Check space limits when area turns into data. */
+                if (!may_expand_vm(mm, newflags, nrpages) &&
+                                may_expand_vm(mm, oldflags, nrpages))
+                        return -ENOMEM;
                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                                 VM_SHARED|VM_NORESERVE))) {
                         charged = nrpages;
@@ -334,8 +338,8 @@ success:
                 populate_vma_page_range(vma, start, end, NULL);
         }
 
-        vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-        vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+        vm_stat_account(mm, oldflags, -nrpages);
+        vm_stat_account(mm, newflags, nrpages);
         perf_event_mmap(vma);
         return 0;

diff --git a/mm/mremap.c b/mm/mremap.c
index de824e72c3e8..e55b157865d5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
          * If this were a serious issue, we'd add a flag to do_munmap().
          */
         hiwater_vm = mm->hiwater_vm;
-        vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+        vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
 
         /* Tell pfnmap has moved from this vma */
         if (unlikely(vma->vm_flags & VM_PFNMAP))
@@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
                 return ERR_PTR(-EAGAIN);
         }
 
-        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+        if (!may_expand_vm(mm, vma->vm_flags,
+                                (new_len - old_len) >> PAGE_SHIFT))
                 return ERR_PTR(-ENOMEM);
 
         if (vma->vm_flags & VM_ACCOUNT) {
@@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                         goto out;
                 }
 
-                vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+                vm_stat_account(mm, vma->vm_flags, pages);
                 if (vma->vm_flags & VM_LOCKED) {
                         mm->locked_vm += pages;
                         locked = true;
--
cgit v1.2.3
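Taken together with the Rss* breakdown earlier in this series, the reworked accounting is easy to observe from userspace: a private writable anonymous mapping is charged to VmData by vm_stat_account() as soon as mmap() succeeds, while RssAnon grows only once the pages are touched. A hypothetical demo (an illustration, not part of the patch; assumes a kernel with both changes):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void show(const char *when)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f)
                return;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "VmData:", 7) ||
                    !strncmp(line, "RssAnon:", 8))
                        printf("%-8s %s", when, line);
        fclose(f);
}

int main(void)
{
        size_t len = 8 << 20;   /* 8 MiB */
        unsigned char *p;

        show("before");
        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        show("mapped");         /* VmData jumps by 8192 kB here */

        memset(p, 1, len);      /* fault the pages in */
        show("touched");        /* now RssAnon grows as well */
        return 0;
}

The same VmData pool is what RLIMIT_DATA now constrains, where previously only brk()-style allocations were covered.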