From a9aec5881b9d4aca184b29d33484a6a58d23f7f2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:35 -0800 Subject: lib/iomap_copy.c: add __ioread32_copy() Some drivers need to read data out of iomem areas 32-bits at a time. Add an API to do this. Signed-off-by: Stephen Boyd Cc: Bjorn Andersson Cc: David Howells Cc: Hauke Mehrtens Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/io.h b/include/linux/io.h index fffd88d7f426..32403b5716e5 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -29,6 +29,7 @@ struct device; struct resource; __visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU -- cgit v1.2.3 From 243c2137cda52599f6112f52b6be5e61fa6536ae Mon Sep 17 00:00:00 2001 From: Adam Barth Date: Wed, 20 Jan 2016 14:59:09 -0800 Subject: include/linux/radix-tree.h: fix error in docs about locks This text refers to the "first 7 functions", which was correct when written but became incorrect when Johannes Weiner added another function to the list in 139e561660fe ("lib: radix_tree: tree node interface"). Change the text to correctly refer to the first 8 functions. Signed-off-by: Adam Barth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 33170dbd9db4..57e7d87d2d4c 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -154,7 +154,7 @@ do { \ * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 7 functions are able to be called locklessly, using RCU. The + * The first 8 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be running concurrently. -- cgit v1.2.3 From df0108c5da561c66c333bb46bfe3c1fc65905898 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 20 Jan 2016 14:59:24 -0800 Subject: epoll: add EPOLLEXCLUSIVE flag Currently, epoll file descriptors or epfds (the fd returned from epoll_create[1]()) that are added to a shared wakeup source are always added in a non-exclusive manner. This means that when we have multiple epfds attached to a shared fd source they are all woken up. This creates thundering-herd behavior. Introduce a new 'EPOLLEXCLUSIVE' flag that can be passed as part of the 'event' argument during an epoll_ctl() EPOLL_CTL_ADD operation. This new flag allows for exclusive wakeups when there are multiple epfds attached to a shared fd event source. The implementation walks the list of exclusive waiters, and queues an event to each epfd, until it finds the first waiter that has threads blocked on it via epoll_wait(). The idea is to search for threads which are idle and ready to process the wakeup events. Thus, we queue an event to at least one epfd, but may still potentially queue an event to all epfds that are attached to the shared fd source. Performance testing was done by Madars Vitolins using a modified version of Enduro/X. The use of the 'EPOLLEXCLUSIVE' flag reduced the run time of this particular workload from 860s down to 24s.
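[editor's note: for illustration only — a minimal userspace sketch of the new flag, not part of the patch; it assumes kernel headers that define EPOLLEXCLUSIVE and a shared listening socket listen_fd:

	#include <sys/epoll.h>
	#include <unistd.h>

	/* Each worker creates its own epoll instance and attaches the shared
	 * fd with EPOLLEXCLUSIVE, so a single event no longer wakes every
	 * worker (the thundering herd described above). */
	static int attach_exclusive(int listen_fd)
	{
		struct epoll_event ev;
		int epfd = epoll_create1(0);

		if (epfd < 0)
			return -1;
		ev.events = EPOLLIN | EPOLLEXCLUSIVE;	/* valid only with EPOLL_CTL_ADD */
		ev.data.fd = listen_fd;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) {
			close(epfd);
			return -1;
		}
		return epfd;
	}
]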
Sample epoll_ctl text: EPOLLEXCLUSIVE Sets an exclusive wakeup mode for the epfd file descriptor that is being attached to the target file descriptor, fd. Thus, when an event occurs and multiple epfd file descriptors are attached to the same target file using EPOLLEXCLUSIVE, one or more epfds will receive an event with epoll_wait(2). The default in this scenario (when EPOLLEXCLUSIVE is not set) is for all epfds to receive an event. EPOLLEXCLUSIVE may only be specified with the op EPOLL_CTL_ADD. Signed-off-by: Jason Baron Tested-by: Madars Vitolins Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Al Viro Cc: Michael Kerrisk Cc: Eric Wong Cc: Jonathan Corbet Cc: Andy Lutomirski Cc: Hagen Paul Pfeifer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 24 +++++++++++++++++++++--- include/uapi/linux/eventpoll.h | 3 +++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8d5c..ae1dbcf47e97 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -92,7 +92,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1002,6 +1002,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; @@ -1066,8 +1067,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ - if (waitqueue_active(&ep->wq)) + if (waitqueue_active(&ep->wq)) { + ewake = 1; wake_up_locked(&ep->wq); + } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1078,6 +1081,9 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + if (epi->event.events & EPOLLEXCLUSIVE) + return ewake; + return 1; } @@ -1095,7 +1101,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; - add_wait_queue(whead, &pwq->wait); + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { @@ -1861,6 +1870,15 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; + /* + * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, + * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation. + * Also, we do not currently support nested exclusive wakeups. + */ + if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || + (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) + goto error_tgt_fput; + /* * At this point it is safe to assume that the "private_data" contains * our own data structure.
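[editor's note: the 'return ewake' added above is what makes the exclusive semantics work — __wake_up_common() stops walking the waitqueue once an exclusive waiter's callback returns nonzero and the wakeup budget nr_exclusive is used up; a paraphrased sketch of that core loop, not part of this patch:

	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		unsigned flags = curr->flags;
		if (curr->func(curr, mode, wake_flags, key) &&
				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}

so ep_poll_callback() returning 0 for an EPOLLEXCLUSIVE epfd with no blocked epoll_wait() callers lets the wakeup proceed to the next exclusive waiter.]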
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2e1f0e..1c3154913a39 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,9 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Set exclusive wakeup mode for the target file descriptor */ +#define EPOLLEXCLUSIVE (1 << 28) + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. -- cgit v1.2.3 From caaee6234d05a58c5b4d05e7bf766131b810a657 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: ptrace: use fsuid, fsgid, effective creds for fs access checks By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. 
Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- fs/proc/base.c | 21 +++++++++++---------- fs/proc/namespaces.c | 4 ++-- include/linux/ptrace.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 2 +- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/kcmp.c | 4 ++-- kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++-------- mm/process_vm_access.c | 2 +- security/commoncap.c | 7 ++++++- 11 files changed, 80 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2cf5d7e37375..e665097c1da5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1860,7 +1861,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2013,7 +2014,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; 
result = -ENOENT; @@ -2066,7 +2067,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2533,7 +2534,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 1dece8781f91..276f12431dbf 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -46,7 +46,7 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -67,7 +67,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. */ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index bf8244190d0f..c0957416b32e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3376,7 +3376,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. 
*/ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index c6f514573b28..0773f2b23b10 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2884,7 +2884,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aa94aee9d4c9..2341efe7fe02 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). 
+ */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index 1832cf701c3d..48071ed7c445 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -137,12 +137,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; -- cgit v1.2.3 From 4b804c85dc37db6c108832b28cd54673ff7ee037 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:19 -0800 Subject: kernel/cpu.c: export __cpu_*_mask Exporting the cpumasks __cpu_possible_mask and friends will allow us to remove the extra indirection through the cpu_*_mask variables. It will also allow the set_cpu_* functions to become static inlines, which will give a .text reduction. 
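[editor's note: a sketch of the indirection this three-patch series removes, summarizing the diffs that follow rather than introducing new API:

	/* before: every user loads the pointer variable, then the mask */
	extern const struct cpumask *const cpu_online_mask;	/* = &__cpu_online_mask */

	/* after: the mask itself is visible and the old name becomes a
	 * constant expression, so the pointer load disappears */
	extern struct cpumask __cpu_online_mask;
	#define cpu_online_mask	((const struct cpumask *)&__cpu_online_mask)
]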
Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 ++++ kernel/cpu.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..d4545a1852f2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -89,6 +89,10 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern struct cpumask __cpu_possible_mask; +extern struct cpumask __cpu_online_mask; +extern struct cpumask __cpu_present_mask; +extern struct cpumask __cpu_active_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a96b713cea7..35d1d45be8e9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,23 +759,27 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static struct cpumask __cpu_possible_mask __read_mostly +struct cpumask __cpu_possible_mask __read_mostly = {CPU_BITS_ALL}; #else -static struct cpumask __cpu_possible_mask __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif +EXPORT_SYMBOL(__cpu_possible_mask); const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; EXPORT_SYMBOL(cpu_possible_mask); -static struct cpumask __cpu_online_mask __read_mostly; +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); const struct cpumask *const cpu_online_mask = &__cpu_online_mask; EXPORT_SYMBOL(cpu_online_mask); -static struct cpumask __cpu_present_mask __read_mostly; +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); const struct cpumask *const cpu_present_mask = &__cpu_present_mask; EXPORT_SYMBOL(cpu_present_mask); -static struct cpumask __cpu_active_mask __read_mostly; +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); const struct cpumask *const cpu_active_mask = &__cpu_active_mask; EXPORT_SYMBOL(cpu_active_mask); -- cgit v1.2.3 From 5aec01b834fd6f8ca49d1aeede665b950d0c148e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:25 -0800 Subject: kernel/cpu.c: eliminate cpu_*_mask Replace the variables cpu_possible_mask, cpu_online_mask, cpu_present_mask and cpu_active_mask with macros expanding to expressions of the same type and value, eliminating some indirection. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 8 ++++---- kernel/cpu.c | 8 -------- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4545a1852f2..52ab539aefce 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -85,14 +85,14 @@ extern int nr_cpu_ids; * only one CPU. 
*/ -extern const struct cpumask *const cpu_possible_mask; -extern const struct cpumask *const cpu_online_mask; -extern const struct cpumask *const cpu_present_mask; -extern const struct cpumask *const cpu_active_mask; extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; +#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) +#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) +#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) +#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 35d1d45be8e9..8734fc74fcbc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -765,23 +765,15 @@ struct cpumask __cpu_possible_mask __read_mostly struct cpumask __cpu_possible_mask __read_mostly; #endif EXPORT_SYMBOL(__cpu_possible_mask); -const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; -EXPORT_SYMBOL(cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); -const struct cpumask *const cpu_online_mask = &__cpu_online_mask; -EXPORT_SYMBOL(cpu_online_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); -const struct cpumask *const cpu_present_mask = &__cpu_present_mask; -EXPORT_SYMBOL(cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -const struct cpumask *const cpu_active_mask = &__cpu_active_mask; -EXPORT_SYMBOL(cpu_active_mask); void set_cpu_possible(unsigned int cpu, bool possible) { -- cgit v1.2.3 From 9425676a363c0976e3d43dda792dc4711a651d1d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:28 -0800 Subject: kernel/cpu.c: make set_cpu_* static inlines Almost all callers of the set_cpu_* functions pass an explicit true or false. Making them static inline thus replaces the function calls with a simple set_bit/clear_bit, saving some .text. 
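[editor's note: concretely, once the wrappers are static inlines a constant argument folds the branch away, e.g.

	set_cpu_present(cpu, true);
	/* compiles down to nothing more than: */
	cpumask_set_cpu(cpu, &__cpu_present_mask);

which is where the .text saving comes from.]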
Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 43 +++++++++++++++++++++++++++++++++++++++---- kernel/cpu.c | 34 ---------------------------------- 2 files changed, 39 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 52ab539aefce..fc14275ff34e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -720,14 +720,49 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -void set_cpu_possible(unsigned int cpu, bool possible); -void set_cpu_present(unsigned int cpu, bool present); -void set_cpu_online(unsigned int cpu, bool online); -void set_cpu_active(unsigned int cpu, bool active); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); +static inline void +set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &__cpu_possible_mask); + else + cpumask_clear_cpu(cpu, &__cpu_possible_mask); +} + +static inline void +set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &__cpu_present_mask); + else + cpumask_clear_cpu(cpu, &__cpu_present_mask); +} + +static inline void +set_cpu_online(unsigned int cpu, bool online) +{ + if (online) { + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); + } else { + cpumask_clear_cpu(cpu, &__cpu_online_mask); + } +} + +static inline void +set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &__cpu_active_mask); + else + cpumask_clear_cpu(cpu, &__cpu_active_mask); +} + + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap diff --git a/kernel/cpu.c b/kernel/cpu.c index 8734fc74fcbc..5b9d39633ce9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -775,40 +775,6 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &__cpu_possible_mask); - else - cpumask_clear_cpu(cpu, &__cpu_possible_mask); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &__cpu_present_mask); - else - cpumask_clear_cpu(cpu, &__cpu_present_mask); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { - cpumask_clear_cpu(cpu, &__cpu_online_mask); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &__cpu_active_mask); - else - cpumask_clear_cpu(cpu, &__cpu_active_mask); -} - void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); -- cgit v1.2.3 From 978e30c9b46161c792ecdad0091fd017b21b8ca5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:36 -0800 Subject: kexec: move some members and definitions within the scope of CONFIG_KEXEC_FILE Move the stuff currently only used by the kexec file code within CONFIG_KEXEC_FILE (and CONFIG_KEXEC_VERIFY_SIG).
Also move internal "struct kexec_sha_region" and "struct kexec_buf" into "kexec_internal.h". Signed-off-by: Xunlei Pang Cc: "Eric W. Biederman" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 2 ++ include/linux/kexec.h | 62 +++++++++++++++----------------------- kernel/kexec_file.c | 2 ++ kernel/kexec_internal.h | 21 +++++++++++++ 4 files changed, 50 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 819ab3f9c9c7..ba7fbba9831b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +#ifdef CONFIG_KEXEC_VERIFY_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, unsigned long kernel_len) { @@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, return image->fops->verify_sig(kernel, kernel_len); } +#endif /* * Apply purgatory relocations. diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 7b68d2788a56..2cc643c6e870 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -109,11 +109,7 @@ struct compat_kexec_segment { }; #endif -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* Pointer to elf header of read only purgatory */ Elf_Ehdr *ehdr; @@ -130,6 +126,28 @@ struct purgatory_info { unsigned long purgatory_load_addr; }; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); + +#ifdef CONFIG_KEXEC_VERIFY_SIG +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); +#endif + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +#ifdef CONFIG_KEXEC_VERIFY_SIG + kexec_verify_sig_t *verify_sig; +#endif +}; +#endif + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -161,6 +179,7 @@ struct kimage { struct kimage_arch arch; #endif +#ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; @@ -179,38 +198,7 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; -}; - -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. 
- */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - -typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); -typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(void *loader_data); -typedef int (kexec_verify_sig_t)(const char *kernel_buf, - unsigned long kernel_len); - -struct kexec_file_ops { - kexec_probe_t *probe; - kexec_load_t *load; - kexec_cleanup_t *cleanup; - kexec_verify_sig_t *verify_sig; +#endif }; /* kexec interface functions */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..007b791f676d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698ad4..0a52315d9c62 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v1.2.3 From a460bece027301e079b9e53c5e0f67c8e3eaebc1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Jan 2016 15:00:42 -0800 Subject: rbtree: use READ_ONCE in RB_EMPTY_ROOT With commit d72da4a4d97 ("rbtree: Make lockless searches non-fatal") our rbtrees provide weak guarantees that allow us to do lockless (and very speculative) reads of the tree. Such readers cannot see partial stores on nodes, i.e. left/right as well as root. As such, similar to the WRITE_ONCE semantics when doing rotations, use READ_ONCE when checking the root node in RB_EMPTY_ROOT.
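[editor's note: a hedged sketch of the lockless pattern this protects — 'my_tree' and lockless_lookup() are illustrative names, not kernel API:

	/* With READ_ONCE() in RB_EMPTY_ROOT() the compiler can neither
	 * tear nor re-load root->rb_node while a writer concurrently
	 * publishes nodes with WRITE_ONCE() during rotations. */
	rcu_read_lock();
	if (!RB_EMPTY_ROOT(&my_tree))
		entry = lockless_lookup(&my_tree, key);	/* hypothetical RCU-safe search */
	rcu_read_unlock();
]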
Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Michel Lespinasse Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index a5aa7ae671f4..b6900099ea81 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -50,7 +50,7 @@ struct rb_root { #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ -- cgit v1.2.3 From c6d308534aef6c99904bf5862066360ae067abc4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:55 -0800 Subject: UBSAN: run-time undefined behavior sanity checker UBSAN uses compile-time instrumentation to catch undefined behavior (UB). The compiler inserts code that performs certain kinds of checks before operations that could cause UB. If a check fails (i.e. UB is detected), a __ubsan_handle_* function is called to print an error message. So most of the work is done by the compiler. This patch just implements the ubsan handlers that print the errors. GCC has had this capability since 4.9.x [1] (see -fsanitize=undefined option and its suboptions). However GCC 5.x has more checkers implemented [2]. Article [3] has a bit more detail about UBSAN in GCC. [1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html [2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html [3] - http://developerblog.redhat.com/2014/10/16/gcc-undefined-behavior-sanitizer-ubsan/ Issues which UBSAN has found thus far are: Found bugs: * out-of-bounds access - 97840cb67ff5 ("netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind") undefined shifts: * d48458d4a768 ("jbd2: use a better hash function for the revoke table") * 10632008b9e1 ("clockevents: Prevent shift out of bounds") * 'x << -1' shift in ext4 - http://lkml.kernel.org/r/<5444EF21.8020501@samsung.com> * undefined rol32(0) - http://lkml.kernel.org/r/<1449198241-20654-1-git-send-email-sasha.levin@oracle.com> * undefined dirty_ratelimit calculation - http://lkml.kernel.org/r/<566594E2.3050306@odin.com> * undefined rounddown_pow_of_two(0) - http://lkml.kernel.org/r/<1449156616-11474-1-git-send-email-sasha.levin@oracle.com> * [WONTFIX] undefined shift in __bpf_prog_run - http://lkml.kernel.org/r/ WONTFIX here because it should be fixed in the bpf program, not in the kernel. signed overflows: * 32a8df4e0b33f ("sched: Fix odd values in effective_load() calculations") * mul overflow in ntp - http://lkml.kernel.org/r/<1449175608-1146-1-git-send-email-sasha.levin@oracle.com> * incorrect conversion into rtc_time in rtc_time64_to_tm() - http://lkml.kernel.org/r/<1449187944-11730-1-git-send-email-sasha.levin@oracle.com> * unvalidated timespec in io_getevents() - http://lkml.kernel.org/r/ * [NOTABUG] signed overflow in ktime_add_safe() - http://lkml.kernel.org/r/ [akpm@linux-foundation.org: fix unused local warning] [akpm@linux-foundation.org: fix __int128 build woes] Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin"
Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ubsan.txt | 84 +++++++ Makefile | 3 +- arch/x86/Kconfig | 1 + arch/x86/boot/Makefile | 1 + arch/x86/boot/compressed/Makefile | 1 + arch/x86/entry/vdso/Makefile | 1 + arch/x86/realmode/rm/Makefile | 1 + drivers/firmware/efi/libstub/Makefile | 1 + include/linux/sched.h | 3 + lib/Kconfig.debug | 2 + lib/Kconfig.ubsan | 29 +++ lib/Makefile | 3 + lib/ubsan.c | 456 ++++++++++++++++++++++++++++++++++ lib/ubsan.h | 84 +++++++ mm/kasan/Makefile | 1 + scripts/Makefile.lib | 6 + scripts/Makefile.ubsan | 17 ++ 17 files changed, 693 insertions(+), 1 deletion(-) create mode 100644 Documentation/ubsan.txt create mode 100644 lib/Kconfig.ubsan create mode 100644 lib/ubsan.c create mode 100644 lib/ubsan.h create mode 100644 scripts/Makefile.ubsan (limited to 'include') diff --git a/Documentation/ubsan.txt b/Documentation/ubsan.txt new file mode 100644 index 000000000000..f58215ef5797 --- /dev/null +++ b/Documentation/ubsan.txt @@ -0,0 +1,84 @@ +Undefined Behavior Sanitizer - UBSAN + +Overview +-------- + +UBSAN is a runtime undefined behaviour checker. + +UBSAN uses compile-time instrumentation to catch undefined behavior (UB). +The compiler inserts code that performs certain kinds of checks before +operations that may cause UB. If a check fails (i.e. UB is detected), a +__ubsan_handle_* function is called to print an error message. + +GCC has had this feature since 4.9.x [1] (see -fsanitize=undefined option and +its suboptions). GCC 5.x has more checkers implemented [2]. + +Report example +--------------- + + ================================================================================ + UBSAN: Undefined behaviour in ../include/linux/bitops.h:110:33 + shift exponent 32 is too large for 32-bit type 'unsigned int' + CPU: 0 PID: 0 Comm: swapper Not tainted 4.4.0-rc1+ #26 + 0000000000000000 ffffffff82403cc8 ffffffff815e6cd6 0000000000000001 + ffffffff82403cf8 ffffffff82403ce0 ffffffff8163a5ed 0000000000000020 + ffffffff82403d78 ffffffff8163ac2b ffffffff815f0001 0000000000000002 + Call Trace: + [] dump_stack+0x45/0x5f + [] ubsan_epilogue+0xd/0x40 + [] __ubsan_handle_shift_out_of_bounds+0xeb/0x130 + [] ? radix_tree_gang_lookup_slot+0x51/0x150 + [] _mix_pool_bytes+0x1e6/0x480 + [] ? dmi_walk_early+0x48/0x5c + [] add_device_randomness+0x61/0x130 + [] ? dmi_save_one_device+0xaa/0xaa + [] dmi_walk_early+0x48/0x5c + [] dmi_scan_machine+0x278/0x4b4 + [] ? vprintk_default+0x1a/0x20 + [] ? early_idt_handler_array+0x120/0x120 + [] setup_arch+0x405/0xc2c + [] ? early_idt_handler_array+0x120/0x120 + [] start_kernel+0x83/0x49a + [] ? early_idt_handler_array+0x120/0x120 + [] x86_64_start_reservations+0x2a/0x2c + [] x86_64_start_kernel+0x16b/0x17a + ================================================================================ + +Usage +----- + +To enable UBSAN, configure the kernel with: + + CONFIG_UBSAN=y + +and to check the entire kernel: + + CONFIG_UBSAN_SANITIZE_ALL=y + +To enable instrumentation for specific files or directories, add a line +similar to the following to the respective kernel Makefile: + + For a single file (e.g. main.o):
+ UBSAN_SANITIZE_main.o := y + + For all files in one directory: + UBSAN_SANITIZE := y + +To exclude files from being instrumented even if +CONFIG_UBSAN_SANITIZE_ALL=y, use: + + UBSAN_SANITIZE_main.o := n + and: + UBSAN_SANITIZE := n + +Detection of unaligned accesses is controlled through a separate option, +CONFIG_UBSAN_ALIGNMENT. It's off by default on architectures that support +unaligned accesses (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y). It can +still be enabled in the config, just note that it will produce a lot of UBSAN +reports. + +References +---------- + +[1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html +[2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html diff --git a/Makefile b/Makefile index 7f4ac1ee4a2b..abfb3e8eb0b1 100644 --- a/Makefile +++ b/Makefile @@ -411,7 +411,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -784,6 +784,7 @@ endif include scripts/Makefile.kasan include scripts/Makefile.extrawarn +include scripts/Makefile.ubsan # Add any arch overrides and user supplied CPPFLAGS, AFLAGS and CFLAGS as the # last assignments diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4a10ba9e95da..92b2a73162ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 2ee62dba0373..bbe1a62efc02 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -60,6 +60,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0a291cdfaf77..f9ce75d80101 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -33,6 +33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 265c0ed68118..c854541d93ff 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,6 +4,7 @@ KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n +UBSAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2730d775ef9a..3e75fcf6b836 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 9c12e18031d5..aaf9c0bab42e 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -22,6 +22,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ GCOV_PROFILE := n KASAN_SANITIZE := n +UBSAN_SANITIZE := n lib-y := efi-stub-helper.o diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bbea871..02dabf281b2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1643,6 +1643,9 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; gfp_t lockdep_reclaim_gfp; #endif +#ifdef CONFIG_UBSAN + unsigned int in_ubsan; +#endif /* journalling filesystem info */ void *journal_info; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f75a33f29f6e..157220b9ff05 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1893,6 +1893,8 @@ source "samples/Kconfig" source "lib/Kconfig.kgdb" +source "lib/Kconfig.ubsan" + config ARCH_HAS_DEVMEM_IS_ALLOWED bool diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan new file mode 100644 index 000000000000..49518fb48cab --- /dev/null +++ b/lib/Kconfig.ubsan @@ -0,0 +1,29 @@ +config ARCH_HAS_UBSAN_SANITIZE_ALL + bool + +config UBSAN + bool "Undefined behaviour sanity checker" + help + This option enables the undefined behaviour sanity checker. + Compile-time instrumentation is used to detect various undefined + behaviours at runtime. Various types of checks may be enabled + via boot parameter ubsan_handle (see: Documentation/ubsan.txt). + +config UBSAN_SANITIZE_ALL + bool "Enable instrumentation for the entire kernel" + depends on UBSAN + depends on ARCH_HAS_UBSAN_SANITIZE_ALL + default y + help + This option activates instrumentation for the entire kernel. + If you don't enable this option, you have to explicitly specify + UBSAN_SANITIZE := y for the files/directories you want to check for UB. + +config UBSAN_ALIGNMENT + bool "Enable checking of pointer alignment" + depends on UBSAN + default y if !HAVE_EFFICIENT_UNALIGNED_ACCESS + help + This option enables detection of unaligned memory accesses. + Enabling this option on architectures that support unaligned + accesses may produce a lot of false positives. diff --git a/lib/Makefile b/lib/Makefile index b2a82e600987..2d4bc33d09b4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -209,3 +209,6 @@ quiet_cmd_build_OID_registry = GEN $@ clean-files += oid_registry_data.c obj-$(CONFIG_UCS2_STRING) += ucs2_string.o +obj-$(CONFIG_UBSAN) += ubsan.o + +UBSAN_SANITIZE_ubsan.o := n diff --git a/lib/ubsan.c b/lib/ubsan.c new file mode 100644 index 000000000000..8799ae5e2e42 --- /dev/null +++ b/lib/ubsan.c @@ -0,0 +1,456 @@ +/* + * UBSAN error reporting functions + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ubsan.h" + +const char *type_check_kinds[] = { + "load of", + "store to", + "reference binding to", + "member access within", + "member call on", + "constructor call on", + "downcast of", + "downcast of" +}; + +#define REPORTED_BIT 31 + +#if (BITS_PER_LONG == 64) && defined(__BIG_ENDIAN) +#define COLUMN_MASK (~(1U << REPORTED_BIT)) +#define LINE_MASK (~0U) +#else +#define COLUMN_MASK (~0U) +#define LINE_MASK (~(1U << REPORTED_BIT)) +#endif + +#define VALUE_LENGTH 40 + +static bool was_reported(struct source_location *location) +{ + return test_and_set_bit(REPORTED_BIT, &location->reported); +} + +static void print_source_location(const char *prefix, + struct source_location *loc) +{ + pr_err("%s %s:%d:%d\n", prefix, loc->file_name, + loc->line & LINE_MASK, loc->column & COLUMN_MASK); +} + +static bool suppress_report(struct source_location *loc) +{ + return current->in_ubsan || was_reported(loc); +} + +static bool type_is_int(struct type_descriptor *type) +{ + return type->type_kind == type_kind_int; +} + +static bool type_is_signed(struct type_descriptor *type) +{ + WARN_ON(!type_is_int(type)); + return type->type_info & 1; +} + +static unsigned type_bit_width(struct type_descriptor *type) +{ + return 1 << (type->type_info >> 1); +} + +static bool is_inline_int(struct type_descriptor *type) +{ + unsigned inline_bits = sizeof(unsigned long)*8; + unsigned bits = type_bit_width(type); + + WARN_ON(!type_is_int(type)); + + return bits <= inline_bits; +} + +static s_max get_signed_val(struct type_descriptor *type, unsigned long val) +{ + if (is_inline_int(type)) { + unsigned extra_bits = sizeof(s_max)*8 - type_bit_width(type); + return ((s_max)val) << extra_bits >> extra_bits; + } + + if (type_bit_width(type) == 64) + return *(s64 *)val; + + return *(s_max *)val; +} + +static bool val_is_negative(struct type_descriptor *type, unsigned long val) +{ + return type_is_signed(type) && get_signed_val(type, val) < 0; +} + +static u_max get_unsigned_val(struct type_descriptor *type, unsigned long val) +{ + if (is_inline_int(type)) + return val; + + if (type_bit_width(type) == 64) + return *(u64 *)val; + + return *(u_max *)val; +} + +static void val_to_string(char *str, size_t size, struct type_descriptor *type, + unsigned long value) +{ + if (type_is_int(type)) { + if (type_bit_width(type) == 128) { +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) + u_max val = get_unsigned_val(type, value); + + scnprintf(str, size, "0x%08x%08x%08x%08x", + (u32)(val >> 96), + (u32)(val >> 64), + (u32)(val >> 32), + (u32)(val)); +#else + WARN_ON(1); +#endif + } else if (type_is_signed(type)) { + scnprintf(str, size, "%lld", + (s64)get_signed_val(type, value)); + } else { + scnprintf(str, size, "%llu", + (u64)get_unsigned_val(type, value)); + } + } +} + +static bool location_is_valid(struct source_location *loc) +{ + return loc->file_name != NULL; +} + +static DEFINE_SPINLOCK(report_lock); + +static void ubsan_prologue(struct source_location *location, + unsigned long *flags) +{ + current->in_ubsan++; + spin_lock_irqsave(&report_lock, *flags); + + pr_err("========================================" + "========================================\n"); + print_source_location("UBSAN: Undefined behaviour in", location); +} + +static void ubsan_epilogue(unsigned long *flags) +{ + dump_stack(); + pr_err("========================================" + "========================================\n"); + 
spin_unlock_irqrestore(&report_lock, *flags); + current->in_ubsan--; +} + +static void handle_overflow(struct overflow_data *data, unsigned long lhs, + unsigned long rhs, char op) +{ + + struct type_descriptor *type = data->type; + unsigned long flags; + char lhs_val_str[VALUE_LENGTH]; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs); + val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); + pr_err("%s integer overflow:\n", + type_is_signed(type) ? "signed" : "unsigned"); + pr_err("%s %c %s cannot be represented in type %s\n", + lhs_val_str, + op, + rhs_val_str, + type->type_name); + + ubsan_epilogue(&flags); +} + +void __ubsan_handle_add_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + + handle_overflow(data, lhs, rhs, '+'); +} +EXPORT_SYMBOL(__ubsan_handle_add_overflow); + +void __ubsan_handle_sub_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + handle_overflow(data, lhs, rhs, '-'); +} +EXPORT_SYMBOL(__ubsan_handle_sub_overflow); + +void __ubsan_handle_mul_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + handle_overflow(data, lhs, rhs, '*'); +} +EXPORT_SYMBOL(__ubsan_handle_mul_overflow); + +void __ubsan_handle_negate_overflow(struct overflow_data *data, + unsigned long old_val) +{ + unsigned long flags; + char old_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); + + pr_err("negation of %s cannot be represented in type %s:\n", + old_val_str, data->type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_negate_overflow); + + +void __ubsan_handle_divrem_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + unsigned long flags; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs); + + if (type_is_signed(data->type) && get_signed_val(data->type, rhs) == -1) + pr_err("division of %s by -1 cannot be represented in type %s\n", + rhs_val_str, data->type->type_name); + else + pr_err("division by zero\n"); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_divrem_overflow); + +static void handle_null_ptr_deref(struct type_mismatch_data *data) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("%s null pointer of type %s\n", + type_check_kinds[data->type_check_kind], + data->type->type_name); + + ubsan_epilogue(&flags); +} + +static void handle_missaligned_access(struct type_mismatch_data *data, + unsigned long ptr) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("%s misaligned address %p for type %s\n", + type_check_kinds[data->type_check_kind], + (void *)ptr, data->type->type_name); + pr_err("which requires %ld byte alignment\n", data->alignment); + + ubsan_epilogue(&flags); +} + +static void handle_object_size_mismatch(struct type_mismatch_data *data, + unsigned long ptr) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + pr_err("%s address %pk with 
insufficient space\n", + type_check_kinds[data->type_check_kind], + (void *) ptr); + pr_err("for an object of type %s\n", data->type->type_name); + ubsan_epilogue(&flags); +} + +void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, + unsigned long ptr) +{ + + if (!ptr) + handle_null_ptr_deref(data); + else if (data->alignment && !IS_ALIGNED(ptr, data->alignment)) + handle_missaligned_access(data, ptr); + else + handle_object_size_mismatch(data, ptr); +} +EXPORT_SYMBOL(__ubsan_handle_type_mismatch); + +void __ubsan_handle_nonnull_return(struct nonnull_return_data *data) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("null pointer returned from function declared to never return null\n"); + + if (location_is_valid(&data->attr_location)) + print_source_location("returns_nonnull attribute specified in", + &data->attr_location); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_nonnull_return); + +void __ubsan_handle_vla_bound_not_positive(struct vla_bound_data *data, + unsigned long bound) +{ + unsigned long flags; + char bound_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(bound_str, sizeof(bound_str), data->type, bound); + pr_err("variable length array bound value %s <= 0\n", bound_str); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_vla_bound_not_positive); + +void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, + unsigned long index) +{ + unsigned long flags; + char index_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(index_str, sizeof(index_str), data->index_type, index); + pr_err("index %s is out of range for type %s\n", index_str, + data->array_type->type_name); + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_out_of_bounds); + +void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, + unsigned long lhs, unsigned long rhs) +{ + unsigned long flags; + struct type_descriptor *rhs_type = data->rhs_type; + struct type_descriptor *lhs_type = data->lhs_type; + char rhs_str[VALUE_LENGTH]; + char lhs_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs); + val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs); + + if (val_is_negative(rhs_type, rhs)) + pr_err("shift exponent %s is negative\n", rhs_str); + + else if (get_unsigned_val(rhs_type, rhs) >= + type_bit_width(lhs_type)) + pr_err("shift exponent %s is too large for %u-bit type %s\n", + rhs_str, + type_bit_width(lhs_type), + lhs_type->type_name); + else if (val_is_negative(lhs_type, lhs)) + pr_err("left shift of negative value %s\n", + lhs_str); + else + pr_err("left shift of %s by %s places cannot be" + " represented in type %s\n", + lhs_str, rhs_str, + lhs_type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); + + +void __noreturn +__ubsan_handle_builtin_unreachable(struct unreachable_data *data) +{ + unsigned long flags; + + ubsan_prologue(&data->location, &flags); + pr_err("calling __builtin_unreachable()\n"); + ubsan_epilogue(&flags); + panic("can't return from __builtin_unreachable()"); +} +EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable); + +void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, + unsigned long 
val) +{ + unsigned long flags; + char val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(val_str, sizeof(val_str), data->type, val); + + pr_err("load of value %s is not a valid value for type %s\n", + val_str, data->type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_load_invalid_value); diff --git a/lib/ubsan.h b/lib/ubsan.h new file mode 100644 index 000000000000..b2d18d4a53f5 --- /dev/null +++ b/lib/ubsan.h @@ -0,0 +1,84 @@ +#ifndef _LIB_UBSAN_H +#define _LIB_UBSAN_H + +enum { + type_kind_int = 0, + type_kind_float = 1, + type_unknown = 0xffff +}; + +struct type_descriptor { + u16 type_kind; + u16 type_info; + char type_name[1]; +}; + +struct source_location { + const char *file_name; + union { + unsigned long reported; + struct { + u32 line; + u32 column; + }; + }; +}; + +struct overflow_data { + struct source_location location; + struct type_descriptor *type; +}; + +struct type_mismatch_data { + struct source_location location; + struct type_descriptor *type; + unsigned long alignment; + unsigned char type_check_kind; +}; + +struct nonnull_arg_data { + struct source_location location; + struct source_location attr_location; + int arg_index; +}; + +struct nonnull_return_data { + struct source_location location; + struct source_location attr_location; +}; + +struct vla_bound_data { + struct source_location location; + struct type_descriptor *type; +}; + +struct out_of_bounds_data { + struct source_location location; + struct type_descriptor *array_type; + struct type_descriptor *index_type; +}; + +struct shift_out_of_bounds_data { + struct source_location location; + struct type_descriptor *lhs_type; + struct type_descriptor *rhs_type; +}; + +struct unreachable_data { + struct source_location location; +}; + +struct invalid_value_data { + struct source_location location; + struct type_descriptor *type; +}; + +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) +typedef __int128 s_max; +typedef unsigned __int128 u_max; +#else +typedef s64 s_max; +typedef u64 u_max; +#endif + +#endif diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 64710148941e..a61460d9f5b0 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 39d6bb18ce76..2edbcadb3d7f 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -130,6 +130,12 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_KASAN)) endif +ifeq ($(CONFIG_UBSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ + $(CFLAGS_UBSAN)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). 
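To make the runtime reports above concrete: the handlers in lib/ubsan.c fire when kernel code built with the -fsanitize= options (collected by scripts/Makefile.ubsan, just below) performs one of the flagged operations. The following userspace sketch is an illustration only, not part of the patch; built with the same GCC options, it trips three of the checks (the signed-overflow handler it exercises is declared elsewhere in lib/ubsan.c):

    /* Illustrative userspace demo, not kernel code. Build with:
     *   gcc -fsanitize=shift,bounds,signed-integer-overflow demo.c -o demo
     * Each marked line produces a runtime report from the sanitizer
     * callback of the same name as the kernel-side handler.
     */
    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            int arr[4] = { 0 };
            volatile int idx = 5;        /* past the end of arr */
            volatile int shift = 35;     /* wider than a 32-bit int */
            volatile int big = INT_MAX;

            printf("%d\n", arr[idx]);    /* __ubsan_handle_out_of_bounds */
            printf("%d\n", 1 << shift);  /* __ubsan_handle_shift_out_of_bounds */
            printf("%d\n", big + 1);     /* signed-overflow handler */
            return 0;
    }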
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan new file mode 100644 index 000000000000..8ab68679cfb5 --- /dev/null +++ b/scripts/Makefile.ubsan @@ -0,0 +1,17 @@ +ifdef CONFIG_UBSAN + CFLAGS_UBSAN += $(call cc-option, -fsanitize=shift) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=vla-bound) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=null) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=returns-nonnull-attribute) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum) + +ifdef CONFIG_UBSAN_ALIGNMENT + CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment) +endif +endif -- cgit v1.2.3 From 06af1c52c9ea234e0b1266cc0b52c3e0c6c8fe9f Mon Sep 17 00:00:00 2001 From: Bongkyu Kim Date: Wed, 20 Jan 2016 15:01:08 -0800 Subject: lz4: fix wrong compress buffer size for 64-bits The current lz4 compress buffer is 16kb on 32-bit systems but 32kb on 64-bit systems, while lz4 needs only 16kb on both. On 64-bit systems the oversized buffer wastes cpu cycles on an additional memset during every compression. In the case of lz4hc, the current buffer size is (256kb + 8) on 32-bit and (512kb + 16) on 64-bit, while lz4hc needs only (256kb + 2 * sizeof(pointer)) on both. This patch fixes these oversized compress buffers for 64-bit systems. Signed-off-by: Bongkyu Kim Cc: Chanho Min Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 4356686b0a39..6b784c59f321 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -9,8 +9,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) -#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) +#define LZ4_MEM_COMPRESS (16384) +#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) /* * lz4_compressbound() -- cgit v1.2.3 From 2954e440be7305134be632a94536b412899490f7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 20 Jan 2016 15:01:11 -0800 Subject: ipc/shm.c: is_file_shm_hugepages() can be boolean Make is_file_shm_hugepages() return bool to improve readability, since the function only ever returns one or zero. No functional change.
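To check the arithmetic in the lz4 sizing fix above: the old definitions scaled a fixed element count by the pointer size, which is exactly where the 64-bit doubling came from. A small userspace program confirms the numbers (illustrative only, not part of either patch):

    #include <stdio.h>

    int main(void)
    {
            /* Old definitions scaled with sizeof(pointer): 16kb on
             * 32-bit but 32kb on 64-bit, twice what lz4 needs. */
            printf("old LZ4_MEM_COMPRESS:   %zu\n", 4096 * sizeof(unsigned char *));
            printf("old LZ4HC_MEM_COMPRESS: %zu\n", 65538 * sizeof(unsigned char *));
            /* New definitions track what the algorithms actually
             * need, independent of the pointer size. */
            printf("new LZ4_MEM_COMPRESS:   %d\n", 16384);
            printf("new LZ4HC_MEM_COMPRESS: %zu\n", 262144 + 2 * sizeof(unsigned char *));
            return 0;
    }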
Signed-off-by: Yaowei Bai Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shm.h | 6 +++--- ipc/shm.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/shm.h b/include/linux/shm.h index 6fb801686ad6..04e881829625 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -52,7 +52,7 @@ struct sysv_shm { long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); -int is_file_shm_hugepages(struct file *file); +bool is_file_shm_hugepages(struct file *file); void exit_shm(struct task_struct *task); #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else @@ -66,9 +66,9 @@ static inline long do_shmat(int shmid, char __user *shmaddr, { return -ENOSYS; } -static inline int is_file_shm_hugepages(struct file *file) +static inline bool is_file_shm_hugepages(struct file *file) { - return 0; + return false; } static inline void exit_shm(struct task_struct *task) { diff --git a/ipc/shm.c b/ipc/shm.c index 41787276e141..ed3027d0f277 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -459,7 +459,7 @@ static const struct file_operations shm_file_operations_huge = { .fallocate = shm_fallocate, }; -int is_file_shm_hugepages(struct file *file) +bool is_file_shm_hugepages(struct file *file) { return file->f_op == &shm_file_operations_huge; } -- cgit v1.2.3 From e1c7e324539ada3b2b13ca2898bcb4948a9ef9db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:05 -0800 Subject: dma-mapping: always provide the dma_map_ops based implementation Move the generic implementation to <linux/dma-mapping.h> now that all architectures support it and remove the HAVE_DMA_ATTRS Kconfig symbol now that everyone supports them. [valentinrothberg@gmail.com: remove leftovers in Kconfig] Signed-off-by: Christoph Hellwig Cc: "David S.
Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DMA-API-HOWTO.txt | 10 - .../features/io/dma_map_attrs/arch-support.txt | 40 --- arch/Kconfig | 3 - arch/alpha/Kconfig | 1 - arch/alpha/include/asm/dma-mapping.h | 2 - arch/arc/Kconfig | 1 - arch/arc/include/asm/dma-mapping.h | 2 - arch/arm/Kconfig | 1 - arch/arm/include/asm/dma-mapping.h | 7 - arch/arm64/Kconfig | 1 - arch/arm64/include/asm/dma-mapping.h | 2 - arch/avr32/Kconfig | 1 - arch/avr32/include/asm/dma-mapping.h | 2 - arch/blackfin/Kconfig | 1 - arch/blackfin/include/asm/dma-mapping.h | 2 - arch/c6x/Kconfig | 1 - arch/c6x/include/asm/dma-mapping.h | 2 - arch/cris/Kconfig | 1 - arch/cris/include/asm/dma-mapping.h | 2 - arch/frv/Kconfig | 1 - arch/frv/include/asm/dma-mapping.h | 2 - arch/h8300/Kconfig | 1 - arch/h8300/include/asm/dma-mapping.h | 2 - arch/hexagon/Kconfig | 1 - arch/hexagon/include/asm/dma-mapping.h | 2 - arch/ia64/Kconfig | 1 - arch/ia64/include/asm/dma-mapping.h | 2 - arch/m68k/Kconfig | 1 - arch/m68k/include/asm/dma-mapping.h | 2 - arch/metag/Kconfig | 1 - arch/metag/include/asm/dma-mapping.h | 2 - arch/microblaze/Kconfig | 1 - arch/microblaze/include/asm/dma-mapping.h | 2 - arch/mips/Kconfig | 1 - arch/mips/include/asm/dma-mapping.h | 2 - arch/mn10300/Kconfig | 1 - arch/mn10300/include/asm/dma-mapping.h | 2 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 3 - arch/openrisc/include/asm/dma-mapping.h | 2 - arch/parisc/Kconfig | 1 - arch/parisc/include/asm/dma-mapping.h | 2 - arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/dma-mapping.h | 2 - arch/s390/Kconfig | 1 - arch/s390/include/asm/dma-mapping.h | 2 - arch/sh/Kconfig | 1 - arch/sh/include/asm/dma-mapping.h | 2 - arch/sparc/Kconfig | 1 - arch/sparc/include/asm/dma-mapping.h | 2 - arch/tile/Kconfig | 1 - arch/tile/include/asm/dma-mapping.h | 3 - arch/unicore32/Kconfig | 1 - arch/unicore32/include/asm/dma-mapping.h | 2 - arch/x86/Kconfig | 1 - arch/x86/include/asm/dma-mapping.h | 2 - arch/xtensa/Kconfig | 1 - arch/xtensa/include/asm/dma-mapping.h | 2 - drivers/gpu/drm/Kconfig | 4 +- drivers/gpu/drm/imx/Kconfig | 2 +- drivers/gpu/drm/rcar-du/Kconfig | 2 +- drivers/gpu/drm/shmobile/Kconfig | 2 +- drivers/gpu/drm/sti/Kconfig | 2 +- drivers/gpu/drm/tilcdc/Kconfig | 2 +- drivers/gpu/drm/vc4/Kconfig | 2 +- drivers/media/platform/Kconfig | 1 - include/asm-generic/dma-mapping-broken.h | 95 ------ include/asm-generic/dma-mapping-common.h | 358 ------------------- include/linux/dma-attrs.h | 10 - include/linux/dma-mapping.h | 379 ++++++++++++++++++++- 70 files changed, 369 insertions(+), 633 deletions(-) delete mode 100644 Documentation/features/io/dma_map_attrs/arch-support.txt delete mode 100644 include/asm-generic/dma-mapping-broken.h delete mode 100644 include/asm-generic/dma-mapping-common.h (limited to 'include') diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index d69b3fc64e14..781024ef9050 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt @@ -951,16 +951,6 @@ to "Closing". alignment constraints (e.g. the alignment constraints about 64-bit objects). 
-3) Supporting multiple types of IOMMUs - - If your architecture needs to support multiple types of IOMMUs, you - can use include/linux/asm-generic/dma-mapping-common.h. It's a - library to support the DMA API with multiple types of IOMMUs. Lots - of architectures (x86, powerpc, sh, alpha, ia64, microblaze and - sparc) use it. Choose one to see how it can be used. If you need to - support multiple types of IOMMUs in a single system, the example of - x86 or powerpc helps. - Closing This document, and the API itself, would not be in its current diff --git a/Documentation/features/io/dma_map_attrs/arch-support.txt b/Documentation/features/io/dma_map_attrs/arch-support.txt deleted file mode 100644 index 51d0f1c02a3e..000000000000 --- a/Documentation/features/io/dma_map_attrs/arch-support.txt +++ /dev/null @@ -1,40 +0,0 @@ -# -# Feature name: dma_map_attrs -# Kconfig: HAVE_DMA_ATTRS -# description: arch provides dma_*map*_attrs() APIs -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | ok | - | arc: | TODO | - | arm: | ok | - | arm64: | ok | - | avr32: | TODO | - | blackfin: | TODO | - | c6x: | TODO | - | cris: | TODO | - | frv: | TODO | - | h8300: | ok | - | hexagon: | ok | - | ia64: | ok | - | m32r: | TODO | - | m68k: | TODO | - | metag: | TODO | - | microblaze: | ok | - | mips: | ok | - | mn10300: | TODO | - | nios2: | TODO | - | openrisc: | ok | - | parisc: | TODO | - | powerpc: | ok | - | s390: | ok | - | score: | TODO | - | sh: | ok | - | sparc: | ok | - | tile: | ok | - | um: | TODO | - | unicore32: | ok | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/arch/Kconfig b/arch/Kconfig index 51c03efb4083..f6b649d88ec8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -205,9 +205,6 @@ config HAVE_NMI_WATCHDOG config HAVE_ARCH_TRACEHOOK bool -config HAVE_DMA_ATTRS - bool - config HAVE_DMA_CONTIGUOUS bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index f515a4dbf7a0..9d8a85801ed1 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -9,7 +9,6 @@ config ALPHA select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS - select HAVE_DMA_ATTRS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select AUTO_IRQ_AFFINITY if SMP diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 72a8ca7796d9..3c3451f58ff4 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -10,8 +10,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dma_ops; } -#include - #define dma_cache_sync(dev, va, size, dir) ((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8150c2783583..76dde9db7934 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -38,7 +38,6 @@ config ARC select OF_EARLY_FLATTREE select PERF_USE_VMALLOC select HAVE_DEBUG_STACKOVERFLOW - select HAVE_DMA_ATTRS config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 2a617f9c1e92..660205414f1d 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -18,6 +18,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &arc_dma_ops; } -#include - #endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6a889afa6a2c..52311774e18e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -47,7 +47,6 @@ config ARM select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select 
HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index ccb3aa64640d..6ad1ceda62a5 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -41,13 +41,6 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) #define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *dev, u64 mask); -/* - * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent - * implementations, we don't provide a dma_cache_sync function so drivers using - * this API are highlighted with build warnings. - */ -#include - #ifdef __arch_page_to_dma #error Please update to __arch_pfn_to_dma #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6be3fa2310ee..8cc62289a63e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -64,7 +64,6 @@ config ARM64 select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 61e08f360e31..ba437f090a74 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -64,8 +64,6 @@ static inline bool is_device_dma_coherent(struct device *dev) return dev->archdata.dma_coherent; } -#include - static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return (dma_addr_t)paddr; diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index aac3d6972c30..b6878eb64884 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -7,7 +7,6 @@ config AVR32 select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS - select HAVE_DMA_ATTRS select GENERIC_IRQ_PROBE select GENERIC_ATOMIC64 select HARDIRQS_SW_RESEND diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h index 0239ca84eb41..1115f2a645d1 100644 --- a/arch/avr32/include/asm/dma-mapping.h +++ b/arch/avr32/include/asm/dma-mapping.h @@ -11,6 +11,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &avr32_dma_ops; } -#include - #endif /* __ASM_AVR32_DMA_MAPPING_H */ diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 4be2f905198d..af76634f8d98 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -14,7 +14,6 @@ config BLACKFIN def_bool y select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_DMA_ATTRS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h index ea5a2e82db7c..3490570aaa82 100644 --- a/arch/blackfin/include/asm/dma-mapping.h +++ b/arch/blackfin/include/asm/dma-mapping.h @@ -43,6 +43,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &bfin_dma_ops; } -#include - #endif /* _BLACKFIN_DMA_MAPPING_H */ diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 8602f725e270..79049d432d3c 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -18,7 +18,6 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config MMU def_bool n diff --git a/arch/c6x/include/asm/dma-mapping.h 
b/arch/c6x/include/asm/dma-mapping.h index f881e425d442..6b5cd7b0cf32 100644 --- a/arch/c6x/include/asm/dma-mapping.h +++ b/arch/c6x/include/asm/dma-mapping.h @@ -24,8 +24,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &c6x_dma_ops; } -#include - extern void coherent_mem_init(u32 start, u32 size); void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs); diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 20d919c93c7f..e086f9e93728 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -54,7 +54,6 @@ config CRIS select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS - select HAVE_DMA_ATTRS select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_IOMAP diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h index 34e7c7c7eccb..5a370178a0e9 100644 --- a/arch/cris/include/asm/dma-mapping.h +++ b/arch/cris/include/asm/dma-mapping.h @@ -16,8 +16,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) } #endif -#include - static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index e3837814f593..eefd9a4ed156 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -16,7 +16,6 @@ config FRV select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config ZONE_DMA bool diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h index 750951cbba88..9a82bfa4303b 100644 --- a/arch/frv/include/asm/dma-mapping.h +++ b/arch/frv/include/asm/dma-mapping.h @@ -21,6 +21,4 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -#include - #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 2e20333cbce9..8c7c82586da0 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -15,7 +15,6 @@ config H8300 select OF_IRQ select OF_EARLY_FLATTREE select HAVE_MEMBLOCK - select HAVE_DMA_ATTRS select CLKSRC_OF select H8300_TMR8 diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index d9b5b806afe6..7ac7fadffed0 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -8,6 +8,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &h8300_dma_map_ops; } -#include - #endif diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 4dc89d1f9c48..57298e7b4867 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -27,7 +27,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select HAVE_DMA_ATTRS ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. 
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index 268fde8a4575..aa6203464520 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -49,8 +49,6 @@ extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle); extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index eb0249e37981..fb0515eb639b 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -25,7 +25,6 @@ config IA64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER - select HAVE_DMA_ATTRS select TTY select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 9beccf8010bd..d472805edfa9 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -25,8 +25,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #define get_dma_ops(dev) platform_dma_get_ops(dev) -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index d5d75b3154a1..498b567f007b 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -23,7 +23,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select HAVE_DMA_ATTRS config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h index 2c082a63af35..96c536194287 100644 --- a/arch/m68k/include/asm/dma-mapping.h +++ b/arch/m68k/include/asm/dma-mapping.h @@ -8,8 +8,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &m68k_dma_ops; } -#include - static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) { diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index ad8604c2d9f6..a0fa88da3e31 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -29,7 +29,6 @@ config METAG select OF select OF_EARLY_FLATTREE select SPARSE_IRQ - select HAVE_DMA_ATTRS config STACKTRACE_SUPPORT def_bool y diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h index 768f2e30236d..27af5d479ce6 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -8,8 +8,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &metag_dma_ops; } -#include - /* * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to * do any flushing here. 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5ecd0287a874..53b69deceb99 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -19,7 +19,6 @@ config MICROBLAZE select HAVE_ARCH_KGDB select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h index 24b12970c9cf..1884783d15c0 100644 --- a/arch/microblaze/include/asm/dma-mapping.h +++ b/arch/microblaze/include/asm/dma-mapping.h @@ -44,8 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &dma_direct_ops; } -#include - static inline void __dma_sync(unsigned long paddr, size_t size, enum dma_data_direction direction) { diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 71683a853372..fbf3f6670b69 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -31,7 +31,6 @@ config MIPS select RTC_LIB if !MACH_LOONGSON64 select GENERIC_ATOMIC64 if !64BIT select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DMA_API_DEBUG select GENERIC_IRQ_PROBE diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index e604f760c4a0..12fa79e2f1b4 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -29,8 +29,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline void dma_mark_clean(void *addr, size_t size) {} -#include - extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index e8ebf78f6d21..10607f0d2bcd 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -15,7 +15,6 @@ config MN10300 select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config AM33_2 def_bool n diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h index e69b0130335c..1dcd44757f32 100644 --- a/arch/mn10300/include/asm/dma-mapping.h +++ b/arch/mn10300/include/asm/dma-mapping.h @@ -28,6 +28,4 @@ void dma_cache_sync(void *vaddr, size_t size, mn10300_dcache_flush_inv(); } -#include - #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4b2504d28178..437555424bda 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,7 +16,6 @@ config NIOS2 select SOC_BUS select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT - select HAVE_DMA_ATTRS config GENERIC_CSUM def_bool y diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 443f44de1020..e118c02cc79a 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -29,9 +29,6 @@ config OPENRISC config MMU def_bool y -config HAVE_DMA_ATTRS - def_bool y - config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h index 413bfcf86384..1f260bccb368 100644 --- a/arch/openrisc/include/asm/dma-mapping.h +++ b/arch/openrisc/include/asm/dma-mapping.h @@ -42,6 +42,4 @@ static inline int dma_supported(struct device *dev, u64 dma_mask) return dma_mask == DMA_BIT_MASK(32); } -#include - #endif /* __ASM_OPENRISC_DMA_MAPPING_H */ diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 1489351134fa..14f655cf542e 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -30,7 +30,6 @@ config PARISC 
select HAVE_DEBUG_STACKOVERFLOW select HAVE_ARCH_AUDITSYSCALL select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 4de518647612..16e024602737 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -83,6 +83,4 @@ struct parisc_device; void * sba_get_iommu(struct parisc_device *dev); #endif -#include - #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8310be4ffe31..e4824fd04bb7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -108,7 +108,6 @@ config PPC select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 7f522c021dc3..77816acd4fd9 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -125,8 +125,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off) #define HAVE_ARCH_DMA_SET_MASK 1 extern int dma_set_mask(struct device *dev, u64 dma_mask); -#include - extern int __dma_set_mask(struct device *dev, u64 dma_mask); extern u64 __dma_get_required_mask(struct device *dev); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index dbeeb3a049f2..3be9c832dec1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -579,7 +579,6 @@ config QDIO menuconfig PCI bool "PCI support" - select HAVE_DMA_ATTRS select PCI_MSI select IOMMU_SUPPORT help diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h index b3fd54d93dd2..e64bfcb9702f 100644 --- a/arch/s390/include/asm/dma-mapping.h +++ b/arch/s390/include/asm/dma-mapping.h @@ -23,8 +23,6 @@ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, { } -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6c391a5d3e5c..e13da05505dc 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -11,7 +11,6 @@ config SUPERH select HAVE_GENERIC_DMA_COHERENT select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h index a3745a3fe029..e11cf0c8206b 100644 --- a/arch/sh/include/asm/dma-mapping.h +++ b/arch/sh/include/asm/dma-mapping.h @@ -11,8 +11,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #define DMA_ERROR_CODE 0 -#include - void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3203e42190dd..57ffaf285c2f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -26,7 +26,6 @@ config SPARC select RTC_CLASS select RTC_DRV_M48T59 select RTC_SYSTOHC - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_ARCH_JUMP_LABEL if SPARC64 select GENERIC_IRQ_SHOW diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 2777092dd851..1180ae254154 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -37,6 +37,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dma_ops; } 
-#include - #endif diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 6bfbe8b71e7e..de4a4fff9323 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -5,7 +5,6 @@ config TILE def_bool y select HAVE_PERF_EVENTS select USE_PMC if PERF_EVENTS - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index c342736e3f1f..01ceb4a895b0 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -73,9 +73,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) } #define HAVE_ARCH_DMA_SET_MASK 1 - -#include - int dma_set_mask(struct device *dev, u64 mask); /* diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 877342640b6e..e5602ee9c610 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -5,7 +5,6 @@ config UNICORE32 select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_MEMBLOCK select HAVE_GENERIC_DMA_COHERENT - select HAVE_DMA_ATTRS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select GENERIC_ATOMIC64 diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h index 8140e053ccd3..4749854afd03 100644 --- a/arch/unicore32/include/asm/dma-mapping.h +++ b/arch/unicore32/include/asm/dma-mapping.h @@ -28,8 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &swiotlb_dma_map_ops; } -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (dev && dev->dma_mask) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 92b2a73162ee..89159a6fa503 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,7 +100,6 @@ config X86 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 953b7263f844..3a27b93e6261 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -#include - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 82044f732323..e9df1567d778 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,7 +15,6 @@ config XTENSA select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index 66c9ba261e30..87b7a7dfbcf3 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -30,8 +30,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &xtensa_dma_map_ops; } -#include - void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 59babd5a5396..8ae7ab68cb97 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -82,13 +82,13 @@ config DRM_TTM config DRM_GEM_CMA_HELPER bool - 
depends on DRM && HAVE_DMA_ATTRS + depends on DRM help Choose this if you need the GEM CMA helper functions config DRM_KMS_CMA_HELPER bool - depends on DRM && HAVE_DMA_ATTRS + depends on DRM select DRM_GEM_CMA_HELPER select DRM_KMS_FB_HELPER select FB_SYS_FILLRECT diff --git a/drivers/gpu/drm/imx/Kconfig b/drivers/gpu/drm/imx/Kconfig index 35ca4f007839..a1844b50546c 100644 --- a/drivers/gpu/drm/imx/Kconfig +++ b/drivers/gpu/drm/imx/Kconfig @@ -5,7 +5,7 @@ config DRM_IMX select VIDEOMODE_HELPERS select DRM_GEM_CMA_HELPER select DRM_KMS_CMA_HELPER - depends on DRM && (ARCH_MXC || ARCH_MULTIPLATFORM) && HAVE_DMA_ATTRS + depends on DRM && (ARCH_MXC || ARCH_MULTIPLATFORM) depends on IMX_IPUV3_CORE help enable i.MX graphics support diff --git a/drivers/gpu/drm/rcar-du/Kconfig b/drivers/gpu/drm/rcar-du/Kconfig index d4e0a39568f6..96dcd4a78951 100644 --- a/drivers/gpu/drm/rcar-du/Kconfig +++ b/drivers/gpu/drm/rcar-du/Kconfig @@ -1,6 +1,6 @@ config DRM_RCAR_DU tristate "DRM Support for R-Car Display Unit" - depends on DRM && ARM && HAVE_DMA_ATTRS && OF + depends on DRM && ARM && OF depends on ARCH_SHMOBILE || COMPILE_TEST select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER diff --git a/drivers/gpu/drm/shmobile/Kconfig b/drivers/gpu/drm/shmobile/Kconfig index b9202aa6f8ab..8d17d00ddb4b 100644 --- a/drivers/gpu/drm/shmobile/Kconfig +++ b/drivers/gpu/drm/shmobile/Kconfig @@ -1,6 +1,6 @@ config DRM_SHMOBILE tristate "DRM Support for SH Mobile" - depends on DRM && ARM && HAVE_DMA_ATTRS + depends on DRM && ARM depends on ARCH_SHMOBILE || COMPILE_TEST depends on FB_SH_MOBILE_MERAM || !FB_SH_MOBILE_MERAM select BACKLIGHT_CLASS_DEVICE diff --git a/drivers/gpu/drm/sti/Kconfig b/drivers/gpu/drm/sti/Kconfig index 10c1b1926e6f..5ad43a1bb260 100644 --- a/drivers/gpu/drm/sti/Kconfig +++ b/drivers/gpu/drm/sti/Kconfig @@ -1,6 +1,6 @@ config DRM_STI tristate "DRM Support for STMicroelectronics SoC stiH41x Series" - depends on DRM && (SOC_STIH415 || SOC_STIH416 || ARCH_MULTIPLATFORM) && HAVE_DMA_ATTRS + depends on DRM && (SOC_STIH415 || SOC_STIH416 || ARCH_MULTIPLATFORM) select RESET_CONTROLLER select DRM_KMS_HELPER select DRM_GEM_CMA_HELPER diff --git a/drivers/gpu/drm/tilcdc/Kconfig b/drivers/gpu/drm/tilcdc/Kconfig index 78beafb0742c..f60a1ec84fa4 100644 --- a/drivers/gpu/drm/tilcdc/Kconfig +++ b/drivers/gpu/drm/tilcdc/Kconfig @@ -1,6 +1,6 @@ config DRM_TILCDC tristate "DRM Support for TI LCDC Display Controller" - depends on DRM && OF && ARM && HAVE_DMA_ATTRS + depends on DRM && OF && ARM select DRM_KMS_HELPER select DRM_KMS_FB_HELPER select DRM_KMS_CMA_HELPER diff --git a/drivers/gpu/drm/vc4/Kconfig b/drivers/gpu/drm/vc4/Kconfig index 2d7d115ddf3f..584810474e5b 100644 --- a/drivers/gpu/drm/vc4/Kconfig +++ b/drivers/gpu/drm/vc4/Kconfig @@ -1,7 +1,7 @@ config DRM_VC4 tristate "Broadcom VC4 Graphics" depends on ARCH_BCM2835 || COMPILE_TEST - depends on DRM && HAVE_DMA_ATTRS + depends on DRM select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER select DRM_GEM_CMA_HELPER diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 0c53805dff0e..526359447ff9 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -216,7 +216,6 @@ config VIDEO_STI_BDISP tristate "STMicroelectronics BDISP 2D blitter driver" depends on VIDEO_DEV && VIDEO_V4L2 depends on ARCH_STI || COMPILE_TEST - depends on HAVE_DMA_ATTRS select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV help diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h deleted file mode 100644 
index 6c32af918c2f..000000000000 --- a/include/asm-generic/dma-mapping-broken.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef _ASM_GENERIC_DMA_MAPPING_H -#define _ASM_GENERIC_DMA_MAPPING_H - -/* define the dma api to allow compilation but not linking of - * dma dependent code. Code that depends on the dma-mapping - * API needs to set 'depends on HAS_DMA' in its Kconfig - */ - -struct scatterlist; - -extern void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag); - -extern void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle); - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - return dma_alloc_coherent(dev, size, dma_handle, flag); -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - dma_free_coherent(dev, size, cpu_addr, dma_handle); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -extern dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction); - -extern void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction); - -extern int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction); - -extern void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction); - -extern dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction); - -extern void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction); - -#define dma_sync_single_for_device dma_sync_single_for_cpu -#define dma_sync_single_range_for_device dma_sync_single_range_for_cpu -#define dma_sync_sg_for_device dma_sync_sg_for_cpu - -extern int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr); - -extern int -dma_supported(struct device *dev, u64 mask); - -extern int -dma_set_mask(struct device *dev, u64 mask); - -extern int -dma_get_cache_alignment(void); - -extern void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction); - -#endif /* _ASM_GENERIC_DMA_MAPPING_H */ diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h deleted file mode 100644 index b1bc954eccf3..000000000000 --- a/include/asm-generic/dma-mapping-common.h +++ /dev/null @@ -1,358 +0,0 @@ -#ifndef _ASM_GENERIC_DMA_MAPPING_H -#define _ASM_GENERIC_DMA_MAPPING_H - -#include -#include -#include -#include -#include -#include - -static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, - size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct 
dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - kmemcheck_mark_initialized(ptr, size); - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, attrs); - debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, addr, true); - return addr; -} - -static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir, true); -} - -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. - */ -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; - - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); - BUG_ON(!valid_dma_direction(dir)); - ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); - - return ents; -} - -static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(dev, sg, nents, dir); - if (ops->unmap_sg) - ops->unmap_sg(dev, sg, nents, dir, attrs); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - kmemcheck_mark_initialized(page_address(page) + offset, size); - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, page, offset, size, dir, NULL); - debug_dma_map_page(dev, page, offset, size, dir, addr, false); - - return addr; -} - -static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, NULL); - debug_dma_unmap_page(dev, addr, size, dir, false); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr, size, dir); - debug_dma_sync_single_for_cpu(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr, size, dir); - debug_dma_sync_single_for_device(dev, addr, size, dir); -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr + offset, size, 
dir); - debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(dev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_device) - ops->sync_sg_for_device(dev, sg, nelems, dir); - debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - -} - -#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) -#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) -#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) -#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) - -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, - pgprot_t prot, const void *caller); - -void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller); -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); - -/** - * dma_mmap_attrs - map a coherent DMA allocation into user space - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @vma: vm_area_struct describing requested user mapping - * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs - * @handle: device-view address returned from dma_alloc_attrs - * @size: size of memory originally requested in dma_alloc_attrs - * @attrs: attributes of mapping properties requested in dma_alloc_attrs - * - * Map a coherent DMA buffer previously allocated by dma_alloc_attrs - * into user space. The coherent DMA buffer must not be freed by the - * driver until the user space mapping has been released. 
- */ -static inline int -dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->mmap) - return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); -} - -#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) - -int -dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -static inline int -dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, - dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->get_sgtable) - return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); -} - -#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) - -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev, flag) (true) -#endif - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *cpu_addr; - - BUG_ON(!ops); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) - return cpu_addr; - - if (!arch_dma_alloc_attrs(&dev, &flag)) - return NULL; - if (!ops->alloc) - return NULL; - - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); - return cpu_addr; -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!ops); - WARN_ON(irqs_disabled()); - - if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) - return; - - if (!ops->free) - return; - - debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - ops->free(dev, size, cpu_addr, dma_handle, attrs); -} - -static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - return dma_alloc_attrs(dev, size, dma_handle, flag, NULL); -} - -static inline void dma_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL); -} - -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - DEFINE_DMA_ATTRS(attrs); - - dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); - return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs); -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - DEFINE_DMA_ATTRS(attrs); - - dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); - dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - debug_dma_mapping_error(dev, dma_addr); - - if (get_dma_ops(dev)->mapping_error) - return get_dma_ops(dev)->mapping_error(dev, dma_addr); - -#ifdef DMA_ERROR_CODE - return dma_addr == DMA_ERROR_CODE; -#else - return 0; -#endif -} - -#ifndef HAVE_ARCH_DMA_SUPPORTED -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - if (!ops) - return 0; - if 
(!ops->dma_supported) - return 1; - return ops->dma_supported(dev, mask); -} -#endif - -#ifndef HAVE_ARCH_DMA_SET_MASK -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - if (ops->set_dma_mask) - return ops->set_dma_mask(dev, mask); - - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; -} -#endif - -#endif diff --git a/include/linux/dma-attrs.h b/include/linux/dma-attrs.h index c8e1831d7572..99c0be00b47c 100644 --- a/include/linux/dma-attrs.h +++ b/include/linux/dma-attrs.h @@ -41,7 +41,6 @@ static inline void init_dma_attrs(struct dma_attrs *attrs) bitmap_zero(attrs->flags, __DMA_ATTRS_LONGS); } -#ifdef CONFIG_HAVE_DMA_ATTRS /** * dma_set_attr - set a specific attribute * @attr: attribute to set @@ -67,14 +66,5 @@ static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) BUG_ON(attr >= DMA_ATTR_MAX); return test_bit(attr, attrs->flags); } -#else /* !CONFIG_HAVE_DMA_ATTRS */ -static inline void dma_set_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ -} -static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ - return 0; -} -#endif /* CONFIG_HAVE_DMA_ATTRS */ #endif /* _DMA_ATTR_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2e551e2d2d03..cc0517b71c5e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -6,8 +6,12 @@ #include #include #include +#include #include #include +#include +#include +#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -86,7 +90,363 @@ static inline int is_device_dma_capable(struct device *dev) #ifdef CONFIG_HAS_DMA #include #else -#include +/* + * Define the dma api to allow compilation but not linking of + * dma dependent code. Code that depends on the dma-mapping + * API needs to set 'depends on HAS_DMA' in its Kconfig + */ +extern struct dma_map_ops bad_dma_ops; +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + return &bad_dma_ops; +} +#endif + +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(ptr, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, attrs); + debug_dma_map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, addr, true); + return addr; +} + +static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, true); +} + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. 
+ */ +static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + int i, ents; + struct scatterlist *s; + + for_each_sg(sg, s, nents, i) + kmemcheck_mark_initialized(sg_virt(s), s->length); + BUG_ON(!valid_dma_direction(dir)); + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} + +static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} + +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(page_address(page) + offset, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, page, offset, size, dir, NULL); + debug_dma_map_page(dev, page, offset, size, dir, addr, false); + + return addr; +} + +static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, NULL); + debug_dma_unmap_page(dev, addr, size, dir, false); +} + +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void dma_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} + +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); +} + +static inline void dma_sync_single_range_for_device(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, 
nelems, dir); +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); + +} + +#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) +#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) +#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) +#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) + +extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller); + +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller); +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); + +/** + * dma_mmap_attrs - map a coherent DMA allocation into user space + * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices + * @vma: vm_area_struct describing requested user mapping + * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs + * @handle: device-view address returned from dma_alloc_attrs + * @size: size of memory originally requested in dma_alloc_attrs + * @attrs: attributes of mapping properties requested in dma_alloc_attrs + * + * Map a coherent DMA buffer previously allocated by dma_alloc_attrs + * into user space. The coherent DMA buffer must not be freed by the + * driver until the user space mapping has been released. 
+ */ +static inline int +dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->mmap) + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); +} + +#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) + +int +dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +static inline int +dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->get_sgtable) + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, + attrs); + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); +} + +#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) + +#ifndef arch_dma_alloc_attrs +#define arch_dma_alloc_attrs(dev, flag) (true) +#endif + +static inline void *dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; + + BUG_ON(!ops); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) + return cpu_addr; + + if (!arch_dma_alloc_attrs(&dev, &flag)) + return NULL; + if (!ops->alloc) + return NULL; + + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + return cpu_addr; +} + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!ops); + WARN_ON(irqs_disabled()); + + if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) + return; + + if (!ops->free) + return; + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + ops->free(dev, size, cpu_addr, dma_handle, attrs); +} + +static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return dma_alloc_attrs(dev, size, dma_handle, flag, NULL); +} + +static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL); +} + +static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs); +} + +static inline void dma_free_noncoherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); +} + +static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + debug_dma_mapping_error(dev, dma_addr); + + if (get_dma_ops(dev)->mapping_error) + return get_dma_ops(dev)->mapping_error(dev, dma_addr); + +#ifdef DMA_ERROR_CODE + return dma_addr == DMA_ERROR_CODE; +#else + return 0; +#endif +} + +#ifndef HAVE_ARCH_DMA_SUPPORTED +static inline int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops) + return 0; + if 
(!ops->dma_supported) + return 1; + return ops->dma_supported(dev, mask); +} +#endif + +#ifndef HAVE_ARCH_DMA_SET_MASK +static inline int dma_set_mask(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->set_dma_mask) + return ops->set_dma_mask(dev, mask); + + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + *dev->dma_mask = mask; + return 0; +} #endif static inline u64 dma_get_mask(struct device *dev) @@ -259,22 +619,6 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ -#ifndef CONFIG_HAVE_DMA_ATTRS -struct dma_attrs; - -#define dma_map_single_attrs(dev, cpu_addr, size, dir, attrs) \ - dma_map_single(dev, cpu_addr, size, dir) - -#define dma_unmap_single_attrs(dev, dma_addr, size, dir, attrs) \ - dma_unmap_single(dev, dma_addr, size, dir) - -#define dma_map_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_map_sg(dev, sgl, nents, dir) - -#define dma_unmap_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_unmap_sg(dev, sgl, nents, dir) - -#else static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { @@ -300,7 +644,6 @@ static inline int dma_mmap_writecombine(struct device *dev, dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); } -#endif /* CONFIG_HAVE_DMA_ATTRS */ #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME -- cgit v1.2.3 From 20d666e41166f8023ff3d960e832d87ded18c5c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:09 -0800 Subject: dma-mapping: remove <asm-generic/dma-coherent.h> This wasn't an asm-generic header to start with, and can be merged into dma-mapping.h trivially. Signed-off-by: Christoph Hellwig Cc: "David S.
Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/dma-mapping.h | 2 -- drivers/base/dma-mapping.c | 3 +-- include/asm-generic/dma-coherent.h | 32 -------------------------------- include/linux/dma-mapping.h | 34 ++++++++++++++++++++++++++++------ 4 files changed, 29 insertions(+), 42 deletions(-) delete mode 100644 include/asm-generic/dma-coherent.h (limited to 'include') diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index 87b7a7dfbcf3..3fc1170a6488 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -13,8 +13,6 @@ #include #include -#include - #include #include diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 381e39d5204e..d799662f19eb 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -12,7 +12,6 @@ #include #include #include -#include /* * Managed DMA API @@ -167,7 +166,7 @@ void dmam_free_noncoherent(struct device *dev, size_t size, void *vaddr, } EXPORT_SYMBOL(dmam_free_noncoherent); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT static void dmam_coherent_decl_release(struct device *dev, void *res) { diff --git a/include/asm-generic/dma-coherent.h b/include/asm-generic/dma-coherent.h deleted file mode 100644 index 0297e5875798..000000000000 --- a/include/asm-generic/dma-coherent.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef DMA_COHERENT_H -#define DMA_COHERENT_H - -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT -/* - * These three functions are only for dma allocator. - * Don't use them in device drivers. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret); -int dma_release_from_coherent(struct device *dev, int order, void *vaddr); - -int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, size_t size, int *ret); -/* - * Standard interface - */ -#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY -int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags); - -void dma_release_declared_memory(struct device *dev); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size); -#else -#define dma_alloc_from_coherent(dev, size, handle, ret) (0) -#define dma_release_from_coherent(dev, order, vaddr) (0) -#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) -#endif - -#endif diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index cc0517b71c5e..d6b575bb45a7 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -11,7 +11,6 @@ #include #include #include -#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -87,6 +86,23 @@ static inline int is_device_dma_capable(struct device *dev) return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +/* + * These three functions are only for dma allocator. + * Don't use them in device drivers. 
+ */ +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret); +int dma_release_from_coherent(struct device *dev, int order, void *vaddr); + +int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, size_t size, int *ret); +#else +#define dma_alloc_from_coherent(dev, size, handle, ret) (0) +#define dma_release_from_coherent(dev, order, vaddr) (0) +#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ + #ifdef CONFIG_HAS_DMA #include #else @@ -568,7 +584,13 @@ static inline int dma_get_cache_alignment(void) #define DMA_MEMORY_INCLUDES_CHILDREN 0x04 #define DMA_MEMORY_EXCLUSIVE 0x08 -#ifndef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags); +void dma_release_declared_memory(struct device *dev); +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size); +#else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) @@ -587,7 +609,7 @@ dma_mark_declared_memory_occupied(struct device *dev, { return ERR_PTR(-EBUSY); } -#endif +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ /* * Managed DMA API @@ -600,13 +622,13 @@ extern void *dmam_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dmam_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); extern void dmam_release_declared_memory(struct device *dev); -#else /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, gfp_t gfp) @@ -617,7 +639,7 @@ static inline int dmam_declare_coherent_memory(struct device *dev, static inline void dmam_release_declared_memory(struct device *dev) { } -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) -- cgit v1.2.3 From 8e99469ab0f821bea77625cd4775ca529d4ca7d4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:02:12 -0800 Subject: dma-mapping: use offset_in_page macro Use offset_in_page macro instead of (addr & ~PAGE_MASK). 
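As a minimal sketch of the substitution (the wrapper and the kbuf name are hypothetical; offset_in_page() itself is the existing helper from linux/mm.h):

    #include <linux/mm.h>	/* offset_in_page(), PAGE_MASK */

    static unsigned long kbuf_page_offset(const void *kbuf)
    {
    	/* open-coded form this patch replaces */
    	unsigned long off = (unsigned long)kbuf & ~PAGE_MASK;

    	/* equivalent, self-describing helper */
    	BUG_ON(off != offset_in_page(kbuf));
    	return offset_in_page(kbuf);
    }

Both expressions yield the offset of kbuf within its page; the macro just names the intent.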
Signed-off-by: Geliang Tang Acked-by: Will Deacon Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d6b575bb45a7..75857cda38e9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -129,10 +129,10 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, addr, true); return addr; } -- cgit v1.2.3 From 6d378dac7c4905db38f8127c4e618f0f627a4ced Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:18 -0800 Subject: mm: memcontrol: drop unused @css argument in memcg_init_kmem This series adds accounting of the historical "kmem" memory consumers to the cgroup2 memory controller. These consumers include the dentry cache, the inode cache, kernel stack pages, and a few others that are pointed out in patch 7/8. The footprint of these consumers is directly tied to userspace activity in common workloads, and so they have to be part of the minimally viable configuration in order to present a complete feature to our users. The cgroup2 interface of the memory controller is far from complete, but this series, along with the socket memory accounting series, provides the final semantic changes for the existing memory knobs in the cgroup2 interface, which is scheduled for initial release in the next merge window. 
This patch (of 8): Remove unused css argument from memcg_init_kmem(). Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/tcp_memcontrol.h | 3 ++- mm/memcontrol.c | 6 +++--- net/ipv4/tcp_memcontrol.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 01ff7c6efada..020c2dee65e8 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -4,6 +4,7 @@ struct cgroup_subsys; struct mem_cgroup; -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss); +int tcp_init_cgroup(struct mem_cgroup *memcg); void tcp_destroy_cgroup(struct mem_cgroup *memcg); + #endif /* _TCP_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0eda67376df4..f21f29cd14f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3583,7 +3583,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg) { int ret; @@ -3591,7 +3591,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return tcp_init_cgroup(memcg, ss); + return tcp_init_cgroup(memcg); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -4274,7 +4274,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + ret = memcg_init_kmem(memcg); if (ret) return ret; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 18bc7f745e9c..133eb5eac49f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,7 +6,7 @@ #include #include -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +int tcp_init_cgroup(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct page_counter *counter_parent = NULL; -- cgit v1.2.3 From 567e9ab2e614e55feca20e8bcb54b629e9cc1a3b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:24 -0800 Subject: mm: memcontrol: give the kmem states more descriptive names On any given memcg, the kmem accounting feature has three separate states: not initialized, structures allocated, and actively accounting slab memory. These are represented through a combination of the kmem_acct_activated and kmem_acct_active flags, which is confusing. Convert to a kmem_state enum with the states NONE, ALLOCATED, and ONLINE. Then rename the functions to modify the state accordingly. This follows the nomenclature of css object states more closely.
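As a sketch, the test for an actively-accounting group turns from a two-flag encoding into a single state comparison (the enum and helper below mirror the diff that follows):

    enum memcg_kmem_state {
    	KMEM_NONE,	/* not initialized */
    	KMEM_ALLOCATED,	/* structures allocated */
    	KMEM_ONLINE,	/* actively accounting */
    };

    /* replaces memcg->kmem_acct_activated + memcg->kmem_acct_active */
    static inline bool memcg_kmem_online(struct mem_cgroup *memcg)
    {
    	return memcg->kmem_state == KMEM_ONLINE;
    }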
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++++++++----- mm/memcontrol.c | 52 ++++++++++++++++++++++------------------------ mm/slab_common.c | 4 ++-- mm/vmscan.c | 2 +- 4 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 189f04d4d2ec..54dab4d43e6d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -152,6 +152,12 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +enum memcg_kmem_state { + KMEM_NONE, + KMEM_ALLOCATED, + KMEM_ONLINE, +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -233,8 +239,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; - bool kmem_acct_activated; - bool kmem_acct_active; + enum memcg_kmem_state kmem_state; #endif int last_scanned_node; @@ -750,9 +755,9 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { - return memcg->kmem_acct_active; + return memcg->kmem_state == KMEM_ONLINE; } /* @@ -850,7 +855,7 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71dced17b16d..24b6bded13f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2378,7 +2378,7 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) return 0; if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) @@ -2861,14 +2861,13 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg) +static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_acct_activated); - BUG_ON(memcg->kmem_acct_active); + BUG_ON(memcg->kmem_state); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -2898,14 +2897,13 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg) static_branch_inc(&memcg_kmem_enabled_key); /* - * A memory cgroup is considered kmem-active as soon as it gets + * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. 
*/ memcg->kmemcg_id = memcg_id; - memcg->kmem_acct_activated = true; - memcg->kmem_acct_active = true; + memcg->kmem_state = KMEM_ONLINE; out: return err; } @@ -2917,8 +2915,8 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_is_active(memcg)) { - ret = memcg_activate_kmem(memcg); + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); if (ret) goto out; } @@ -2938,11 +2936,12 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) mutex_lock(&memcg_limit_mutex); /* - * If the parent cgroup is not kmem-active now, it cannot be activated - * after this point, because it has at least one child already. + * If the parent cgroup is not kmem-online now, it cannot be + * onlined after this point, because it has at least one child + * already. */ - if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg); + if (memcg_kmem_online(parent)) + ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } @@ -3590,22 +3589,21 @@ static int memcg_init_kmem(struct mem_cgroup *memcg) return tcp_init_cgroup(memcg); } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; struct mem_cgroup *parent, *child; int kmemcg_id; - if (!memcg->kmem_acct_active) + if (memcg->kmem_state != KMEM_ONLINE) return; - /* - * Clear the 'active' flag before clearing memcg_caches arrays entries. - * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it - * guarantees no cache will be created for this cgroup after we are - * done (see memcg_create_kmem_cache()). + * Clear the online state before clearing memcg_caches array + * entries. The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). 
*/ - memcg->kmem_acct_active = false; + memcg->kmem_state = KMEM_ALLOCATED; memcg_deactivate_kmem_caches(memcg); @@ -3636,9 +3634,9 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) memcg_free_cache_id(kmemcg_id); } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { - if (memcg->kmem_acct_activated) { + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); @@ -3651,11 +3649,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { } #endif @@ -4308,7 +4306,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); - memcg_deactivate_kmem(memcg); + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); } @@ -4324,7 +4322,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_destroy_kmem(memcg); + memcg_free_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/slab_common.c b/mm/slab_common.c index e016178063e1..8c262e6dc33e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -503,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* - * The memory cgroup could have been deactivated while the cache + * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) goto out_unlock; idx = memcg_cache_id(memcg); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ac86956ff9d..05dd182f04fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -411,7 +411,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_is_active(memcg)) + if (memcg && !memcg_kmem_online(memcg)) return 0; if (nr_scanned == 0) -- cgit v1.2.3 From 127424c86bb6cb87f0b563d9fdcfbbaf3c86ecec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:32 -0800 Subject: mm: memcontrol: move kmem accounting code to CONFIG_MEMCG The cgroup2 memory controller will account important in-kernel memory consumers by default. Move all necessary components to CONFIG_MEMCG.
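The resulting guard pattern, sketched with a hypothetical helper (SLOB is excluded because it has no per-cgroup cache support):

    #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
    int memcg_charge_kmem_object(struct mem_cgroup *memcg, size_t size);
    #else
    static inline int memcg_charge_kmem_object(struct mem_cgroup *memcg,
    					   size_t size)
    {
    	return 0;	/* kmem accounting compiled out */
    }
    #endif

Code previously keyed off CONFIG_MEMCG_KMEM is now built whenever the memory controller is enabled and the slab allocator can carry per-cgroup caches.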
Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 4 +-- include/linux/memcontrol.h | 7 +++-- include/linux/sched.h | 4 +-- include/linux/slab.h | 2 +- include/linux/slab_def.h | 3 +- include/linux/slub_def.h | 2 +- mm/list_lru.c | 12 ++++---- mm/memcontrol.c | 69 +++++++++++++++++++++++++++------------------- mm/slab.h | 6 ++-- mm/slab_common.c | 10 +++---- mm/slub.c | 10 +++---- 11 files changed, 72 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 2a6b9947aaa3..cb0ba9f2a9a2 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -40,7 +40,7 @@ struct list_lru_node { spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg *memcg_lrus; #endif @@ -48,7 +48,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) struct list_head list; #endif }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 54dab4d43e6d..a87704e3668e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -236,7 +236,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif -#if defined(CONFIG_MEMCG_KMEM) +#ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; @@ -735,7 +735,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; @@ -891,5 +891,6 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 02dabf281b2f..f1e81e128592 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1476,10 +1476,10 @@ struct task_struct { unsigned in_iowait:1; #ifdef CONFIG_MEMCG unsigned memcg_may_oom:1; -#endif -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 3ffee7422012..3627d5c1bc47 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -86,7 +86,7 @@ #else # define SLAB_FAILSLAB 0x00000000UL #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) # define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ #else # define SLAB_ACCOUNT 0x00000000UL diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 33d049066c3d..cf139d3fa513 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -69,7 +69,8 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_MEMCG_KMEM + +#ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 
33885118523c..b7e57927f521 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -84,7 +84,7 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS diff --git a/mm/list_lru.c b/mm/list_lru.c index afc71ea9a381..1d05cb9d363d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru) static void list_lru_unregister(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f8219b58e0c..fe51d5e61389 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -297,7 +297,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
* The main reason for not using cgroup id for this: @@ -349,7 +349,7 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) @@ -2203,7 +2203,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_alloc_cache_id(void) { int id, size; @@ -2424,7 +2424,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2860,7 +2860,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; @@ -2908,24 +2908,6 @@ out: return err; } -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - int ret; - - mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } - ret = page_counter_limit(&memcg->kmem, limit); -out: - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static int memcg_propagate_kmem(struct mem_cgroup *memcg) { int ret = 0; @@ -3000,16 +2982,45 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* !CONFIG_SLOB */ + +#ifdef CONFIG_MEMCG_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - return -EINVAL; + int ret; + + mutex_lock(&memcg_limit_mutex); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: + mutex_unlock(&memcg_limit_mutex); + return ret; } -static void memcg_offline_kmem(struct mem_cgroup *memcg) +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long limit) { + return -EINVAL; } #endif /* CONFIG_MEMCG_KMEM */ + /* * The user of this function is... * RES_LIMIT. 
@@ -4182,7 +4193,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -4244,10 +4255,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); -#ifdef CONFIG_MEMCG_KMEM ret = memcg_propagate_kmem(memcg); if (ret) return ret; + +#ifdef CONFIG_MEMCG_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; @@ -4308,8 +4320,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) static_branch_dec(&memcg_sockets_enabled_key); #endif -#ifdef CONFIG_MEMCG_KMEM memcg_free_kmem(memcg); + +#ifdef CONFIG_MEMCG_KMEM tcp_destroy_cgroup(memcg); #endif diff --git a/mm/slab.h b/mm/slab.h index c63b8699cfa3..834ad240c0bb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -173,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -251,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page, extern void slab_init_memcg_params(struct kmem_cache *); -#else /* !CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -292,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 8c262e6dc33e..b50aef01ccf7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -128,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; @@ -221,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* * Find a mergeable slab cache @@ -477,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) } } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. 
@@ -689,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) { @@ -1123,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); diff --git a/mm/slub.c b/mm/slub.c index b21fd24b08b1..2e1355ac056b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5207,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { struct kmem_cache *c; @@ -5242,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, static void memcg_propagate_slab_attrs(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG int i; char *buffer = NULL; struct kmem_cache *root_cache; @@ -5328,7 +5328,7 @@ static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!is_root_cache(s)) return s->memcg_params.root_cache->memcg_kset; #endif @@ -5405,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { @@ -5438,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); -- cgit v1.2.3 From 489c2a20a414351fe0813a727c34600c0f7292ae Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:41 -0800 Subject: mm: memcontrol: introduce CONFIG_MEMCG_LEGACY_KMEM Let the user know that CONFIG_MEMCG_KMEM does not apply to the cgroup2 interface. This also makes legacy-only code sections stand out better. 
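A sketch of how a legacy-only section now announces itself (the wrapper is hypothetical; memcg_update_kmem_limit() is the v1 limit setter in memcontrol.c):

    static int set_kmem_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
    {
    #ifdef CONFIG_MEMCG_LEGACY_KMEM
    	/* v1-only kmem.limit_in_bytes knob */
    	return memcg_update_kmem_limit(memcg, nr_pages);
    #else
    	return -EINVAL;	/* not offered on cgroup2 */
    #endif
    }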
[arnd@arndb.de: mm: memcontrol: only manage socket pressure for CONFIG_INET] Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- init/Kconfig | 10 +++++++++- mm/memcontrol.c | 18 +++++++++--------- mm/vmpressure.c | 2 ++ net/ipv4/Makefile | 2 +- 5 files changed, 23 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a87704e3668e..2bb14d021cd0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,7 +233,7 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif #ifndef CONFIG_SLOB @@ -717,7 +717,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; #endif diff --git a/init/Kconfig b/init/Kconfig index 5b86082fa238..a0a15cec8daf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -964,10 +964,13 @@ config MEMCG_SWAP_ENABLED For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). +config MEMCG_LEGACY_KMEM + bool config MEMCG_KMEM - bool "Memory Resource Controller Kernel Memory accounting" + bool "Legacy Memory Resource Controller Kernel Memory accounting" depends on MEMCG depends on SLUB || SLAB + select MEMCG_LEGACY_KMEM help The Kernel Memory extension for Memory Resource Controller can limit the amount of memory used by kernel objects in the system. Those are @@ -1071,6 +1074,11 @@ config CGROUP_FREEZER Provides a way to freeze and unfreeze all tasks in a cgroup. + This option affects the ORIGINAL cgroup interface. The cgroup2 memory + controller includes important in-kernel memory consumers per default. + + If you're using cgroup2, say N. 
+ config CGROUP_HUGETLB bool "HugeTLB controller" depends on HUGETLB_PAGE diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2239e6dd4d4c..92e8ab67b6df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3001,7 +3001,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3025,7 +3025,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, { return -EINVAL; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG_LEGACY_KMEM */ /* @@ -4039,7 +4039,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4266,13 +4266,13 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_INET +#ifdef CONFIG_MEMCG_LEGACY_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; #endif -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4329,7 +4329,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) tcp_destroy_cgroup(memcg); #endif @@ -5558,7 +5558,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; #endif @@ -5587,7 +5587,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5619,7 +5619,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9a6c0704211c..89b1d441af4b 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,6 +275,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); +#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -286,6 +287,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } +#endif } } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c29809f765dc..bee5055832a1 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o +obj-$(CONFIG_MEMCG_LEGACY_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ -- cgit v1.2.3 From d55f90bfab40e3b5db323711d28186ff09461692 Mon Sep 17 00:00:00 2001 
From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:44 -0800 Subject: net: drop tcp_memcontrol.c tcp_memcontrol.c only contains legacy memory.tcp.kmem.* file definitions and mem_cgroup->tcp_mem init/destroy stuff. This doesn't belong to network subsys. Let's move it to memcontrol.c. This also allows us to reuse generic code for handling legacy memcg files. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: "David S. Miller" Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/tcp_memcontrol.h | 10 --- mm/memcontrol.c | 98 +++++++++++++++++++-- net/ipv4/Makefile | 1 - net/ipv4/sysctl_net_ipv4.c | 1 - net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_memcontrol.c | 200 ------------------------------------------- net/ipv6/tcp_ipv6.c | 1 - 7 files changed, 90 insertions(+), 222 deletions(-) delete mode 100644 include/net/tcp_memcontrol.h delete mode 100644 net/ipv4/tcp_memcontrol.c (limited to 'include') diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h deleted file mode 100644 index 020c2dee65e8..000000000000 --- a/include/net/tcp_memcontrol.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _TCP_MEMCG_H -#define _TCP_MEMCG_H - -struct cgroup_subsys; -struct mem_cgroup; - -int tcp_init_cgroup(struct mem_cgroup *memcg); -void tcp_destroy_cgroup(struct mem_cgroup *memcg); - -#endif /* _TCP_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92e8ab67b6df..15896708429b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,7 +66,6 @@ #include "internal.h" #include #include -#include #include "slab.h" #include @@ -242,6 +241,7 @@ enum res_type { _MEMSWAP, _OOM_TYPE, _KMEM, + _TCP, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -2842,6 +2842,11 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -3028,6 +3033,48 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, #endif /* CONFIG_MEMCG_LEGACY_KMEM */ +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + if (ret) + goto out; + + if (!memcg->tcp_mem.active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcp_mem.active = true; + } +out: + mutex_unlock(&memcg_limit_mutex); + return ret; +} +#else +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ + /* * The user of this function is... * RES_LIMIT. 
@@ -3060,6 +3107,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); break; + case _TCP: + ret = memcg_update_tcp_limit(memcg, nr_pages); + break; } break; case RES_SOFT_LIMIT: @@ -3086,6 +3136,11 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -4072,6 +4127,31 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif +#ifdef CONFIG_INET + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#endif #endif { }, /* terminate */ }; @@ -4241,6 +4321,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, + &parent->tcp_mem.memory_allocated); +#endif /* * No need to take a reference to the parent because cgroup @@ -4252,6 +4336,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); +#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4267,12 +4354,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) return ret; #ifdef CONFIG_INET -#ifdef CONFIG_MEMCG_LEGACY_KMEM - ret = tcp_init_cgroup(memcg); - if (ret) - return ret; -#endif - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4330,7 +4411,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); #if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) - tcp_destroy_cgroup(memcg); + if (memcg->tcp_mem.active) + static_branch_dec(&memcg_sockets_enabled_key); #endif __mem_cgroup_free(memcg); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bee5055832a1..62c049b647e9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_LEGACY_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 46ce410703b1..4d367b4139a3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,7 +24,6 @@ #include 
#include #include -#include static int zero; static int one = 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7d1fb50f381..5ced3e4013e3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -73,7 +73,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index 133eb5eac49f..000000000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null @@ -1,200 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -int tcp_init_cgroup(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - /* - * The root cgroup does not use page_counters, but rather, - * rely on the data already collected by the network - * subsystem - */ - if (memcg == root_mem_cgroup) - return 0; - - memcg->tcp_mem.memory_pressure = 0; - - if (parent) - counter_parent = &parent->tcp_mem.memory_allocated; - - page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent); - - return 0; -} - -void tcp_destroy_cgroup(struct mem_cgroup *memcg) -{ - if (memcg == root_mem_cgroup) - return; - - if (memcg->tcp_mem.active) - static_branch_dec(&memcg_sockets_enabled_key); -} - -static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - int ret; - - if (memcg == root_mem_cgroup) - return -EINVAL; - - ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages); - if (ret) - return ret; - - if (!memcg->tcp_mem.active) { - /* - * The active flag needs to be written after the static_key - * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. - * - * We need to do this, because static_keys will span multiple - * sites, but we can't control their order. If we mark a socket - * as accounted, but the accounting functions are not patched in - * yet, we'll lose accounting. - * - * We never race with the readers in sock_update_memcg(), - * because when this value change, the code to process it is not - * patched in yet. 
- */ - static_branch_inc(&memcg_sockets_enabled_key); - memcg->tcp_mem.active = true; - } - - return 0; -} - -enum { - RES_USAGE, - RES_LIMIT, - RES_MAX_USAGE, - RES_FAILCNT, -}; - -static DEFINE_MUTEX(tcp_limit_mutex); - -static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; - int ret = 0; - - buf = strstrip(buf); - - switch (of_cft(of)->private) { - case RES_LIMIT: - /* see memcontrol.c */ - ret = page_counter_memparse(buf, "-1", &nr_pages); - if (ret) - break; - mutex_lock(&tcp_limit_mutex); - ret = tcp_update_limit(memcg, nr_pages); - mutex_unlock(&tcp_limit_mutex); - break; - default: - ret = -EINVAL; - break; - } - return ret ?: nbytes; -} - -static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - - switch (cft->private) { - case RES_LIMIT: - if (memcg == root_mem_cgroup) - val = PAGE_COUNTER_MAX; - else - val = memcg->tcp_mem.memory_allocated.limit; - val *= PAGE_SIZE; - break; - case RES_USAGE: - if (memcg == root_mem_cgroup) - val = atomic_long_read(&tcp_memory_allocated); - else - val = page_counter_read(&memcg->tcp_mem.memory_allocated); - val *= PAGE_SIZE; - break; - case RES_FAILCNT: - if (memcg == root_mem_cgroup) - return 0; - val = memcg->tcp_mem.memory_allocated.failcnt; - break; - case RES_MAX_USAGE: - if (memcg == root_mem_cgroup) - return 0; - val = memcg->tcp_mem.memory_allocated.watermark; - val *= PAGE_SIZE; - break; - default: - BUG(); - } - return val; -} - -static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg; - - memcg = mem_cgroup_from_css(of_css(of)); - if (memcg == root_mem_cgroup) - return nbytes; - - switch (of_cft(of)->private) { - case RES_MAX_USAGE: - page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated); - break; - case RES_FAILCNT: - memcg->tcp_mem.memory_allocated.failcnt = 0; - break; - } - - return nbytes; -} - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { } /* terminate */ -}; - -static int __init tcp_memcontrol_init(void) -{ - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); - return 0; -} -__initcall(tcp_memcontrol_init); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4ad8edb46f7c..006396e31cb0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From d886f4e483ce63a3304adc9eda87031b93341c28 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:47 -0800 Subject: mm: memcontrol: rein in the CONFIG space madness What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory controller code is insignificant, having these conditionals is not worth the complication and fragility that comes with them. 
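The shape of the cleanup, condensed from the mem_cgroup_under_socket_pressure() hunk below (tcp_mem and socket_pressure become unconditional members of struct mem_cgroup, so the #ifdefs go away; the hierarchy-walking loop tail is reconstructed here and may differ in detail):

    static inline bool
    mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
    {
    	/* was hidden behind CONFIG_MEMCG_LEGACY_KMEM */
    	if (memcg->tcp_mem.memory_pressure)
    		return true;
    	do {
    		if (time_before(jiffies, memcg->socket_pressure))
    			return true;
    	} while ((memcg = parent_mem_cgroup(memcg)));
    	return false;
    }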
[akpm@linux-foundation.org: rework mem_cgroup_css_free() statement ordering] Signed-off-by: Johannes Weiner Cc: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 +++++------- init/Kconfig | 21 +++--------------- mm/memcontrol.c | 53 ++++------------------------------------------ mm/vmpressure.c | 2 -- 4 files changed, 12 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bb14d021cd0..47995b499429 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,9 +233,11 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + unsigned long socket_pressure; + + /* Legacy tcp memory accounting */ struct cg_proto tcp_mem; -#endif + #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -254,10 +256,6 @@ struct mem_cgroup { struct wb_domain cgwb_domain; #endif -#ifdef CONFIG_INET - unsigned long socket_pressure; -#endif - /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -712,15 +710,13 @@ void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); -#if defined(CONFIG_MEMCG) && defined(CONFIG_INET) +#ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; -#endif do { if (time_before(jiffies, memcg->socket_pressure)) return true; diff --git a/init/Kconfig b/init/Kconfig index a0a15cec8daf..22320804fbaf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -964,20 +964,6 @@ config MEMCG_SWAP_ENABLED For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). -config MEMCG_LEGACY_KMEM - bool -config MEMCG_KMEM - bool "Legacy Memory Resource Controller Kernel Memory accounting" - depends on MEMCG - depends on SLUB || SLAB - select MEMCG_LEGACY_KMEM - help - The Kernel Memory extension for Memory Resource Controller can limit - the amount of memory used by kernel objects in the system. Those are - fundamentally different from the entities handled by the standard - Memory Controller, which are page-based, and can be swapped. Users of - the kmem extension can use it to guarantee that no group of processes - will ever exhaust kernel resources alone. config BLK_CGROUP bool "IO controller" @@ -1190,10 +1176,9 @@ config USER_NS to provide different user info for different servers. When user namespaces are enabled in the kernel it is - recommended that the MEMCG and MEMCG_KMEM options also be - enabled and that user-space use the memory control groups to - limit the amount of memory a memory unprivileged users can - use. + recommended that the MEMCG option also be enabled and that + user-space use the memory control groups to limit the amount + of memory a memory unprivileged users can use. If unsure, say N. 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15896708429b..379f9911b87b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2842,11 +2842,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -3006,7 +3004,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3024,16 +3021,7 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM */ - -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) { int ret; @@ -3068,12 +3056,6 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ /* * The user of this function is... @@ -3136,11 +3118,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -4094,7 +4074,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4127,7 +4106,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif -#ifdef CONFIG_INET { .name = "kmem.tcp.limit_in_bytes", .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), @@ -4151,8 +4129,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#endif -#endif { }, /* terminate */ }; @@ -4280,14 +4256,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); + memcg->socket_pressure = jiffies; #ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); -#endif -#ifdef CONFIG_INET - memcg->socket_pressure = jiffies; #endif return &memcg->css; @@ -4321,10 +4295,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, &parent->tcp_mem.memory_allocated); -#endif /* * No need to take a reference to the parent because cgroup @@ -4336,9 +4308,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); -#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4353,10 +4323,8 @@ 
mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); -#endif /* * Make sure the memcg is initialized: mem_cgroup_iter() @@ -4403,18 +4371,13 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); -#endif - - memcg_free_kmem(memcg); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) if (memcg->tcp_mem.active) static_branch_dec(&memcg_sockets_enabled_key); -#endif + memcg_free_kmem(memcg); __mem_cgroup_free(memcg); } @@ -5613,8 +5576,6 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } -#ifdef CONFIG_INET - DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); @@ -5640,10 +5601,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; -#endif if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; out: @@ -5669,7 +5628,6 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5682,7 +5640,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) memcg->tcp_mem.memory_pressure = 1; return false; } -#endif + /* Don't block in the packet receive path */ if (in_softirq()) gfp_mask = GFP_NOWAIT; @@ -5701,19 +5659,16 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); return; } -#endif + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_INET */ - static int __init cgroup_memory(char *s) { char *token; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 89b1d441af4b..9a6c0704211c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,7 +275,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); -#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -287,7 +286,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } -#endif } } -- cgit v1.2.3 From 0db1529817b7b16226421f01470c5ba982c5f302 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:50 -0800 Subject: mm: memcontrol: flatten struct cg_proto There are no more external users of struct cg_proto, flatten the structure into struct mem_cgroup. Since using those struct members doesn't stand out as much anymore, add cgroup2 static branches to make it clearer which code is legacy. 
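For readers following the hunks below, the flattening maps the old embedded fields onto plain mem_cgroup members one for one:

	memcg->tcp_mem.memory_allocated  ->  memcg->tcpmem
	memcg->tcp_mem.active            ->  memcg->tcpmem_active
	memcg->tcp_mem.memory_pressure   ->  memcg->tcpmem_pressure

and the sites that touch them gain explicit !cgroup_subsys_on_dfl(memory_cgrp_subsys) checks, so it is visible at a glance that they are legacy-hierarchy only.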
Suggested-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 ++++++-------- mm/memcontrol.c | 33 +++++++++++++++------------------ 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47995b499429..a3869bf97746 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,12 +85,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct cg_proto { - struct page_counter memory_allocated; /* Current allocated memory. */ - int memory_pressure; - bool active; -}; - #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; @@ -169,8 +163,11 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + + /* Legacy consumer-oriented counters */ struct page_counter memsw; struct page_counter kmem; + struct page_counter tcpmem; /* Normal memory consumption range */ unsigned long low; @@ -236,7 +233,8 @@ struct mem_cgroup { unsigned long socket_pressure; /* Legacy tcp memory accounting */ - struct cg_proto tcp_mem; + bool tcpmem_active; + int tcpmem_pressure; #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ @@ -715,7 +713,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { - if (memcg->tcp_mem.memory_pressure) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { if (time_before(jiffies, memcg->socket_pressure)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 379f9911b87b..6937f16f5ecb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2843,7 +2843,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -3028,11 +3028,11 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + ret = page_counter_limit(&memcg->tcpmem, limit); if (ret) goto out; - if (!memcg->tcp_mem.active) { + if (!memcg->tcpmem_active) { /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -3050,7 +3050,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) * patched in yet. 
*/ static_branch_inc(&memcg_sockets_enabled_key); - memcg->tcp_mem.active = true; + memcg->tcpmem_active = true; } out: mutex_unlock(&memcg_limit_mutex); @@ -3119,7 +3119,7 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -4295,8 +4295,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcp_mem.memory_allocated, - &parent->tcp_mem.memory_allocated); + page_counter_init(&memcg->tcpmem, &parent->tcpmem); /* * No need to take a reference to the parent because cgroup @@ -4308,7 +4307,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); + page_counter_init(&memcg->tcpmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4374,7 +4373,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); - if (memcg->tcp_mem.active) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); memcg_free_kmem(memcg); @@ -5601,7 +5600,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; @@ -5629,15 +5628,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) gfp_t gfp_mask = GFP_KERNEL; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - struct page_counter *counter; + struct page_counter *fail; - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; return true; } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; + page_counter_charge(&memcg->tcpmem, nr_pages); + memcg->tcpmem_pressure = 1; return false; } @@ -5660,8 +5658,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, - nr_pages); + page_counter_uncharge(&memcg->tcpmem, nr_pages); return; } -- cgit v1.2.3 From 0b8f73e104285a4badf9d768d1c39b06d77d1f97 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:53 -0800 Subject: mm: memcontrol: clean up alloc, online, offline, free functions The creation and teardown of struct mem_cgroup is fairly messy and that has attracted mistakes and subtle bugs before. The main cause for this is that there is no clear model about what needs to happen when, and that attracts more chaos. So create one: 1. 
mem_cgroup_alloc() should allocate struct mem_cgroup and its auxiliary members and initialize work items, locks etc. so that the object it returns is fully initialized and in a neutral state. 2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new memcg object and configure it and the system according to the role of the new memory-controlled cgroup in the hierarchy. 3. mem_cgroup_css_online() is no longer needed to synchronize with iterators, but it verifies css->id which isn't available earlier. 4. mem_cgroup_css_offline() implements stuff that needs to happen upon the user-visible destruction of a cgroup, which includes stopping all user interfacing as well as releasing certain structures when continued memory consumption would be unexpected at that point. 5. mem_cgroup_css_free() prepares the system and the memcg object for the object's disappearance, neutralizes its state, and then gives it back to mem_cgroup_free(). 6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory. [arnd@arndb.de: fix SLOB build regression] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 - mm/memcontrol.c | 257 +++++++++++++++------------------------------ 2 files changed, 84 insertions(+), 176 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3869bf97746..27123e597eca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,9 +181,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* css_online() has been completed */ - int initialized; - /* * Should the accounting and control be hierarchical, per subtree? */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6937f16f5ecb..f6bc78f4ed13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,13 +250,6 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) -/* - * The memcg_create_mutex will be held whenever a new cgroup is created. - * As a consequence, any change that needs to protect against new child cgroups - * appearing has to hold it as well. - */ -static DEFINE_MUTEX(memcg_create_mutex); - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -899,17 +892,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css == &root->css) break; - if (css_tryget(css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - break; - - css_put(css); - } + if (css_tryget(css)) + break; memcg = NULL; } @@ -2690,14 +2674,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) { bool ret; - /* - * The lock does not prevent addition or deletion of children, but - * it prevents a new child from being initialized based on this - * parent in css_online(), so it's enough to decide whether - * hierarchically inherited attributes can still be changed or not. 
- */ - lockdep_assert_held(&memcg_create_mutex); - rcu_read_lock(); ret = css_next_child(NULL, &memcg->css); rcu_read_unlock(); @@ -2760,10 +2736,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - mutex_lock(&memcg_create_mutex); - if (memcg->use_hierarchy == val) - goto out; + return 0; /* * If parent's use_hierarchy is set, we can't make any modifications @@ -2782,9 +2756,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, } else retval = -EINVAL; -out: - mutex_unlock(&memcg_create_mutex); - return retval; } @@ -2872,37 +2843,14 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, #ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { - int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); - /* - * For simplicity, we won't allow this to be disabled. It also can't - * be changed if the cgroup has children already, or if tasks had - * already joined. - * - * If tasks join before we set the limit, a person looking at - * kmem.usage_in_bytes will have no way to determine when it took - * place, which makes the value quite meaningless. - * - * After it first became limited, changes in the value of the limit are - * of course permitted. - */ - mutex_lock(&memcg_create_mutex); - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - err = -EBUSY; - mutex_unlock(&memcg_create_mutex); - if (err) - goto out; - memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) { - err = memcg_id; - goto out; - } + if (memcg_id < 0) + return memcg_id; static_branch_inc(&memcg_kmem_enabled_key); /* @@ -2913,17 +2861,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; -out: - return err; + + return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, + struct mem_cgroup *memcg) { int ret = 0; - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - - if (!parent) - return 0; mutex_lock(&memcg_limit_mutex); /* @@ -2985,6 +2930,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { + /* css_alloc() failed, offlining didn't happen */ + if (unlikely(memcg->kmem_state == KMEM_ONLINE)) + memcg_offline_kmem(memcg); + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); @@ -2992,7 +2941,11 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + return 0; +} +static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; } @@ -3007,11 +2960,16 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret; + int ret = 0; mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ if (!memcg_kmem_online(memcg)) { + if (cgroup_is_populated(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + ret = -EBUSY; + if (ret) + goto out; ret = memcg_online_kmem(memcg); if (ret) goto out; @@ -4167,90 +4125,44 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, 
int node) kfree(memcg->nodeinfo[node]); } -static struct mem_cgroup *mem_cgroup_alloc(void) -{ - struct mem_cgroup *memcg; - size_t size; - - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto out_free_stat; - - return memcg; - -out_free_stat: - free_percpu(memcg->stat); -out_free: - kfree(memcg); - return NULL; -} - -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. - */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void mem_cgroup_free(struct mem_cgroup *memcg) { int node; - cancel_work_sync(&memcg->high_work); - - mem_cgroup_remove_from_trees(memcg); - + memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); - memcg_wb_domain_exit(memcg); kfree(memcg); } -static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - long error = -ENOMEM; + size_t size; int node; - memcg = mem_cgroup_alloc(); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) - goto free_out; + goto fail; - /* root ? 
*/ - if (parent_css == NULL) { - root_mem_cgroup = memcg; - page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; - page_counter_init(&memcg->memsw, NULL); - page_counter_init(&memcg->kmem, NULL); - } + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); @@ -4263,48 +4175,37 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif - return &memcg->css; - -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return memcg; +fail: + mem_cgroup_free(memcg); + return NULL; } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - int ret; - - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - - if (!parent) - return 0; - - mutex_lock(&memcg_create_mutex); + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg; + long error = -ENOMEM; - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - memcg->swappiness = mem_cgroup_swappiness(parent); + memcg = mem_cgroup_alloc(); + if (!memcg) + return ERR_PTR(error); - if (parent->use_hierarchy) { + memcg->high = PAGE_COUNTER_MAX; + memcg->soft_limit = PAGE_COUNTER_MAX; + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + } + if (parent && parent->use_hierarchy) { + memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); - - /* - * No need to take a reference to the parent because cgroup - * core guarantees its existence. - */ } else { page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -4316,21 +4217,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } - mutex_unlock(&memcg_create_mutex); - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; + /* The following stuff does not apply to the root */ + if (!parent) { + root_mem_cgroup = memcg; + return &memcg->css; + } + + error = memcg_propagate_kmem(parent, memcg); + if (error) + goto fail; if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); - /* - * Make sure the memcg is initialized: mem_cgroup_iter() - * orders reading memcg->initialized against its callers - * reading the memcg members. 
- */ - smp_store_release(&memcg->initialized, 1); + return &memcg->css; +fail: + mem_cgroup_free(memcg); + return NULL; +} + +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + if (css->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; return 0; } @@ -4352,10 +4263,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - vmpressure_cleanup(&memcg->vmpressure); - memcg_offline_kmem(memcg); - wb_memcg_offline(memcg); } @@ -4376,8 +4284,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); memcg_free_kmem(memcg); - __mem_cgroup_free(memcg); + mem_cgroup_free(memcg); } /** -- cgit v1.2.3 From 37e84351198be087335ad2b2253b35c7cc76a5ad Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:56 -0800 Subject: mm: memcontrol: charge swap to cgroup2 This patchset introduces swap accounting to cgroup2. This patch (of 7): In the legacy hierarchy we charge memsw, which is dubious, because: - memsw.limit must be >= memory.limit, so it is impossible to limit swap usage less than memory usage. Taking into account the fact that the primary limiting mechanism in the unified hierarchy is memory.high while memory.limit is either left unset or set to a very large value, moving memsw.limit knob to the unified hierarchy would effectively make it impossible to limit swap usage according to the user preference. - memsw.usage != memory.usage + swap.usage, because a page occupying both swap entry and a swap cache page is charged only once to memsw counter. As a result, it is possible to effectively eat up to memory.limit of memory pages *and* memsw.limit of swap entries, which looks unexpected. That said, we should provide a different swap limiting mechanism for cgroup2. This patch adds mem_cgroup->swap counter, which charges the actual number of swap entries used by a cgroup. It is only charged in the unified hierarchy, while the legacy hierarchy memsw logic is left intact. The swap usage can be monitored using new memory.swap.current file and limited using memory.swap.max. Note, to charge swap resource properly in the unified hierarchy, we have to make swap_entry_free uncharge swap only when ->usage reaches zero, not just ->count, i.e. when all references to a swap entry, including the one taken by swap cache, are gone. This is necessary, because otherwise swap-in could result in uncharging swap even if the page is still in swap cache and hence still occupies a swap entry. At the same time, this shouldn't break memsw counter logic, where a page is never charged twice for using both memory and swap, because in case of legacy hierarchy we uncharge swap on commit (see mem_cgroup_commit_charge). 
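A minimal userspace sketch of the new interface follows; the cgroup2 mount point /sys/fs/cgroup and the group name "test" are assumptions for illustration:

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f;

		/* cap the group's swap at 512M (bytes); writing "max" lifts the limit */
		f = fopen("/sys/fs/cgroup/test/memory.swap.max", "w");
		if (f) {
			fputs("536870912\n", f);
			fclose(f);
		}

		/* current swap charge of the group, reported in bytes */
		f = fopen("/sys/fs/cgroup/test/memory.swap.current", "r");
		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("swap usage: %s", buf);
			fclose(f);
		}
		return 0;
	}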
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + include/linux/swap.h | 6 +++ mm/memcontrol.c | 118 ++++++++++++++++++++++++++++++++++++++++++--- mm/shmem.c | 4 ++ mm/swap_state.c | 5 ++ mm/swapfile.c | 4 +- 6 files changed, 128 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 27123e597eca..6e0126230878 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -163,6 +163,7 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + struct page_counter swap; /* Legacy consumer-oriented counters */ struct page_counter memsw; diff --git a/include/linux/swap.h b/include/linux/swap.h index 414e101cd061..83b95f343ab1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -368,11 +368,17 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { } +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6bc78f4ed13..1ff552e3722b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); @@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_limit; + unsigned long swap_limit; memsw_limit = memcg->memsw.limit; - limit = min(limit + total_swap_pages, memsw_limit); + swap_limit = memcg->swap.limit; + swap_limit = min(swap_limit, (unsigned long)total_swap_pages); + limit = min(limit + swap_limit, memsw_limit); } return limit; } @@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent && parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, if (page->mem_cgroup) goto out; - if (do_memsw_account()) { + if (do_swap_account) { swp_entry_t ent = { .val = page_private(page), }; unsigned short id = lookup_swap_cgroup_id(ent); @@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg_check_events(memcg, page); } +/* + * mem_cgroup_try_charge_swap - try charging a swap entry + * 
@page: page being added to swap + * @entry: swap entry to charge + * + * Try to charge @entry to the memcg that @page belongs to. + * + * Returns 0 on success, -ENOMEM on failure. + */ +int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + struct page_counter *counter; + unsigned short oldid; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + return 0; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return 0; + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + css_get(&memcg->css); + return 0; +} + /** * mem_cgroup_uncharge_swap - uncharge a swap entry * @entry: swap entry to uncharge * - * Drop the memsw charge associated with @entry. + * Drop the swap charge associated with @entry. */ void mem_cgroup_uncharge_swap(swp_entry_t entry) { struct mem_cgroup *memcg; unsigned short id; - if (!do_memsw_account()) + if (!do_swap_account) return; id = swap_cgroup_record(entry, 0); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); + if (!mem_cgroup_is_root(memcg)) { + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->swap, 1); + else + page_counter_uncharge(&memcg->memsw, 1); + } mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->swap.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + mutex_lock(&memcg_limit_mutex); + err = page_counter_limit(&memcg->swap, max); + mutex_unlock(&memcg_limit_mutex); + if (err) + return err; + + return nbytes; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { } /* terminate */ +}; + static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", @@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void) { if (!mem_cgroup_disabled() && really_do_swap_account) { do_swap_account = 1; + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, + swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } diff --git a/mm/shmem.c b/mm/shmem.c index b98e1011858c..fa2ceb2d2655 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) 
goto redirty; + if (mem_cgroup_try_charge_swap(page, swap)) + goto free_swap; + /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the page is @@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); +free_swap: swapcache_free(swap); redirty: set_page_dirty(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 676ff2991380..69cb2464e7dc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) if (!entry.val) return 0; + if (mem_cgroup_try_charge_swap(page, entry)) { + swapcache_free(entry); + return 0; + } + if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2bb30aa3a412..22a7a1fc1e47 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, count--; } - if (!count) - mem_cgroup_uncharge_swap(entry); - usage = count | has_cache; p->swap_map[offset] = usage; /* free if no reference */ if (!usage) { + mem_cgroup_uncharge_swap(entry); dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; -- cgit v1.2.3 From eb01aaab43084f1c919ce66183fea005033351b9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:02 -0800 Subject: mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since get_scan_count(), which is the only user of this function, now possesses pointer to memcg, let's pass memcg directly to mem_cgroup_online() instead of picking it out of lruvec and rename the function accordingly. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 27 ++++++++++----------------- mm/vmscan.c | 2 +- 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e0126230878..166661708410 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -355,6 +355,13 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return true; + return !!(memcg->css.flags & CSS_ONLINE); +} + /* * For memory reclaim. 
*/ @@ -363,20 +370,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages); -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -{ - struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return true; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - memcg = mz->memcg; - - return !!(memcg->css.flags & CSS_ONLINE); -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { @@ -589,13 +582,13 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +static inline bool +mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 014ff89a4aa5..9a8556e49cd9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1997,7 +1997,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, if (current_is_kswapd()) { if (!zone_reclaimable(zone)) force_scan = true; - if (!mem_cgroup_lruvec_online(lruvec)) + if (!mem_cgroup_online(memcg)) force_scan = true; } if (!global_reclaim(sc)) -- cgit v1.2.3 From 6f2cb2f17700a39567cf3e9a2e95041def5f3688 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:05 -0800 Subject: swap.h: move memcg related stuff to the end of the file The following patches will add more functions to the memcg section of include/linux/swap.h. Some of them will need values defined below the current location of the section. So let's move the section to the end of the file. No functional changes intended. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 70 ++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 83b95f343ab1..c2bd163a5e7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -350,39 +350,7 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); -#ifdef CONFIG_MEMCG -static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) -{ - /* root ? 
*/ - if (mem_cgroup_disabled() || !memcg->css.parent) - return vm_swappiness; - - return memcg->swappiness; -} -#else -static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) -{ - return vm_swappiness; -} -#endif -#ifdef CONFIG_MEMCG_SWAP -extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); -#else -static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) -{ -} -static inline int mem_cgroup_try_charge_swap(struct page *page, - swp_entry_t entry) -{ - return 0; -} -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) -{ -} -#endif #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); @@ -561,5 +529,43 @@ static inline swp_entry_t get_swap_page(void) } #endif /* CONFIG_SWAP */ + +#ifdef CONFIG_MEMCG +static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) +{ + /* root ? */ + if (mem_cgroup_disabled() || !memcg->css.parent) + return vm_swappiness; + + return memcg->swappiness; +} + +#else +static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) +{ + return vm_swappiness; +} +#endif + +#ifdef CONFIG_MEMCG_SWAP +extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +#else +static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ +} + +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ -- cgit v1.2.3 From d8b38438a0bcb362c396f49d8279ef7b505917f4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:07 -0800 Subject: mm: vmscan: do not scan anon pages if memcg swap limit is hit We don't scan anonymous memory if we have run out of swap; nor should we do it when the memcg swap limit is hit, because swapping out is impossible anyway.
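A quick worked example with made-up numbers shows the effect of the new mem_cgroup_get_nr_swap_pages() helper (added below): with 4000 pages of swap free globally, a cgroup that has charged 90 of its 100-page swap limit, and a parent that has charged 995 of its 1000-page limit, the helper returns min(4000, 100 - 90, 1000 - 995) = 5, so anon pages are still scanned; once the tightest headroom in the ancestry reaches 0, get_scan_count() switches to SCAN_FILE, exactly as it already does when global swap is exhausted.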
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ mm/memcontrol.c | 13 +++++++++++++ mm/vmscan.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index c2bd163a5e7e..a587050204f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -551,6 +551,7 @@ extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -565,6 +566,11 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } + +static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + return get_nr_swap_pages(); +} #endif #endif /* __KERNEL__*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ff552e3722b..bcb38718e174 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5748,6 +5748,19 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_unlock(); } +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.limit) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a8556e49cd9..3be5f9db6c42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2004,7 +2004,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; goto out; } -- cgit v1.2.3 From 5ccc5abaaf6f9242cc63342c5286990233f392fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:10 -0800 Subject: mm: free swap cache aggressively if memcg swap is full Swap cache pages are freed aggressively if swap is nearly full (>50% currently), because otherwise we are likely to stop scanning anonymous memory when we near the swap limit, even if there are plenty of freeable swap cache pages. We should follow the same approach in the case of a memory cgroup, which has its own swap limit.
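Concretely, the mem_cgroup_swap_full() test added below, page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit, mirrors the global vm_swap_full() heuristic: with a hypothetical memory.swap.max of 1G (262144 pages of 4K), the group's swap cache pages become reclaim candidates once its swap usage crosses 131072 pages, the 50% mark. The check walks the ancestry as well, so a parent nearing its limit triggers the same behaviour in all of its children.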
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ mm/memcontrol.c | 22 ++++++++++++++++++++++ mm/memory.c | 3 ++- mm/swapfile.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a587050204f9..d18b65c53dbb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -552,6 +552,7 @@ extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); +extern bool mem_cgroup_swap_full(struct page *page); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -571,6 +572,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } + +static inline bool mem_cgroup_swap_full(struct page *page) +{ + return vm_swap_full(); +} #endif #endif /* __KERNEL__*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bcb38718e174..6a0007965e31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5761,6 +5761,28 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } +bool mem_cgroup_swap_full(struct page *page) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (vm_swap_full()) + return true; + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + memcg = page->mem_cgroup; + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + return true; + + return false; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/memory.c b/mm/memory.c index ff17850a52d9..30991f83d0bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2582,7 +2582,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (mem_cgroup_swap_full(page) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 22a7a1fc1e47..c43f654a7b64 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1006,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || mem_cgroup_swap_full(page))) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3be5f9db6c42..bd620b65db52 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1214,7 +1214,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && mem_cgroup_swap_full(page)) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); -- cgit v1.2.3 From b2807f07f4f87362925b8a5b8cbb7b624da10f03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:22 -0800 Subject: mm: memcontrol: add "sock" to cgroup2 memory.stat Provide statistics on how much of a cgroup's memory footprint is made up of socket buffers from network connections owned by the group. Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- mm/memcontrol.c | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 166661708410..9ae48d4aeb5e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,9 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, + /* default hierarchy stats */ + MEMCG_SOCK, + MEMCG_NR_STAT, }; struct mem_cgroup_reclaim_cookie { @@ -87,7 +90,7 @@ enum mem_cgroup_events_target { #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; + long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 98f4109bff6c..ca052f2a4a0b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5128,6 +5128,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + seq_printf(m, "sock %llu\n", + (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * @@ -5631,6 +5633,8 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; + this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5650,6 +5654,8 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } + this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -- cgit v1.2.3
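With this applied, a cgroup2 memory.stat carries the socket footprint next to the existing counters; a hypothetical excerpt (values invented for illustration):

	anon 104857600
	file 314572800
	sock 2097152

The sock figure is the new MEMCG_SOCK counter summed over the subtree and scaled by PAGE_SIZE, incremented in mem_cgroup_charge_skmem() and decremented in mem_cgroup_uncharge_skmem() as shown above.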